2802 2799 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 | // SPDX-License-Identifier: GPL-2.0-only /* * rcuref - A scalable reference count implementation for RCU managed objects * * rcuref is provided to replace open coded reference count implementations * based on atomic_t. It protects explicitely RCU managed objects which can * be visible even after the last reference has been dropped and the object * is heading towards destruction. * * A common usage pattern is: * * get() * rcu_read_lock(); * p = get_ptr(); * if (p && !atomic_inc_not_zero(&p->refcnt)) * p = NULL; * rcu_read_unlock(); * return p; * * put() * if (!atomic_dec_return(&->refcnt)) { * remove_ptr(p); * kfree_rcu((p, rcu); * } * * atomic_inc_not_zero() is implemented with a try_cmpxchg() loop which has * O(N^2) behaviour under contention with N concurrent operations. * * rcuref uses atomic_add_negative_relaxed() for the fast path, which scales * better under contention. * * Why not refcount? 
* ================= * * In principle it should be possible to make refcount use the rcuref * scheme, but the destruction race described below cannot be prevented * unless the protected object is RCU managed. * * Theory of operation * =================== * * rcuref uses an unsigned integer reference counter. As long as the * counter value is greater than or equal to RCUREF_ONEREF and not larger * than RCUREF_MAXREF the reference is alive: * * ONEREF MAXREF SATURATED RELEASED DEAD NOREF * 0 0x7FFFFFFF 0x8000000 0xA0000000 0xBFFFFFFF 0xC0000000 0xE0000000 0xFFFFFFFF * <---valid --------> <-------saturation zone-------> <-----dead zone-----> * * The get() and put() operations do unconditional increments and * decrements. The result is checked after the operation. This optimizes * for the fast path. * * If the reference count is saturated or dead, then the increments and * decrements are not harmful as the reference count still stays in the * respective zones and is always set back to STATURATED resp. DEAD. The * zones have room for 2^28 racing operations in each direction, which * makes it practically impossible to escape the zones. * * Once the last reference is dropped the reference count becomes * RCUREF_NOREF which forces rcuref_put() into the slowpath operation. The * slowpath then tries to set the reference count from RCUREF_NOREF to * RCUREF_DEAD via a cmpxchg(). This opens a small window where a * concurrent rcuref_get() can acquire the reference count and bring it * back to RCUREF_ONEREF or even drop the reference again and mark it DEAD. * * If the cmpxchg() succeeds then a concurrent rcuref_get() will result in * DEAD + 1, which is inside the dead zone. If that happens the reference * count is put back to DEAD. 
* * The actual race is possible due to the unconditional increment and * decrements in rcuref_get() and rcuref_put(): * * T1 T2 * get() put() * if (atomic_add_negative(-1, &ref->refcnt)) * succeeds-> atomic_cmpxchg(&ref->refcnt, NOREF, DEAD); * * atomic_add_negative(1, &ref->refcnt); <- Elevates refcount to DEAD + 1 * * As the result of T1's add is negative, the get() goes into the slow path * and observes refcnt being in the dead zone which makes the operation fail. * * Possible critical states: * * Context Counter References Operation * T1 0 1 init() * T2 1 2 get() * T1 0 1 put() * T2 -1 0 put() tries to mark dead * T1 0 1 get() * T2 0 1 put() mark dead fails * T1 -1 0 put() tries to mark dead * T1 DEAD 0 put() mark dead succeeds * T2 DEAD+1 0 get() fails and puts it back to DEAD * * Of course there are more complex scenarios, but the above illustrates * the working principle. The rest is left to the imagination of the * reader. * * Deconstruction race * =================== * * The release operation must be protected by prohibiting a grace period in * order to prevent a possible use after free: * * T1 T2 * put() get() * // ref->refcnt = ONEREF * if (!atomic_add_negative(-1, &ref->refcnt)) * return false; <- Not taken * * // ref->refcnt == NOREF * --> preemption * // Elevates ref->refcnt to ONEREF * if (!atomic_add_negative(1, &ref->refcnt)) * return true; <- taken * * if (put(&p->ref)) { <-- Succeeds * remove_pointer(p); * kfree_rcu(p, rcu); * } * * RCU grace period ends, object is freed * * atomic_cmpxchg(&ref->refcnt, NOREF, DEAD); <- UAF * * This is prevented by disabling preemption around the put() operation as * that's in most kernel configurations cheaper than a rcu_read_lock() / * rcu_read_unlock() pair and in many cases even a NOOP. In any case it * prevents the grace period which keeps the object alive until all put() * operations complete. 
* * Saturation protection * ===================== * * The reference count has a saturation limit RCUREF_MAXREF (INT_MAX). * Once this is exceedded the reference count becomes stale by setting it * to RCUREF_SATURATED, which will cause a memory leak, but it prevents * wrap arounds which obviously cause worse problems than a memory * leak. When saturation is reached a warning is emitted. * * Race conditions * =============== * * All reference count increment/decrement operations are unconditional and * only verified after the fact. This optimizes for the good case and takes * the occasional race vs. a dead or already saturated refcount into * account. The saturation and dead zones are large enough to accomodate * for that. * * Memory ordering * =============== * * Memory ordering rules are slightly relaxed wrt regular atomic_t functions * and provide only what is strictly required for refcounts. * * The increments are fully relaxed; these will not provide ordering. The * rationale is that whatever is used to obtain the object to increase the * reference count on will provide the ordering. For locked data * structures, its the lock acquire, for RCU/lockless data structures its * the dependent load. * * rcuref_get() provides a control dependency ordering future stores which * ensures that the object is not modified when acquiring a reference * fails. * * rcuref_put() provides release order, i.e. all prior loads and stores * will be issued before. It also provides a control dependency ordering * against the subsequent destruction of the object. * * If rcuref_put() successfully dropped the last reference and marked the * object DEAD it also provides acquire ordering. */ #include <linux/export.h> #include <linux/rcuref.h> /** * rcuref_get_slowpath - Slowpath of rcuref_get() * @ref: Pointer to the reference count * * Invoked when the reference count is outside of the valid zone. 
* * Return: * False if the reference count was already marked dead * * True if the reference count is saturated, which prevents the * object from being deconstructed ever. */ bool rcuref_get_slowpath(rcuref_t *ref) { unsigned int cnt = atomic_read(&ref->refcnt); /* * If the reference count was already marked dead, undo the * increment so it stays in the middle of the dead zone and return * fail. */ if (cnt >= RCUREF_RELEASED) { atomic_set(&ref->refcnt, RCUREF_DEAD); return false; } /* * If it was saturated, warn and mark it so. In case the increment * was already on a saturated value restore the saturation * marker. This keeps it in the middle of the saturation zone and * prevents the reference count from overflowing. This leaks the * object memory, but prevents the obvious reference count overflow * damage. */ if (WARN_ONCE(cnt > RCUREF_MAXREF, "rcuref saturated - leaking memory")) atomic_set(&ref->refcnt, RCUREF_SATURATED); return true; } EXPORT_SYMBOL_GPL(rcuref_get_slowpath); /** * rcuref_put_slowpath - Slowpath of __rcuref_put() * @ref: Pointer to the reference count * @cnt: The resulting value of the fastpath decrement * * Invoked when the reference count is outside of the valid zone. * * Return: * True if this was the last reference with no future references * possible. This signals the caller that it can safely schedule the * object, which is protected by the reference counter, for * deconstruction. * * False if there are still active references or the put() raced * with a concurrent get()/put() pair. Caller is not allowed to * deconstruct the protected object. */ bool rcuref_put_slowpath(rcuref_t *ref, unsigned int cnt) { /* Did this drop the last reference? */ if (likely(cnt == RCUREF_NOREF)) { /* * Carefully try to set the reference count to RCUREF_DEAD. * * This can fail if a concurrent get() operation has * elevated it again or the corresponding put() even marked * it dead already. Both are valid situations and do not * require a retry. 
If this fails the caller is not * allowed to deconstruct the object. */ if (!atomic_try_cmpxchg_release(&ref->refcnt, &cnt, RCUREF_DEAD)) return false; /* * The caller can safely schedule the object for * deconstruction. Provide acquire ordering. */ smp_acquire__after_ctrl_dep(); return true; } /* * If the reference count was already in the dead zone, then this * put() operation is imbalanced. Warn, put the reference count back to * DEAD and tell the caller to not deconstruct the object. */ if (WARN_ONCE(cnt >= RCUREF_RELEASED, "rcuref - imbalanced put()")) { atomic_set(&ref->refcnt, RCUREF_DEAD); return false; } /* * This is a put() operation on a saturated refcount. Restore the * mean saturation value and tell the caller to not deconstruct the * object. */ if (cnt > RCUREF_MAXREF) atomic_set(&ref->refcnt, RCUREF_SATURATED); return false; } EXPORT_SYMBOL_GPL(rcuref_put_slowpath); |
1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 | // SPDX-License-Identifier: GPL-2.0 #include <linux/fs.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/hugetlb.h> #include <linux/mman.h> #include <linux/mmzone.h> #include <linux/memblock.h> #include <linux/proc_fs.h> #include <linux/percpu.h> #include <linux/seq_file.h> #include <linux/swap.h> #include <linux/vmstat.h> #include <linux/atomic.h> #include <linux/vmalloc.h> #ifdef CONFIG_CMA #include <linux/cma.h> #endif #include <linux/zswap.h> #include <asm/page.h> #include "internal.h" void __attribute__((weak)) arch_report_meminfo(struct seq_file *m) { } static void show_val_kb(struct seq_file *m, const char *s, unsigned long num) { seq_put_decimal_ull_width(m, s, num << (PAGE_SHIFT - 10), 8); seq_write(m, " kB\n", 4); } static int meminfo_proc_show(struct seq_file *m, void *v) { struct sysinfo i; unsigned long committed; long cached; long available; unsigned long pages[NR_LRU_LISTS]; unsigned long sreclaimable, sunreclaim; int lru; si_meminfo(&i); si_swapinfo(&i); committed = vm_memory_committed(); cached = global_node_page_state(NR_FILE_PAGES) - total_swapcache_pages() - i.bufferram; if (cached < 0) cached = 0; for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) pages[lru] = global_node_page_state(NR_LRU_BASE + lru); available = si_mem_available(); sreclaimable = 
global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B); sunreclaim = global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B); show_val_kb(m, "MemTotal: ", i.totalram); show_val_kb(m, "MemFree: ", i.freeram); show_val_kb(m, "MemAvailable: ", available); show_val_kb(m, "Buffers: ", i.bufferram); show_val_kb(m, "Cached: ", cached); show_val_kb(m, "SwapCached: ", total_swapcache_pages()); show_val_kb(m, "Active: ", pages[LRU_ACTIVE_ANON] + pages[LRU_ACTIVE_FILE]); show_val_kb(m, "Inactive: ", pages[LRU_INACTIVE_ANON] + pages[LRU_INACTIVE_FILE]); show_val_kb(m, "Active(anon): ", pages[LRU_ACTIVE_ANON]); show_val_kb(m, "Inactive(anon): ", pages[LRU_INACTIVE_ANON]); show_val_kb(m, "Active(file): ", pages[LRU_ACTIVE_FILE]); show_val_kb(m, "Inactive(file): ", pages[LRU_INACTIVE_FILE]); show_val_kb(m, "Unevictable: ", pages[LRU_UNEVICTABLE]); show_val_kb(m, "Mlocked: ", global_zone_page_state(NR_MLOCK)); #ifdef CONFIG_HIGHMEM show_val_kb(m, "HighTotal: ", i.totalhigh); show_val_kb(m, "HighFree: ", i.freehigh); show_val_kb(m, "LowTotal: ", i.totalram - i.totalhigh); show_val_kb(m, "LowFree: ", i.freeram - i.freehigh); #endif #ifndef CONFIG_MMU show_val_kb(m, "MmapCopy: ", (unsigned long)atomic_long_read(&mmap_pages_allocated)); #endif show_val_kb(m, "SwapTotal: ", i.totalswap); show_val_kb(m, "SwapFree: ", i.freeswap); #ifdef CONFIG_ZSWAP show_val_kb(m, "Zswap: ", zswap_total_pages()); seq_printf(m, "Zswapped: %8lu kB\n", (unsigned long)atomic_long_read(&zswap_stored_pages) << (PAGE_SHIFT - 10)); #endif show_val_kb(m, "Dirty: ", global_node_page_state(NR_FILE_DIRTY)); show_val_kb(m, "Writeback: ", global_node_page_state(NR_WRITEBACK)); show_val_kb(m, "AnonPages: ", global_node_page_state(NR_ANON_MAPPED)); show_val_kb(m, "Mapped: ", global_node_page_state(NR_FILE_MAPPED)); show_val_kb(m, "Shmem: ", i.sharedram); show_val_kb(m, "KReclaimable: ", sreclaimable + global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE)); show_val_kb(m, "Slab: ", sreclaimable + sunreclaim); show_val_kb(m, 
"SReclaimable: ", sreclaimable); show_val_kb(m, "SUnreclaim: ", sunreclaim); seq_printf(m, "KernelStack: %8lu kB\n", global_node_page_state(NR_KERNEL_STACK_KB)); #ifdef CONFIG_SHADOW_CALL_STACK seq_printf(m, "ShadowCallStack:%8lu kB\n", global_node_page_state(NR_KERNEL_SCS_KB)); #endif show_val_kb(m, "PageTables: ", global_node_page_state(NR_PAGETABLE)); show_val_kb(m, "SecPageTables: ", global_node_page_state(NR_SECONDARY_PAGETABLE)); show_val_kb(m, "NFS_Unstable: ", 0); show_val_kb(m, "Bounce: ", global_zone_page_state(NR_BOUNCE)); show_val_kb(m, "WritebackTmp: ", global_node_page_state(NR_WRITEBACK_TEMP)); show_val_kb(m, "CommitLimit: ", vm_commit_limit()); show_val_kb(m, "Committed_AS: ", committed); seq_printf(m, "VmallocTotal: %8lu kB\n", (unsigned long)VMALLOC_TOTAL >> 10); show_val_kb(m, "VmallocUsed: ", vmalloc_nr_pages()); show_val_kb(m, "VmallocChunk: ", 0ul); show_val_kb(m, "Percpu: ", pcpu_nr_pages()); memtest_report_meminfo(m); #ifdef CONFIG_MEMORY_FAILURE seq_printf(m, "HardwareCorrupted: %5lu kB\n", atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10)); #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE show_val_kb(m, "AnonHugePages: ", global_node_page_state(NR_ANON_THPS)); show_val_kb(m, "ShmemHugePages: ", global_node_page_state(NR_SHMEM_THPS)); show_val_kb(m, "ShmemPmdMapped: ", global_node_page_state(NR_SHMEM_PMDMAPPED)); show_val_kb(m, "FileHugePages: ", global_node_page_state(NR_FILE_THPS)); show_val_kb(m, "FilePmdMapped: ", global_node_page_state(NR_FILE_PMDMAPPED)); #endif #ifdef CONFIG_CMA show_val_kb(m, "CmaTotal: ", totalcma_pages); show_val_kb(m, "CmaFree: ", global_zone_page_state(NR_FREE_CMA_PAGES)); #endif #ifdef CONFIG_UNACCEPTED_MEMORY show_val_kb(m, "Unaccepted: ", global_zone_page_state(NR_UNACCEPTED)); #endif show_val_kb(m, "Balloon: ", global_node_page_state(NR_BALLOON_PAGES)); hugetlb_report_meminfo(m); arch_report_meminfo(m); return 0; } static int __init proc_meminfo_init(void) { struct proc_dir_entry *pde; pde = 
proc_create_single("meminfo", 0, NULL, meminfo_proc_show); pde_make_permanent(pde); return 0; } fs_initcall(proc_meminfo_init); |
89 89 89 88 88 89 89 88 89 89 88 89 88 89 89 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 | // SPDX-License-Identifier: GPL-2.0-or-later /* Verify the signature on a PKCS#7 message. 
* * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #define pr_fmt(fmt) "PKCS7: "fmt #include <linux/kernel.h> #include <linux/export.h> #include <linux/slab.h> #include <linux/err.h> #include <linux/asn1.h> #include <crypto/hash.h> #include <crypto/hash_info.h> #include <crypto/public_key.h> #include "pkcs7_parser.h" /* * Digest the relevant parts of the PKCS#7 data */ static int pkcs7_digest(struct pkcs7_message *pkcs7, struct pkcs7_signed_info *sinfo) { struct public_key_signature *sig = sinfo->sig; struct crypto_shash *tfm; struct shash_desc *desc; size_t desc_size; int ret; kenter(",%u,%s", sinfo->index, sinfo->sig->hash_algo); /* The digest was calculated already. */ if (sig->digest) return 0; if (!sinfo->sig->hash_algo) return -ENOPKG; /* Allocate the hashing algorithm we're going to need and find out how * big the hash operational data will be. */ tfm = crypto_alloc_shash(sinfo->sig->hash_algo, 0, 0); if (IS_ERR(tfm)) return (PTR_ERR(tfm) == -ENOENT) ? -ENOPKG : PTR_ERR(tfm); desc_size = crypto_shash_descsize(tfm) + sizeof(*desc); sig->digest_size = crypto_shash_digestsize(tfm); ret = -ENOMEM; sig->digest = kmalloc(sig->digest_size, GFP_KERNEL); if (!sig->digest) goto error_no_desc; desc = kzalloc(desc_size, GFP_KERNEL); if (!desc) goto error_no_desc; desc->tfm = tfm; /* Digest the message [RFC2315 9.3] */ ret = crypto_shash_digest(desc, pkcs7->data, pkcs7->data_len, sig->digest); if (ret < 0) goto error; pr_devel("MsgDigest = [%*ph]\n", 8, sig->digest); /* However, if there are authenticated attributes, there must be a * message digest attribute amongst them which corresponds to the * digest we just calculated. 
*/ if (sinfo->authattrs) { u8 tag; if (!sinfo->msgdigest) { pr_warn("Sig %u: No messageDigest\n", sinfo->index); ret = -EKEYREJECTED; goto error; } if (sinfo->msgdigest_len != sig->digest_size) { pr_warn("Sig %u: Invalid digest size (%u)\n", sinfo->index, sinfo->msgdigest_len); ret = -EBADMSG; goto error; } if (memcmp(sig->digest, sinfo->msgdigest, sinfo->msgdigest_len) != 0) { pr_warn("Sig %u: Message digest doesn't match\n", sinfo->index); ret = -EKEYREJECTED; goto error; } /* We then calculate anew, using the authenticated attributes * as the contents of the digest instead. Note that we need to * convert the attributes from a CONT.0 into a SET before we * hash it. */ memset(sig->digest, 0, sig->digest_size); ret = crypto_shash_init(desc); if (ret < 0) goto error; tag = ASN1_CONS_BIT | ASN1_SET; ret = crypto_shash_update(desc, &tag, 1); if (ret < 0) goto error; ret = crypto_shash_finup(desc, sinfo->authattrs, sinfo->authattrs_len, sig->digest); if (ret < 0) goto error; pr_devel("AADigest = [%*ph]\n", 8, sig->digest); } error: kfree(desc); error_no_desc: crypto_free_shash(tfm); kleave(" = %d", ret); return ret; } int pkcs7_get_digest(struct pkcs7_message *pkcs7, const u8 **buf, u32 *len, enum hash_algo *hash_algo) { struct pkcs7_signed_info *sinfo = pkcs7->signed_infos; int i, ret; /* * This function doesn't support messages with more than one signature. */ if (sinfo == NULL || sinfo->next != NULL) return -EBADMSG; ret = pkcs7_digest(pkcs7, sinfo); if (ret) return ret; *buf = sinfo->sig->digest; *len = sinfo->sig->digest_size; i = match_string(hash_algo_name, HASH_ALGO__LAST, sinfo->sig->hash_algo); if (i >= 0) *hash_algo = i; return 0; } /* * Find the key (X.509 certificate) to use to verify a PKCS#7 message. PKCS#7 * uses the issuer's name and the issuing certificate serial number for * matching purposes. These must match the certificate issuer's name (not * subject's name) and the certificate serial number [RFC 2315 6.7]. 
*/ static int pkcs7_find_key(struct pkcs7_message *pkcs7, struct pkcs7_signed_info *sinfo) { struct x509_certificate *x509; unsigned certix = 1; kenter("%u", sinfo->index); for (x509 = pkcs7->certs; x509; x509 = x509->next, certix++) { /* I'm _assuming_ that the generator of the PKCS#7 message will * encode the fields from the X.509 cert in the same way in the * PKCS#7 message - but I can't be 100% sure of that. It's * possible this will need element-by-element comparison. */ if (!asymmetric_key_id_same(x509->id, sinfo->sig->auth_ids[0])) continue; pr_devel("Sig %u: Found cert serial match X.509[%u]\n", sinfo->index, certix); sinfo->signer = x509; return 0; } /* The relevant X.509 cert isn't found here, but it might be found in * the trust keyring. */ pr_debug("Sig %u: Issuing X.509 cert not found (#%*phN)\n", sinfo->index, sinfo->sig->auth_ids[0]->len, sinfo->sig->auth_ids[0]->data); return 0; } /* * Verify the internal certificate chain as best we can. */ static int pkcs7_verify_sig_chain(struct pkcs7_message *pkcs7, struct pkcs7_signed_info *sinfo) { struct public_key_signature *sig; struct x509_certificate *x509 = sinfo->signer, *p; struct asymmetric_key_id *auth; int ret; kenter(""); for (p = pkcs7->certs; p; p = p->next) p->seen = false; for (;;) { pr_debug("verify %s: %*phN\n", x509->subject, x509->raw_serial_size, x509->raw_serial); x509->seen = true; if (x509->blacklisted) { /* If this cert is blacklisted, then mark everything * that depends on this as blacklisted too. 
*/ sinfo->blacklisted = true; for (p = sinfo->signer; p != x509; p = p->signer) p->blacklisted = true; pr_debug("- blacklisted\n"); return 0; } pr_debug("- issuer %s\n", x509->issuer); sig = x509->sig; if (sig->auth_ids[0]) pr_debug("- authkeyid.id %*phN\n", sig->auth_ids[0]->len, sig->auth_ids[0]->data); if (sig->auth_ids[1]) pr_debug("- authkeyid.skid %*phN\n", sig->auth_ids[1]->len, sig->auth_ids[1]->data); if (x509->self_signed) { /* If there's no authority certificate specified, then * the certificate must be self-signed and is the root * of the chain. Likewise if the cert is its own * authority. */ if (x509->unsupported_sig) goto unsupported_sig_in_x509; x509->signer = x509; pr_debug("- self-signed\n"); return 0; } /* Look through the X.509 certificates in the PKCS#7 message's * list to see if the next one is there. */ auth = sig->auth_ids[0]; if (auth) { pr_debug("- want %*phN\n", auth->len, auth->data); for (p = pkcs7->certs; p; p = p->next) { pr_debug("- cmp [%u] %*phN\n", p->index, p->id->len, p->id->data); if (asymmetric_key_id_same(p->id, auth)) goto found_issuer_check_skid; } } else if (sig->auth_ids[1]) { auth = sig->auth_ids[1]; pr_debug("- want %*phN\n", auth->len, auth->data); for (p = pkcs7->certs; p; p = p->next) { if (!p->skid) continue; pr_debug("- cmp [%u] %*phN\n", p->index, p->skid->len, p->skid->data); if (asymmetric_key_id_same(p->skid, auth)) goto found_issuer; } } /* We didn't find the root of this chain */ pr_debug("- top\n"); return 0; found_issuer_check_skid: /* We matched issuer + serialNumber, but if there's an * authKeyId.keyId, that must match the CA subjKeyId also. 
*/ if (sig->auth_ids[1] && !asymmetric_key_id_same(p->skid, sig->auth_ids[1])) { pr_warn("Sig %u: X.509 chain contains auth-skid nonmatch (%u->%u)\n", sinfo->index, x509->index, p->index); return -EKEYREJECTED; } found_issuer: pr_debug("- subject %s\n", p->subject); if (p->seen) { pr_warn("Sig %u: X.509 chain contains loop\n", sinfo->index); return 0; } ret = public_key_verify_signature(p->pub, x509->sig); if (ret < 0) return ret; x509->signer = p; if (x509 == p) { pr_debug("- self-signed\n"); return 0; } x509 = p; might_sleep(); } unsupported_sig_in_x509: /* Just prune the certificate chain at this point if we lack some * crypto module to go further. Note, however, we don't want to set * sinfo->unsupported_crypto as the signed info block may still be * validatable against an X.509 cert lower in the chain that we have a * trusted copy of. */ return 0; } /* * Verify one signed information block from a PKCS#7 message. */ static int pkcs7_verify_one(struct pkcs7_message *pkcs7, struct pkcs7_signed_info *sinfo) { int ret; kenter(",%u", sinfo->index); /* First of all, digest the data in the PKCS#7 message and the * signed information block */ ret = pkcs7_digest(pkcs7, sinfo); if (ret < 0) return ret; /* Find the key for the signature if there is one */ ret = pkcs7_find_key(pkcs7, sinfo); if (ret < 0) return ret; if (!sinfo->signer) return 0; pr_devel("Using X.509[%u] for sig %u\n", sinfo->signer->index, sinfo->index); /* Check that the PKCS#7 signing time is valid according to the X.509 * certificate. We can't, however, check against the system clock * since that may not have been set yet and may be wrong. 
*/ if (test_bit(sinfo_has_signing_time, &sinfo->aa_set)) { if (sinfo->signing_time < sinfo->signer->valid_from || sinfo->signing_time > sinfo->signer->valid_to) { pr_warn("Message signed outside of X.509 validity window\n"); return -EKEYREJECTED; } } /* Verify the PKCS#7 binary against the key */ ret = public_key_verify_signature(sinfo->signer->pub, sinfo->sig); if (ret < 0) return ret; pr_devel("Verified signature %u\n", sinfo->index); /* Verify the internal certificate chain */ return pkcs7_verify_sig_chain(pkcs7, sinfo); } /** * pkcs7_verify - Verify a PKCS#7 message * @pkcs7: The PKCS#7 message to be verified * @usage: The use to which the key is being put * * Verify a PKCS#7 message is internally consistent - that is, the data digest * matches the digest in the AuthAttrs and any signature in the message or one * of the X.509 certificates it carries that matches another X.509 cert in the * message can be verified. * * This does not look to match the contents of the PKCS#7 message against any * external public keys. * * Returns, in order of descending priority: * * (*) -EKEYREJECTED if a key was selected that had a usage restriction at * odds with the specified usage, or: * * (*) -EKEYREJECTED if a signature failed to match for which we found an * appropriate X.509 certificate, or: * * (*) -EBADMSG if some part of the message was invalid, or: * * (*) 0 if a signature chain passed verification, or: * * (*) -EKEYREJECTED if a blacklisted key was encountered, or: * * (*) -ENOPKG if none of the signature chains are verifiable because suitable * crypto modules couldn't be found. 
*/ int pkcs7_verify(struct pkcs7_message *pkcs7, enum key_being_used_for usage) { struct pkcs7_signed_info *sinfo; int actual_ret = -ENOPKG; int ret; kenter(""); switch (usage) { case VERIFYING_MODULE_SIGNATURE: if (pkcs7->data_type != OID_data) { pr_warn("Invalid module sig (not pkcs7-data)\n"); return -EKEYREJECTED; } if (pkcs7->have_authattrs) { pr_warn("Invalid module sig (has authattrs)\n"); return -EKEYREJECTED; } break; case VERIFYING_FIRMWARE_SIGNATURE: if (pkcs7->data_type != OID_data) { pr_warn("Invalid firmware sig (not pkcs7-data)\n"); return -EKEYREJECTED; } if (!pkcs7->have_authattrs) { pr_warn("Invalid firmware sig (missing authattrs)\n"); return -EKEYREJECTED; } break; case VERIFYING_KEXEC_PE_SIGNATURE: if (pkcs7->data_type != OID_msIndirectData) { pr_warn("Invalid kexec sig (not Authenticode)\n"); return -EKEYREJECTED; } /* Authattr presence checked in parser */ break; case VERIFYING_UNSPECIFIED_SIGNATURE: if (pkcs7->data_type != OID_data) { pr_warn("Invalid unspecified sig (not pkcs7-data)\n"); return -EKEYREJECTED; } break; default: return -EINVAL; } for (sinfo = pkcs7->signed_infos; sinfo; sinfo = sinfo->next) { ret = pkcs7_verify_one(pkcs7, sinfo); if (sinfo->blacklisted) { if (actual_ret == -ENOPKG) actual_ret = -EKEYREJECTED; continue; } if (ret < 0) { if (ret == -ENOPKG) { sinfo->unsupported_crypto = true; continue; } kleave(" = %d", ret); return ret; } actual_ret = 0; } kleave(" = %d", actual_ret); return actual_ret; } EXPORT_SYMBOL_GPL(pkcs7_verify); /** * pkcs7_supply_detached_data - Supply the data needed to verify a PKCS#7 message * @pkcs7: The PKCS#7 message * @data: The data to be verified * @datalen: The amount of data * * Supply the detached data needed to verify a PKCS#7 message. Note that no * attempt to retain/pin the data is made. That is left to the caller. The * data will not be modified by pkcs7_verify() and will not be freed when the * PKCS#7 message is freed. 
* * Returns -EINVAL if data is already supplied in the message, 0 otherwise. */ int pkcs7_supply_detached_data(struct pkcs7_message *pkcs7, const void *data, size_t datalen) { if (pkcs7->data) { pr_warn("Data already supplied\n"); return -EINVAL; } pkcs7->data = data; pkcs7->data_len = datalen; return 0; } EXPORT_SYMBOL_GPL(pkcs7_supply_detached_data); |
// SPDX-License-Identifier: GPL-2.0
/*
 *	linux/mm/msync.c
 *
 * Copyright (C) 1994-1999  Linus Torvalds
 */

/*
 * The msync() system call.
 */
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/file.h>
#include <linux/syscalls.h>
#include <linux/sched.h>

/*
 * MS_SYNC syncs the entire file - including mappings.
 *
 * MS_ASYNC does not start I/O (it used to, up to 2.5.67).
 * Nor does it mark the relevant pages dirty (it used to up to 2.6.17).
 * Now it doesn't do anything, since dirty pages are properly tracked.
 *
 * The application may now run fsync() to
 * write out the dirty pages and wait on the writeout and check the result.
 * Or the application may run fadvise(FADV_DONTNEED) against the fd to start
 * async writeout immediately.
 * So by _not_ starting I/O in MS_ASYNC we provide complete flexibility to
 * applications.
 *
 * @start: start address of the range; must be page aligned (checked with
 *         offset_in_page()), otherwise -EINVAL is returned.
 * @len:   length in bytes; rounded up to a whole number of pages.
 * @flags: any of MS_ASYNC, MS_INVALIDATE and MS_SYNC.  Unknown bits, or
 *         MS_ASYNC together with MS_SYNC, yield -EINVAL.
 *
 * Return: 0 on success, -EINVAL for bad arguments, -ENOMEM if start + len
 * wraps or part of the range is not mapped, -EBUSY if MS_INVALIDATE meets a
 * VM_LOCKED mapping, otherwise the error returned by vfs_fsync_range().
 */
SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags)
{
	unsigned long end;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	int unmapped_error = 0;	/* deferred -ENOMEM for holes in the range */
	int error = -EINVAL;

	start = untagged_addr(start);

	/* Argument validation: all of these fail with -EINVAL. */
	if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
		goto out;
	if (offset_in_page(start))
		goto out;
	if ((flags & MS_ASYNC) && (flags & MS_SYNC))
		goto out;
	error = -ENOMEM;
	/* Round the length up to the next page boundary. */
	len = (len + ~PAGE_MASK) & PAGE_MASK;
	end = start + len;
	/* start + len wrapped around the address space. */
	if (end < start)
		goto out;
	error = 0;
	/* An empty range is trivially synced. */
	if (end == start)
		goto out;
	/*
	 * If the interval [start,end) covers some unmapped address ranges,
	 * just ignore them, but return -ENOMEM at the end. Besides, if the
	 * flag is MS_ASYNC (w/o MS_INVALIDATE) the result would be -ENOMEM
	 * anyway and there is nothing left to do, so return immediately.
	 */
	mmap_read_lock(mm);
	vma = find_vma(mm, start);
	for (;;) {
		struct file *file;
		loff_t fstart, fend;

		/* Still start < end. */
		error = -ENOMEM;
		if (!vma)
			goto out_unlock;
		/* Here start < vma->vm_end. */
		if (start < vma->vm_start) {
			/* Hole before this VMA: skip it, remember -ENOMEM. */
			if (flags == MS_ASYNC)
				goto out_unlock;
			start = vma->vm_start;
			if (start >= end)
				goto out_unlock;
			unmapped_error = -ENOMEM;
		}
		/* Here vma->vm_start <= start < vma->vm_end. */
		if ((flags & MS_INVALIDATE) &&
				(vma->vm_flags & VM_LOCKED)) {
			error = -EBUSY;
			goto out_unlock;
		}
		file = vma->vm_file;
		/* File offsets (inclusive end) backing this part of the VMA. */
		fstart = (start - vma->vm_start) +
			 ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
		fend = fstart + (min(end, vma->vm_end) - start) - 1;
		/* Advance past this VMA before the lock may be dropped. */
		start = vma->vm_end;
		if ((flags & MS_SYNC) && file &&
				(vma->vm_flags & VM_SHARED)) {
			/*
			 * Take a file reference, then drop the mmap lock for
			 * the duration of the writeback.  The VMA pointer is
			 * not used again; it is looked up afresh after the
			 * lock has been retaken.
			 */
			get_file(file);
			mmap_read_unlock(mm);
			error = vfs_fsync_range(file, fstart, fend, 1);
			fput(file);
			/* The lock is not held here, hence "out". */
			if (error || start >= end)
				goto out;
			mmap_read_lock(mm);
			vma = find_vma(mm, start);
		} else {
			if (start >= end) {
				error = 0;
				goto out_unlock;
			}
			vma = find_vma(mm, vma->vm_end);
		}
	}
out_unlock:
	mmap_read_unlock(mm);
out:
	/* A real error takes precedence over the deferred hole error. */
	return error ? : unmapped_error;
}
56 56 64 9 56 56 18 56 56 431 429 213 14905 14936 14895 14944 3991 11695 11998 10811 12004 14727 14755 11069 11853 1269 1272 1266 12976 12996 10413 10750 612 613 609 2286 2288 2039 2046 3 3 3 3 3 3 4 4 4 10 10 3 3 3 3 4 4 10 10 3 3 3 3 2 2 1 5 2 3 3 1 1 53 53 546 681 134 134 143 143 143 143 6 4 2 8 2 6 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 
451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 
951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 
// SPDX-License-Identifier: GPL-2.0-only

#include <linux/export.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/sched/signal.h>
#include <linux/user_namespace.h>
#include <linux/proc_ns.h>
#include <linux/highuid.h>
#include <linux/cred.h>
#include <linux/securebits.h>
#include <linux/security.h>
#include <linux/keyctl.h>
#include <linux/key-type.h>
#include <keys/user-type.h>
#include <linux/seq_file.h>
#include <linux/fs.h>
#include <linux/uaccess.h>
#include <linux/ctype.h>
#include <linux/projid.h>
#include <linux/fs_struct.h>
#include <linux/bsearch.h>
#include <linux/sort.h>

/* Cache from which struct user_namespace objects are allocated and freed. */
static struct kmem_cache *user_ns_cachep __ro_after_init;
/* Taken around the copy of the parent's flags and around id map writes. */
static DEFINE_MUTEX(userns_state_mutex);

static bool new_idmap_permitted(const struct file *file,
				struct user_namespace *ns, int cap_setid,
				struct uid_gid_map *map);
static void free_user_ns(struct work_struct *work);

/* Thin wrapper: inc_ucount() on the UCOUNT_USER_NAMESPACES counter. */
static struct ucounts *inc_user_namespaces(struct user_namespace *ns, kuid_t uid)
{
	return inc_ucount(ns, uid, UCOUNT_USER_NAMESPACES);
}

/*
 * Thin wrapper: dec_ucount() on the UCOUNT_USER_NAMESPACES counter.
 * NOTE(review): "return <void expression>" in a void function relies on the
 * compiler accepting it; ISO C does not allow it.
 */
static void dec_user_namespaces(struct ucounts *ucounts)
{
	return dec_ucount(ucounts, UCOUNT_USER_NAMESPACES);
}

/*
 * Reset the capability sets and securebits of @cred and point it at @user_ns.
 * With CONFIG_KEYS, the request_key_auth key is also dropped.
 */
static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
{
	/* Start with the same capabilities as init but useless for doing
	 * anything as the capabilities are bound to the new user namespace.
	 */
	cred->securebits = SECUREBITS_DEFAULT;
	cred->cap_inheritable = CAP_EMPTY_SET;
	cred->cap_permitted = CAP_FULL_SET;
	cred->cap_effective = CAP_FULL_SET;
	cred->cap_ambient = CAP_EMPTY_SET;
	cred->cap_bset = CAP_FULL_SET;
#ifdef CONFIG_KEYS
	key_put(cred->request_key_auth);
	cred->request_key_auth = NULL;
#endif
	/* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */
	cred->user_ns = user_ns;
}

/*
 * Return the RLIMIT_NPROC value to apply to a new namespace: RLIM_INFINITY
 * when the caller is GLOBAL_ROOT_UID in init_user_ns, otherwise the caller's
 * current rlimit(RLIMIT_NPROC).
 */
static unsigned long enforced_nproc_rlimit(void)
{
	unsigned long limit = RLIM_INFINITY;

	/* Is RLIMIT_NPROC currently enforced? */
	if (!uid_eq(current_uid(), GLOBAL_ROOT_UID) ||
	    (current_user_ns() != &init_user_ns))
		limit = rlimit(RLIMIT_NPROC);

	return limit;
}

/*
 * Create a new user namespace, deriving the creator from the user in the
 * passed credentials, and replacing that user with the new root user for the
 * new namespace.
 *
 * This is called by copy_creds(), which will finish setting the target task's
 * credentials.
 *
 * Return: 0 on success.  -ENOSPC if the parent is nested deeper than 32
 * levels or the namespace count cannot be charged, -EPERM if the caller is
 * chrooted or the creator's euid/egid has no mapping in the parent, the
 * error from security_create_user_ns() or ns_alloc_inum(), or -ENOMEM if
 * the allocation or sysctl setup fails.
 */
int create_user_ns(struct cred *new)
{
	struct user_namespace *ns, *parent_ns = new->user_ns;
	kuid_t owner = new->euid;
	kgid_t group = new->egid;
	struct ucounts *ucounts;
	int ret, i;

	ret = -ENOSPC;
	if (parent_ns->level > 32)
		goto fail;

	/* ret is still -ENOSPC if the charge is refused. */
	ucounts = inc_user_namespaces(parent_ns, owner);
	if (!ucounts)
		goto fail;

	/*
	 * Verify that we can not violate the policy of which files
	 * may be accessed that is specified by the root directory,
	 * by verifying that the root directory is at the root of the
	 * mount namespace which allows all files to be accessed.
	 */
	ret = -EPERM;
	if (current_chrooted())
		goto fail_dec;

	/* The creator needs a mapping in the parent user namespace
	 * or else we won't be able to reasonably tell userspace who
	 * created a user_namespace.
	 */
	ret = -EPERM;
	if (!kuid_has_mapping(parent_ns, owner) ||
	    !kgid_has_mapping(parent_ns, group))
		goto fail_dec;

	ret = security_create_user_ns(new);
	if (ret < 0)
		goto fail_dec;

	ret = -ENOMEM;
	ns = kmem_cache_zalloc(user_ns_cachep, GFP_KERNEL);
	if (!ns)
		goto fail_dec;

	/* Recorded now; verify_root_map() consults it on a later map write. */
	ns->parent_could_setfcap = cap_raised(new->cap_effective, CAP_SETFCAP);
	ret = ns_alloc_inum(&ns->ns);
	if (ret)
		goto fail_free;
	ns->ns.ops = &userns_operations;

	refcount_set(&ns->ns.count, 1);
	/* Leave the new->user_ns reference with the new user namespace. */
	ns->parent = parent_ns;
	ns->level = parent_ns->level + 1;
	ns->owner = owner;
	ns->group = group;
	INIT_WORK(&ns->work, free_user_ns);
	/* Every counter starts at INT_MAX; four rlimits are set below. */
	for (i = 0; i < UCOUNT_COUNTS; i++) {
		ns->ucount_max[i] = INT_MAX;
	}
	set_userns_rlimit_max(ns, UCOUNT_RLIMIT_NPROC, enforced_nproc_rlimit());
	set_userns_rlimit_max(ns, UCOUNT_RLIMIT_MSGQUEUE, rlimit(RLIMIT_MSGQUEUE));
	set_userns_rlimit_max(ns, UCOUNT_RLIMIT_SIGPENDING, rlimit(RLIMIT_SIGPENDING));
	set_userns_rlimit_max(ns, UCOUNT_RLIMIT_MEMLOCK, rlimit(RLIMIT_MEMLOCK));
	ns->ucounts = ucounts;

	/* Inherit USERNS_SETGROUPS_ALLOWED from our parent */
	mutex_lock(&userns_state_mutex);
	ns->flags = parent_ns->flags;
	mutex_unlock(&userns_state_mutex);

#ifdef CONFIG_KEYS
	INIT_LIST_HEAD(&ns->keyring_name_list);
	init_rwsem(&ns->keyring_sem);
#endif
	ret = -ENOMEM;
	if (!setup_userns_sysctls(ns))
		goto fail_keyring;

	set_cred_user_ns(new, ns);
	return 0;

	/* Each label falls through to the ones below it. */
fail_keyring:
#ifdef CONFIG_PERSISTENT_KEYRINGS
	key_put(ns->persistent_keyring_register);
#endif
	ns_free_inum(&ns->ns);
fail_free:
	kmem_cache_free(user_ns_cachep, ns);
fail_dec:
	dec_user_namespaces(ucounts);
fail:
	return ret;
}

/*
 * Handle CLONE_NEWUSER for unshare().
 *
 * Returns 0 without touching @new_cred when CLONE_NEWUSER is not set in
 * @unshare_flags.  Otherwise prepares new credentials and creates a user
 * namespace for them: on success the credentials are stored in *@new_cred,
 * on failure they are released.  Returns -ENOMEM if prepare_creds() fails,
 * else the result of create_user_ns().
 */
int unshare_userns(unsigned long unshare_flags, struct cred **new_cred)
{
	struct cred *cred;
	int err = -ENOMEM;

	if (!(unshare_flags & CLONE_NEWUSER))
		return 0;

	cred = prepare_creds();
	if (cred) {
		err = create_user_ns(cred);
		if (err)
			put_cred(cred);
		else
			*new_cred = cred;
	}

	return err;
}

/*
 * Work handler that tears down a user namespace.
 *
 * Frees the namespace embedding @work, then drops the reference it held on
 * its parent.  If that was the parent's last reference, the parent is torn
 * down in the same loop, and so on up the chain.
 */
static void free_user_ns(struct work_struct *work)
{
	struct user_namespace *parent, *ns =
		container_of(work, struct user_namespace, work);

	do {
		/* Read these before ns is freed below. */
		struct ucounts *ucounts = ns->ucounts;
		parent = ns->parent;
		/* Maps above the base size own separately allocated arrays. */
		if (ns->gid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
			kfree(ns->gid_map.forward);
			kfree(ns->gid_map.reverse);
		}
		if (ns->uid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
			kfree(ns->uid_map.forward);
			kfree(ns->uid_map.reverse);
		}
		if (ns->projid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
			kfree(ns->projid_map.forward);
			kfree(ns->projid_map.reverse);
		}
#if IS_ENABLED(CONFIG_BINFMT_MISC)
		kfree(ns->binfmt_misc);
#endif
		retire_userns_sysctls(ns);
		key_free_user_ns(ns);
		ns_free_inum(&ns->ns);
		kmem_cache_free(user_ns_cachep, ns);
		dec_user_namespaces(ucounts);
		ns = parent;
	} while (refcount_dec_and_test(&parent->ns.count));
}

/* Queue the namespace's work item, whose handler is free_user_ns(). */
void __put_user_ns(struct user_namespace *ns)
{
	schedule_work(&ns->work);
}
EXPORT_SYMBOL(__put_user_ns);

/*
 * struct idmap_key - holds the information necessary to find an idmapping in a
 * sorted idmap array. It is passed to cmp_map_id() as first argument.
 */
struct idmap_key {
	bool map_up; /* true  -> id from kid; false -> kid from id */
	u32 id; /* id to find */
	u32 count; /* number of consecutive ids, starting at id, to cover */
};

/*
 * cmp_map_id - Function to be passed to bsearch() to find the requested
 * idmapping. Expects struct idmap_key to be passed via @k.
 *
 * Returns 0 when the whole range [id, id + count - 1] lies inside extent @e,
 * -1 when the range starts or ends below the extent, and 1 otherwise.
 */
static int cmp_map_id(const void *k, const void *e)
{
	u32 first, last, id2;
	const struct idmap_key *key = k;
	const struct uid_gid_extent *el = e;

	/* Last id of the requested range. */
	id2 = key->id + key->count - 1;

	/* handle map_id_{down,up}() */
	if (key->map_up)
		first = el->lower_first;
	else
		first = el->first;

	last = first + el->count - 1;

	if (key->id >= first && key->id <= last &&
	    (id2 >= first && id2 <= last))
		return 0;

	if (key->id < first || id2 < first)
		return -1;

	return 1;
}

/*
 * map_id_range_down_max - Find idmap via binary search in ordered idmap array.
 * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS.
*/ static struct uid_gid_extent * map_id_range_down_max(unsigned extents, struct uid_gid_map *map, u32 id, u32 count) { struct idmap_key key; key.map_up = false; key.count = count; key.id = id; return bsearch(&key, map->forward, extents, sizeof(struct uid_gid_extent), cmp_map_id); } /* * map_id_range_down_base - Find idmap via binary search in static extent array. * Can only be called if number of mappings is equal or less than * UID_GID_MAP_MAX_BASE_EXTENTS. */ static struct uid_gid_extent * map_id_range_down_base(unsigned extents, struct uid_gid_map *map, u32 id, u32 count) { unsigned idx; u32 first, last, id2; id2 = id + count - 1; /* Find the matching extent */ for (idx = 0; idx < extents; idx++) { first = map->extent[idx].first; last = first + map->extent[idx].count - 1; if (id >= first && id <= last && (id2 >= first && id2 <= last)) return &map->extent[idx]; } return NULL; } static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count) { struct uid_gid_extent *extent; unsigned extents = map->nr_extents; smp_rmb(); if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) extent = map_id_range_down_base(extents, map, id, count); else extent = map_id_range_down_max(extents, map, id, count); /* Map the id or note failure */ if (extent) id = (id - extent->first) + extent->lower_first; else id = (u32) -1; return id; } u32 map_id_down(struct uid_gid_map *map, u32 id) { return map_id_range_down(map, id, 1); } /* * map_id_up_base - Find idmap via binary search in static extent array. * Can only be called if number of mappings is equal or less than * UID_GID_MAP_MAX_BASE_EXTENTS. 
*/ static struct uid_gid_extent * map_id_range_up_base(unsigned extents, struct uid_gid_map *map, u32 id, u32 count) { unsigned idx; u32 first, last, id2; id2 = id + count - 1; /* Find the matching extent */ for (idx = 0; idx < extents; idx++) { first = map->extent[idx].lower_first; last = first + map->extent[idx].count - 1; if (id >= first && id <= last && (id2 >= first && id2 <= last)) return &map->extent[idx]; } return NULL; } /* * map_id_up_max - Find idmap via binary search in ordered idmap array. * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. */ static struct uid_gid_extent * map_id_range_up_max(unsigned extents, struct uid_gid_map *map, u32 id, u32 count) { struct idmap_key key; key.map_up = true; key.count = count; key.id = id; return bsearch(&key, map->reverse, extents, sizeof(struct uid_gid_extent), cmp_map_id); } u32 map_id_range_up(struct uid_gid_map *map, u32 id, u32 count) { struct uid_gid_extent *extent; unsigned extents = map->nr_extents; smp_rmb(); if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) extent = map_id_range_up_base(extents, map, id, count); else extent = map_id_range_up_max(extents, map, id, count); /* Map the id or note failure */ if (extent) id = (id - extent->lower_first) + extent->first; else id = (u32) -1; return id; } u32 map_id_up(struct uid_gid_map *map, u32 id) { return map_id_range_up(map, id, 1); } /** * make_kuid - Map a user-namespace uid pair into a kuid. * @ns: User namespace that the uid is in * @uid: User identifier * * Maps a user-namespace uid pair into a kernel internal kuid, * and returns that kuid. * * When there is no mapping defined for the user-namespace uid * pair INVALID_UID is returned. Callers are expected to test * for and handle INVALID_UID being returned. INVALID_UID * may be tested for using uid_valid(). 
*/ kuid_t make_kuid(struct user_namespace *ns, uid_t uid) { /* Map the uid to a global kernel uid */ return KUIDT_INIT(map_id_down(&ns->uid_map, uid)); } EXPORT_SYMBOL(make_kuid); /** * from_kuid - Create a uid from a kuid user-namespace pair. * @targ: The user namespace we want a uid in. * @kuid: The kernel internal uid to start with. * * Map @kuid into the user-namespace specified by @targ and * return the resulting uid. * * There is always a mapping into the initial user_namespace. * * If @kuid has no mapping in @targ (uid_t)-1 is returned. */ uid_t from_kuid(struct user_namespace *targ, kuid_t kuid) { /* Map the uid from a global kernel uid */ return map_id_up(&targ->uid_map, __kuid_val(kuid)); } EXPORT_SYMBOL(from_kuid); /** * from_kuid_munged - Create a uid from a kuid user-namespace pair. * @targ: The user namespace we want a uid in. * @kuid: The kernel internal uid to start with. * * Map @kuid into the user-namespace specified by @targ and * return the resulting uid. * * There is always a mapping into the initial user_namespace. * * Unlike from_kuid from_kuid_munged never fails and always * returns a valid uid. This makes from_kuid_munged appropriate * for use in syscalls like stat and getuid where failing the * system call and failing to provide a valid uid are not an * options. * * If @kuid has no mapping in @targ overflowuid is returned. */ uid_t from_kuid_munged(struct user_namespace *targ, kuid_t kuid) { uid_t uid; uid = from_kuid(targ, kuid); if (uid == (uid_t) -1) uid = overflowuid; return uid; } EXPORT_SYMBOL(from_kuid_munged); /** * make_kgid - Map a user-namespace gid pair into a kgid. * @ns: User namespace that the gid is in * @gid: group identifier * * Maps a user-namespace gid pair into a kernel internal kgid, * and returns that kgid. * * When there is no mapping defined for the user-namespace gid * pair INVALID_GID is returned. Callers are expected to test * for and handle INVALID_GID being returned. 
INVALID_GID may be * tested for using gid_valid(). */ kgid_t make_kgid(struct user_namespace *ns, gid_t gid) { /* Map the gid to a global kernel gid */ return KGIDT_INIT(map_id_down(&ns->gid_map, gid)); } EXPORT_SYMBOL(make_kgid); /** * from_kgid - Create a gid from a kgid user-namespace pair. * @targ: The user namespace we want a gid in. * @kgid: The kernel internal gid to start with. * * Map @kgid into the user-namespace specified by @targ and * return the resulting gid. * * There is always a mapping into the initial user_namespace. * * If @kgid has no mapping in @targ (gid_t)-1 is returned. */ gid_t from_kgid(struct user_namespace *targ, kgid_t kgid) { /* Map the gid from a global kernel gid */ return map_id_up(&targ->gid_map, __kgid_val(kgid)); } EXPORT_SYMBOL(from_kgid); /** * from_kgid_munged - Create a gid from a kgid user-namespace pair. * @targ: The user namespace we want a gid in. * @kgid: The kernel internal gid to start with. * * Map @kgid into the user-namespace specified by @targ and * return the resulting gid. * * There is always a mapping into the initial user_namespace. * * Unlike from_kgid from_kgid_munged never fails and always * returns a valid gid. This makes from_kgid_munged appropriate * for use in syscalls like stat and getgid where failing the * system call and failing to provide a valid gid are not options. * * If @kgid has no mapping in @targ overflowgid is returned. */ gid_t from_kgid_munged(struct user_namespace *targ, kgid_t kgid) { gid_t gid; gid = from_kgid(targ, kgid); if (gid == (gid_t) -1) gid = overflowgid; return gid; } EXPORT_SYMBOL(from_kgid_munged); /** * make_kprojid - Map a user-namespace projid pair into a kprojid. * @ns: User namespace that the projid is in * @projid: Project identifier * * Maps a user-namespace uid pair into a kernel internal kuid, * and returns that kuid. * * When there is no mapping defined for the user-namespace projid * pair INVALID_PROJID is returned. 
Callers are expected to test * for and handle INVALID_PROJID being returned. INVALID_PROJID * may be tested for using projid_valid(). */ kprojid_t make_kprojid(struct user_namespace *ns, projid_t projid) { /* Map the uid to a global kernel uid */ return KPROJIDT_INIT(map_id_down(&ns->projid_map, projid)); } EXPORT_SYMBOL(make_kprojid); /** * from_kprojid - Create a projid from a kprojid user-namespace pair. * @targ: The user namespace we want a projid in. * @kprojid: The kernel internal project identifier to start with. * * Map @kprojid into the user-namespace specified by @targ and * return the resulting projid. * * There is always a mapping into the initial user_namespace. * * If @kprojid has no mapping in @targ (projid_t)-1 is returned. */ projid_t from_kprojid(struct user_namespace *targ, kprojid_t kprojid) { /* Map the uid from a global kernel uid */ return map_id_up(&targ->projid_map, __kprojid_val(kprojid)); } EXPORT_SYMBOL(from_kprojid); /** * from_kprojid_munged - Create a projiid from a kprojid user-namespace pair. * @targ: The user namespace we want a projid in. * @kprojid: The kernel internal projid to start with. * * Map @kprojid into the user-namespace specified by @targ and * return the resulting projid. * * There is always a mapping into the initial user_namespace. * * Unlike from_kprojid from_kprojid_munged never fails and always * returns a valid projid. This makes from_kprojid_munged * appropriate for use in syscalls like stat and where * failing the system call and failing to provide a valid projid are * not an options. * * If @kprojid has no mapping in @targ OVERFLOW_PROJID is returned. 
*/
projid_t from_kprojid_munged(struct user_namespace *targ, kprojid_t kprojid)
{
	projid_t projid;
	projid = from_kprojid(targ, kprojid);

	/* Substitute OVERFLOW_PROJID for "no mapping". */
	if (projid == (projid_t) -1)
		projid = OVERFLOW_PROJID;
	return projid;
}
EXPORT_SYMBOL(from_kprojid_munged);


/*
 * seq_file ->show for the uid map: print one extent as "first lower count".
 *
 * The lower id is printed as seen from the reader's user namespace; when the
 * reader is in the namespace whose map is shown and that namespace has a
 * parent, the parent's view is used instead.
 */
static int uid_m_show(struct seq_file *seq, void *v)
{
	struct user_namespace *ns = seq->private;
	struct uid_gid_extent *extent = v;
	struct user_namespace *lower_ns;
	uid_t lower;

	lower_ns = seq_user_ns(seq);
	if ((lower_ns == ns) && lower_ns->parent)
		lower_ns = lower_ns->parent;

	lower = from_kuid(lower_ns, KUIDT_INIT(extent->lower_first));

	seq_printf(seq, "%10u %10u %10u\n",
		extent->first,
		lower,
		extent->count);

	return 0;
}

/* seq_file ->show for the gid map; same scheme as uid_m_show(). */
static int gid_m_show(struct seq_file *seq, void *v)
{
	struct user_namespace *ns = seq->private;
	struct uid_gid_extent *extent = v;
	struct user_namespace *lower_ns;
	gid_t lower;

	lower_ns = seq_user_ns(seq);
	if ((lower_ns == ns) && lower_ns->parent)
		lower_ns = lower_ns->parent;

	lower = from_kgid(lower_ns, KGIDT_INIT(extent->lower_first));

	seq_printf(seq, "%10u %10u %10u\n",
		extent->first,
		lower,
		extent->count);

	return 0;
}

/* seq_file ->show for the projid map; same scheme as uid_m_show(). */
static int projid_m_show(struct seq_file *seq, void *v)
{
	struct user_namespace *ns = seq->private;
	struct uid_gid_extent *extent = v;
	struct user_namespace *lower_ns;
	projid_t lower;

	lower_ns = seq_user_ns(seq);
	if ((lower_ns == ns) && lower_ns->parent)
		lower_ns = lower_ns->parent;

	lower = from_kprojid(lower_ns, KPROJIDT_INIT(extent->lower_first));

	seq_printf(seq, "%10u %10u %10u\n",
		extent->first,
		lower,
		extent->count);

	return 0;
}

/*
 * Common ->start helper: return the extent at index *@ppos of @map, or NULL
 * once the index reaches the number of extents.  Small maps are read from the
 * inline extent array, larger ones from the forward array.
 */
static void *m_start(struct seq_file *seq, loff_t *ppos,
		     struct uid_gid_map *map)
{
	loff_t pos = *ppos;
	unsigned extents = map->nr_extents;
	/* Read the extent count before any of the extent contents. */
	smp_rmb();

	if (pos >= extents)
		return NULL;

	if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
		return &map->extent[pos];

	return &map->forward[pos];
}

/* ->start for the uid map of the namespace stored in seq->private. */
static void *uid_m_start(struct seq_file *seq, loff_t *ppos)
{
	struct user_namespace *ns = seq->private;

	return m_start(seq, ppos, &ns->uid_map);
}

/* ->start for the gid map of the namespace stored in seq->private. */
static void *gid_m_start(struct seq_file *seq, loff_t *ppos)
{
	struct user_namespace *ns = seq->private;

	return m_start(seq, ppos, &ns->gid_map);
}

/* ->start for the projid map of the namespace stored in seq->private. */
static void *projid_m_start(struct seq_file *seq, loff_t *ppos)
{
	struct user_namespace *ns = seq->private;

	return m_start(seq, ppos, &ns->projid_map);
}

/* ->next: advance the position and reuse the matching ->start handler. */
static void *m_next(struct seq_file *seq, void *v, loff_t *pos)
{
	(*pos)++;
	return seq->op->start(seq, pos);
}

/* ->stop: nothing to release. */
static void m_stop(struct seq_file *seq, void *v)
{
	return;
}

/* Iterator tables for the uid, gid and projid map files. */
const struct seq_operations proc_uid_seq_operations = {
	.start = uid_m_start,
	.stop = m_stop,
	.next = m_next,
	.show = uid_m_show,
};

const struct seq_operations proc_gid_seq_operations = {
	.start = gid_m_start,
	.stop = m_stop,
	.next = m_next,
	.show = gid_m_show,
};

const struct seq_operations proc_projid_seq_operations = {
	.start = projid_m_start,
	.stop = m_stop,
	.next = m_next,
	.show = projid_m_show,
};

/*
 * Return true if @extent intersects any extent already in @new_map, on
 * either the upper (first) or the lower (lower_first) side.
 */
static bool mappings_overlap(struct uid_gid_map *new_map,
			     struct uid_gid_extent *extent)
{
	u32 upper_first, lower_first, upper_last, lower_last;
	unsigned idx;

	upper_first = extent->first;
	lower_first = extent->lower_first;
	upper_last = upper_first + extent->count - 1;
	lower_last = lower_first + extent->count - 1;

	for (idx = 0; idx < new_map->nr_extents; idx++) {
		u32 prev_upper_first, prev_lower_first;
		u32 prev_upper_last, prev_lower_last;
		struct uid_gid_extent *prev;

		/* Pick the array that holds the extents at this size. */
		if (new_map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
			prev = &new_map->extent[idx];
		else
			prev = &new_map->forward[idx];

		prev_upper_first = prev->first;
		prev_lower_first = prev->lower_first;
		prev_upper_last = prev_upper_first + prev->count - 1;
		prev_lower_last = prev_lower_first + prev->count - 1;

		/* Does the upper range intersect a previous extent? */
		if ((prev_upper_first <= upper_last) &&
		    (prev_upper_last >= upper_first))
			return true;

		/* Does the lower range intersect a previous extent? */
		if ((prev_lower_first <= lower_last) &&
		    (prev_lower_last >= lower_first))
			return true;
	}
	return false;
}

/*
 * insert_extent - Safely insert a new idmap extent into struct uid_gid_map.
 * Takes care to allocate a 4K block of memory if the number of mappings exceeds
 * UID_GID_MAP_MAX_BASE_EXTENTS.
 *
 * Returns 0 on success or -ENOMEM if that allocation fails.
 */
static int insert_extent(struct uid_gid_map *map, struct uid_gid_extent *extent)
{
	struct uid_gid_extent *dest;

	/* The inline array is full: move to a heap-allocated forward array. */
	if (map->nr_extents == UID_GID_MAP_MAX_BASE_EXTENTS) {
		struct uid_gid_extent *forward;

		/* Allocate memory for 340 mappings. */
		forward = kmalloc_array(UID_GID_MAP_MAX_EXTENTS,
					sizeof(struct uid_gid_extent),
					GFP_KERNEL);
		if (!forward)
			return -ENOMEM;

		/* Copy over memory. Only set up memory for the forward pointer.
		 * Defer the memory setup for the reverse pointer.
		 */
		memcpy(forward, map->extent,
		       map->nr_extents * sizeof(map->extent[0]));

		map->forward = forward;
		map->reverse = NULL;
	}

	if (map->nr_extents < UID_GID_MAP_MAX_BASE_EXTENTS)
		dest = &map->extent[map->nr_extents];
	else
		dest = &map->forward[map->nr_extents];

	*dest = *extent;
	map->nr_extents++;
	return 0;
}

/* cmp function to sort() forward mappings */
static int cmp_extents_forward(const void *a, const void *b)
{
	const struct uid_gid_extent *e1 = a;
	const struct uid_gid_extent *e2 = b;

	if (e1->first < e2->first)
		return -1;

	if (e1->first > e2->first)
		return 1;

	return 0;
}

/* cmp function to sort() reverse mappings */
static int cmp_extents_reverse(const void *a, const void *b)
{
	const struct uid_gid_extent *e1 = a;
	const struct uid_gid_extent *e2 = b;

	if (e1->lower_first < e2->lower_first)
		return -1;

	if (e1->lower_first > e2->lower_first)
		return 1;

	return 0;
}

/*
 * sort_idmaps - Sorts an array of idmap entries.
 * Does nothing and returns 0 unless the number of mappings exceeds
 * UID_GID_MAP_MAX_BASE_EXTENTS.  Otherwise sorts the forward array by first,
 * duplicates it into the reverse array and sorts that by lower_first.
 * Returns -ENOMEM if the duplicate cannot be allocated.
 */
static int sort_idmaps(struct uid_gid_map *map)
{
	if (map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
		return 0;

	/* Sort forward array. */
	sort(map->forward, map->nr_extents, sizeof(struct uid_gid_extent),
	     cmp_extents_forward, NULL);

	/* Only copy the memory from forward we actually need. */
	map->reverse = kmemdup_array(map->forward, map->nr_extents,
				     sizeof(struct uid_gid_extent), GFP_KERNEL);
	if (!map->reverse)
		return -ENOMEM;

	/* Sort reverse array. */
	sort(map->reverse, map->nr_extents, sizeof(struct uid_gid_extent),
	     cmp_extents_reverse, NULL);

	return 0;
}

/**
 * verify_root_map() - check the uid 0 mapping
 * @file: idmapping file
 * @map_ns: user namespace of the target process
 * @new_map: requested idmap
 *
 * If a process requests mapping parent uid 0 into the new ns, verify that the
 * process writing the map had the CAP_SETFCAP capability as the target process
 * will be able to write fscaps that are valid in ancestor user namespaces.
 *
 * Return: true if the mapping is allowed, false if not.
 */
static bool verify_root_map(const struct file *file,
			    struct user_namespace *map_ns,
			    struct uid_gid_map *new_map)
{
	int idx;
	const struct user_namespace *file_ns = file->f_cred->user_ns;
	struct uid_gid_extent *extent0 = NULL;

	/* Look for an extent whose lower range starts at id 0. */
	for (idx = 0; idx < new_map->nr_extents; idx++) {
		if (new_map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
			extent0 = &new_map->extent[idx];
		else
			extent0 = &new_map->forward[idx];
		if (extent0->lower_first == 0)
			break;

		extent0 = NULL;
	}

	/* No such extent: nothing to verify. */
	if (!extent0)
		return true;

	if (map_ns == file_ns) {
		/* The process unshared its ns and is writing to its own
		 * /proc/self/uid_map.  User already has full capabilities in
		 * the new namespace.  Verify that the parent had CAP_SETFCAP
		 * when it unshared.
		 * */
		if (!file_ns->parent_could_setfcap)
			return false;
	} else {
		/* Process p1 is writing to uid_map of p2, who is in a child
		 * user namespace to p1's.  Verify that the opener of the map
		 * file has CAP_SETFCAP against the parent of the new map
		 * namespace */
		if (!file_ns_capable(file, map_ns->parent, CAP_SETFCAP))
			return false;
	}

	return true;
}

/*
 * map_write - parse and install an id map written to a map file
 * @file: the opened map file; its seq_file carries the target namespace
 * @buf: user buffer with one "first lower_first count" triple per line
 * @count: number of bytes in @buf; must be less than PAGE_SIZE
 * @ppos: file position; must be 0
 * @cap_setid: capability the caller names for this map type
 * @map: the map to fill in; must still be empty
 * @parent_map: the parent namespace's map, used to translate the lower ids
 *
 * Return: @count on success.  -EINVAL for a bad position, size or input
 * line, -EPERM if the map is already set, the writer lacks the required
 * capability or permission, or a lower range cannot be translated, -ENOMEM
 * on allocation failure, or the error from memdup_user_nul().
 */
static ssize_t map_write(struct file *file, const char __user *buf,
			 size_t count, loff_t *ppos,
			 int cap_setid,
			 struct uid_gid_map *map,
			 struct uid_gid_map *parent_map)
{
	struct seq_file *seq = file->private_data;
	struct user_namespace *map_ns = seq->private;
	struct uid_gid_map new_map;
	unsigned idx;
	struct uid_gid_extent extent;
	char *kbuf, *pos, *next_line;
	ssize_t ret;

	/* Only allow < page size writes at the beginning of the file */
	if ((*ppos != 0) || (count >= PAGE_SIZE))
		return -EINVAL;

	/* Slurp in the user data */
	kbuf = memdup_user_nul(buf, count);
	if (IS_ERR(kbuf))
		return PTR_ERR(kbuf);

	/*
	 * The userns_state_mutex serializes all writes to any given map.
	 *
	 * Any map is only ever written once.
	 *
	 * An id map fits within 1 cache line on most architectures.
	 *
	 * On read nothing needs to be done unless you are on an
	 * architecture with a crazy cache coherency model like alpha.
	 *
	 * There is a one time data dependency between reading the
	 * count of the extents and the values of the extents.  The
	 * desired behavior is to see the values of the extents that
	 * were written before the count of the extents.
	 *
	 * To achieve this smp_wmb() is used to guarantee the write
	 * order and smp_rmb() guarantees that we don't have crazy
	 * architectures returning stale data.
	 */
	mutex_lock(&userns_state_mutex);

	memset(&new_map, 0, sizeof(struct uid_gid_map));

	ret = -EPERM;
	/* Only allow one successful write to the map */
	if (map->nr_extents != 0)
		goto out;

	/*
	 * Adjusting namespace settings requires capabilities on the target.
	 */
	if (cap_valid(cap_setid) && !file_ns_capable(file, map_ns, CAP_SYS_ADMIN))
		goto out;

	/* Parse the user data */
	ret = -EINVAL;
	pos = kbuf;
	for (; pos; pos = next_line) {

		/* Find the end of line and ensure I don't look past it */
		next_line = strchr(pos, '\n');
		if (next_line) {
			*next_line = '\0';
			next_line++;
			/* A trailing newline does not start another line. */
			if (*next_line == '\0')
				next_line = NULL;
		}

		pos = skip_spaces(pos);
		extent.first = simple_strtoul(pos, &pos, 10);
		if (!isspace(*pos))
			goto out;

		pos = skip_spaces(pos);
		extent.lower_first = simple_strtoul(pos, &pos, 10);
		if (!isspace(*pos))
			goto out;

		pos = skip_spaces(pos);
		extent.count = simple_strtoul(pos, &pos, 10);
		/* The count may be the last thing on the line. */
		if (*pos && !isspace(*pos))
			goto out;

		/* Verify there is not trailing junk on the line */
		pos = skip_spaces(pos);
		if (*pos != '\0')
			goto out;

		/* Verify we have been given valid starting values */
		if ((extent.first == (u32) -1) ||
		    (extent.lower_first == (u32) -1))
			goto out;

		/* Verify count is not zero and does not cause the
		 * extent to wrap
		 */
		if ((extent.first + extent.count) <= extent.first)
			goto out;
		if ((extent.lower_first + extent.count) <=
		     extent.lower_first)
			goto out;

		/* Do the ranges in extent overlap any previous extents? */
		if (mappings_overlap(&new_map, &extent))
			goto out;

		/* Reject input that would exceed the maximum map size. */
		if ((new_map.nr_extents + 1) == UID_GID_MAP_MAX_EXTENTS &&
		    (next_line != NULL))
			goto out;

		ret = insert_extent(&new_map, &extent);
		if (ret < 0)
			goto out;
		/* Restore the default error for the next line. */
		ret = -EINVAL;
	}
	/* Be very certain the new map actually exists */
	if (new_map.nr_extents == 0)
		goto out;

	ret = -EPERM;
	/* Validate the user is allowed to use user id's mapped to. */
	if (!new_idmap_permitted(file, map_ns, cap_setid, &new_map))
		goto out;

	ret = -EPERM;
	/* Map the lower ids from the parent user namespace to the
	 * kernel global id space.
	 */
	for (idx = 0; idx < new_map.nr_extents; idx++) {
		struct uid_gid_extent *e;
		u32 lower_first;

		if (new_map.nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
			e = &new_map.extent[idx];
		else
			e = &new_map.forward[idx];

		lower_first = map_id_range_down(parent_map,
						e->lower_first,
						e->count);

		/* Fail if we can not map the specified extent to
		 * the kernel global id space.
		 */
		if (lower_first == (u32) -1)
			goto out;

		e->lower_first = lower_first;
	}

	/*
	 * If we want to use binary search for lookup, this clones the extent
	 * array and sorts both copies.
	 */
	ret = sort_idmaps(&new_map);
	if (ret < 0)
		goto out;

	/* Install the map */
	if (new_map.nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) {
		memcpy(map->extent, new_map.extent,
		       new_map.nr_extents * sizeof(new_map.extent[0]));
	} else {
		map->forward = new_map.forward;
		map->reverse = new_map.reverse;
	}
	/* Publish the extents before the count that makes them visible. */
	smp_wmb();
	map->nr_extents = new_map.nr_extents;

	*ppos = count;
	ret = count;
out:
	/* On failure, free any heap arrays and leave the target map empty. */
	if (ret < 0 && new_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
		kfree(new_map.forward);
		kfree(new_map.reverse);
		map->forward = NULL;
		map->reverse = NULL;
		map->nr_extents = 0;
	}

	mutex_unlock(&userns_state_mutex);
	kfree(kbuf);
	return ret;
}

/*
 * Write handler for the uid map file.
 *
 * Fails with -EPERM if the target namespace has no parent, or if the opener's
 * user namespace is neither the target namespace nor its parent.  Otherwise
 * hands over to map_write() with CAP_SETUID and the parent's uid map.
 */
ssize_t proc_uid_map_write(struct file *file, const char __user *buf,
			   size_t size, loff_t *ppos)
{
	struct seq_file *seq = file->private_data;
	struct user_namespace *ns = seq->private;
	struct user_namespace *seq_ns = seq_user_ns(seq);

	if (!ns->parent)
		return -EPERM;

	if ((seq_ns != ns) && (seq_ns != ns->parent))
		return -EPERM;

	return map_write(file, buf, size, ppos, CAP_SETUID,
			 &ns->uid_map, &ns->parent->uid_map);
}

/*
 * Write handler for the gid map file.
 *
 * Fails with -EPERM if the target namespace has no parent, or if the opener's
 * user namespace is neither the target namespace nor its parent.  Otherwise
 * hands over to map_write() with CAP_SETGID and the parent's gid map.
 */
ssize_t proc_gid_map_write(struct file *file, const char __user *buf,
			   size_t size, loff_t *ppos)
{
	struct seq_file *seq = file->private_data;
	struct user_namespace *ns = seq->private;
	struct user_namespace *seq_ns = seq_user_ns(seq);

	if (!ns->parent)
		return -EPERM;

	if ((seq_ns != ns) && (seq_ns != ns->parent))
		return -EPERM;

	return map_write(file, buf, size, ppos, CAP_SETGID,
			 &ns->gid_map, &ns->parent->gid_map);
}
/*
 * Write handler for a user namespace's project id map. Performs the same
 * namespace checks as the uid/gid handlers, then calls map_write() with
 * -1 as the capability, which map_write() and new_idmap_permitted() treat
 * as "no capability required" via cap_valid().
 */
ssize_t proc_projid_map_write(struct file *file, const char __user *buf,
			      size_t size, loff_t *ppos)
{
	struct seq_file *seq = file->private_data;
	struct user_namespace *ns = seq->private;
	struct user_namespace *seq_ns = seq_user_ns(seq);

	if (!ns->parent)
		return -EPERM;

	if ((seq_ns != ns) && (seq_ns != ns->parent))
		return -EPERM;

	/* Anyone can set any valid project id no capability needed */
	return map_write(file, buf, size, ppos, -1,
			 &ns->projid_map, &ns->parent->projid_map);
}

/*
 * new_idmap_permitted() - decide whether the opener of @file may install
 * @new_map in @ns.
 * @file: the map file; its opener credentials (file->f_cred) are checked
 * @ns: user namespace the map is being installed in
 * @cap_setid: CAP_SETUID, CAP_SETGID, or a value for which cap_valid() fails
 * @new_map: the parsed map, before its lower ids are translated
 *
 * Return: true if the mapping may be installed, false otherwise.
 */
static bool new_idmap_permitted(const struct file *file,
				struct user_namespace *ns, int cap_setid,
				struct uid_gid_map *new_map)
{
	const struct cred *cred = file->f_cred;

	/* uid maps that map lower id 0 need the extra CAP_SETFCAP check. */
	if (cap_setid == CAP_SETUID && !verify_root_map(file, ns, new_map))
		return false;

	/* Don't allow mappings that would allow anything that wouldn't
	 * be allowed without the establishment of unprivileged mappings.
	 */
	if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1) &&
	    uid_eq(ns->owner, cred->euid)) {
		u32 id = new_map->extent[0].lower_first;

		if (cap_setid == CAP_SETUID) {
			/* A single-id map onto the opener's own euid. */
			kuid_t uid = make_kuid(ns->parent, id);

			if (uid_eq(uid, cred->euid))
				return true;
		} else if (cap_setid == CAP_SETGID) {
			/* A single-id map onto the opener's own egid, only
			 * once USERNS_SETGROUPS_ALLOWED has been cleared.
			 */
			kgid_t gid = make_kgid(ns->parent, id);

			if (!(ns->flags & USERNS_SETGROUPS_ALLOWED) &&
			    gid_eq(gid, cred->egid))
				return true;
		}
	}

	/* Allow anyone to set a mapping that doesn't require privilege */
	if (!cap_valid(cap_setid))
		return true;

	/* Allow the specified ids if we have the appropriate capability
	 * (CAP_SETUID or CAP_SETGID) over the parent user namespace.
	 * And the opener of the id file also has the appropriate capability.
	 */
	if (ns_capable(ns->parent, cap_setid) &&
	    file_ns_capable(file, ns->parent, cap_setid))
		return true;

	return false;
}

/*
 * seq_file show handler: prints "allow" or "deny" depending on whether
 * USERNS_SETGROUPS_ALLOWED is set in the namespace flags.
 */
int proc_setgroups_show(struct seq_file *seq, void *v)
{
	struct user_namespace *ns = seq->private;
	unsigned long userns_flags = READ_ONCE(ns->flags);

	seq_printf(seq, "%s\n",
		   (userns_flags & USERNS_SETGROUPS_ALLOWED) ?
		   "allow" : "deny");
	return 0;
}

/*
 * Write handler for the setgroups control file. Accepts "allow" or "deny"
 * (optionally followed by whitespace) written at offset 0.
 *
 * Return: @count on success; -EINVAL for malformed input, -EFAULT if the
 * user buffer cannot be copied, -EPERM if the requested transition is not
 * permitted.
 */
ssize_t proc_setgroups_write(struct file *file, const char __user *buf,
			     size_t count, loff_t *ppos)
{
	struct seq_file *seq = file->private_data;
	struct user_namespace *ns = seq->private;
	char kbuf[8], *pos;
	bool setgroups_allowed;
	ssize_t ret;

	/* Only allow a very narrow range of strings to be written */
	ret = -EINVAL;
	if ((*ppos != 0) || (count >= sizeof(kbuf)))
		goto out;

	/* What was written? */
	ret = -EFAULT;
	if (copy_from_user(kbuf, buf, count))
		goto out;
	/* count < sizeof(kbuf) was checked above, so this stays in bounds. */
	kbuf[count] = '\0';
	pos = kbuf;

	/* What is being requested? */
	ret = -EINVAL;
	if (strncmp(pos, "allow", 5) == 0) {
		pos += 5;
		setgroups_allowed = true;
	}
	else if (strncmp(pos, "deny", 4) == 0) {
		pos += 4;
		setgroups_allowed = false;
	}
	else
		goto out;

	/* Verify there is not trailing junk on the line */
	pos = skip_spaces(pos);
	if (*pos != '\0')
		goto out;

	ret = -EPERM;
	mutex_lock(&userns_state_mutex);
	if (setgroups_allowed) {
		/* Enabling setgroups after setgroups has been disabled
		 * is not allowed.
		 */
		if (!(ns->flags & USERNS_SETGROUPS_ALLOWED))
			goto out_unlock;
	} else {
		/* Permanently disabling setgroups after setgroups has
		 * been enabled by writing the gid_map is not allowed.
		 */
		if (ns->gid_map.nr_extents != 0)
			goto out_unlock;
		ns->flags &= ~USERNS_SETGROUPS_ALLOWED;
	}
	mutex_unlock(&userns_state_mutex);

	/* Report a successful write */
	*ppos = count;
	ret = count;
out:
	return ret;
out_unlock:
	mutex_unlock(&userns_state_mutex);
	goto out;
}

/*
 * Returns true when @ns has a gid mapping installed and
 * USERNS_SETGROUPS_ALLOWED is set. Both are read under userns_state_mutex.
 */
bool userns_may_setgroups(const struct user_namespace *ns)
{
	bool allowed;

	mutex_lock(&userns_state_mutex);
	/* It is not safe to use setgroups until a gid mapping in
	 * the user namespace has been established.
	 */
	allowed = ns->gid_map.nr_extents != 0;
	/* Is setgroups allowed? */
	allowed = allowed && (ns->flags & USERNS_SETGROUPS_ALLOWED);
	mutex_unlock(&userns_state_mutex);

	return allowed;
}

/*
 * Returns true if @child is the same namespace or a descendant of
 * @ancestor.
 */
bool in_userns(const struct user_namespace *ancestor,
	       const struct user_namespace *child)
{
	const struct user_namespace *ns;

	/* Walk up from @child until we are no deeper than @ancestor. */
	for (ns = child; ns->level > ancestor->level; ns = ns->parent)
		;
	return (ns == ancestor);
}

/* Returns true if the current task's user namespace is in @target_ns. */
bool current_in_userns(const struct user_namespace *target_ns)
{
	return in_userns(target_ns, current_user_ns());
}
EXPORT_SYMBOL(current_in_userns);

/* Convert an embedded ns_common back to its containing user_namespace. */
static inline struct user_namespace *to_user_ns(struct ns_common *ns)
{
	return container_of(ns, struct user_namespace, ns);
}

/*
 * proc_ns_operations ->get: take a reference on @task's user namespace
 * (looked up through its credentials under rcu_read_lock()) and return its
 * ns_common, or NULL if get_user_ns() returned NULL.
 */
static struct ns_common *userns_get(struct task_struct *task)
{
	struct user_namespace *user_ns;

	rcu_read_lock();
	user_ns = get_user_ns(__task_cred(task)->user_ns);
	rcu_read_unlock();

	return user_ns ? &user_ns->ns : NULL;
}

/* proc_ns_operations ->put: drop the reference taken by userns_get(). */
static void userns_put(struct ns_common *ns)
{
	put_user_ns(to_user_ns(ns));
}

/*
 * proc_ns_operations ->install: switch the credentials prepared in @nsset
 * over to the user namespace behind @ns.
 *
 * Return: 0 on success, -EINVAL or -EPERM if one of the checks below fails.
 */
static int userns_install(struct nsset *nsset, struct ns_common *ns)
{
	struct user_namespace *user_ns = to_user_ns(ns);
	struct cred *cred;

	/* Don't allow gaining capabilities by reentering
	 * the same user namespace.
	 */
	if (user_ns == current_user_ns())
		return -EINVAL;

	/* Tasks that share a thread group must share a user namespace */
	if (!thread_group_empty(current))
		return -EINVAL;

	if (current->fs->users != 1)
		return -EINVAL;

	if (!ns_capable(user_ns, CAP_SYS_ADMIN))
		return -EPERM;

	cred = nsset_cred(nsset);
	if (!cred)
		return -EINVAL;

	/* Swap the namespace reference held by the new credentials. */
	put_user_ns(cred->user_ns);
	set_cred_user_ns(cred, get_user_ns(user_ns));

	if (set_cred_ucounts(cred) < 0)
		return -EINVAL;

	return 0;
}

/*
 * Return the ns_common of the user namespace that owns @ns, with a
 * reference taken, provided the owner is the caller's user namespace or a
 * descendant of it; otherwise return ERR_PTR(-EPERM).
 */
struct ns_common *ns_get_owner(struct ns_common *ns)
{
	struct user_namespace *my_user_ns = current_user_ns();
	struct user_namespace *owner, *p;

	/* See if the owner is in the current user namespace */
	owner = p = ns->ops->owner(ns);
	for (;;) {
		if (!p)
			return ERR_PTR(-EPERM);
		if (p == my_user_ns)
			break;
		p = p->parent;
	}

	return &get_user_ns(owner)->ns;
}

/* proc_ns_operations ->owner: a user namespace is owned by its parent. */
static struct user_namespace *userns_owner(struct ns_common *ns)
{
	return to_user_ns(ns)->parent;
}

/* Namespace operations table for the "user" namespace type. */
const struct proc_ns_operations userns_operations = {
	.name		= "user",
	.type		= CLONE_NEWUSER,
	.get		= userns_get,
	.put		= userns_put,
	.install	= userns_install,
	.owner		= userns_owner,
	.get_parent	= ns_get_owner,
};

/* Create the slab cache used for struct user_namespace allocations. */
static __init int user_namespaces_init(void)
{
	user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC | SLAB_ACCOUNT);
	return 0;
}
subsys_initcall(user_namespaces_init);
|
41 10 31 50 50 39 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 | // SPDX-License-Identifier: GPL-2.0 /* XDP user-space ring structure * Copyright(c) 2018 Intel Corporation. */ #include <linux/log2.h> #include <linux/slab.h> #include <linux/overflow.h> #include <linux/vmalloc.h> #include <net/xdp_sock_drv.h> #include "xsk_queue.h" static size_t xskq_get_ring_size(struct xsk_queue *q, bool umem_queue) { struct xdp_umem_ring *umem_ring; struct xdp_rxtx_ring *rxtx_ring; if (umem_queue) return struct_size(umem_ring, desc, q->nentries); return struct_size(rxtx_ring, desc, q->nentries); } struct xsk_queue *xskq_create(u32 nentries, bool umem_queue) { struct xsk_queue *q; size_t size; q = kzalloc(sizeof(*q), GFP_KERNEL); if (!q) return NULL; q->nentries = nentries; q->ring_mask = nentries - 1; size = xskq_get_ring_size(q, umem_queue); /* size which is overflowing or close to SIZE_MAX will become 0 in * PAGE_ALIGN(), checking SIZE_MAX is enough due to the previous * is_power_of_2(), the rest will be handled by vmalloc_user() */ if (unlikely(size == SIZE_MAX)) { kfree(q); return NULL; } size = PAGE_ALIGN(size); q->ring = vmalloc_user(size); if (!q->ring) { kfree(q); return NULL; } q->ring_vmalloc_size = size; return q; } void xskq_destroy(struct xsk_queue *q) { if (!q) return; vfree(q->ring); kfree(q); } |
35 35 35 35 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 | // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB /* * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. */ #include <linux/vmalloc.h> #include "rxe.h" #include "rxe_loc.h" #include "rxe_queue.h" int rxe_cq_chk_attr(struct rxe_dev *rxe, struct rxe_cq *cq, int cqe, int comp_vector) { int count; if (cqe <= 0) { rxe_dbg_dev(rxe, "cqe(%d) <= 0\n", cqe); goto err1; } if (cqe > rxe->attr.max_cqe) { rxe_dbg_dev(rxe, "cqe(%d) > max_cqe(%d)\n", cqe, rxe->attr.max_cqe); goto err1; } if (cq) { count = queue_count(cq->queue, QUEUE_TYPE_TO_CLIENT); if (cqe < count) { rxe_dbg_cq(cq, "cqe(%d) < current # elements in queue (%d)\n", cqe, count); goto err1; } } return 0; err1: return -EINVAL; } int rxe_cq_from_init(struct rxe_dev *rxe, struct rxe_cq *cq, int cqe, int comp_vector, struct ib_udata *udata, struct rxe_create_cq_resp __user *uresp) { int err; enum queue_type type; type = QUEUE_TYPE_TO_CLIENT; cq->queue = rxe_queue_init(rxe, &cqe, sizeof(struct rxe_cqe), type); if (!cq->queue) { rxe_dbg_dev(rxe, "unable to create cq\n"); return -ENOMEM; } err = do_mmap_info(rxe, uresp ? 
&uresp->mi : NULL, udata, cq->queue->buf, cq->queue->buf_size, &cq->queue->ip); if (err) return err; cq->is_user = uresp; spin_lock_init(&cq->cq_lock); cq->ibcq.cqe = cqe; return 0; } int rxe_cq_resize_queue(struct rxe_cq *cq, int cqe, struct rxe_resize_cq_resp __user *uresp, struct ib_udata *udata) { int err; err = rxe_queue_resize(cq->queue, (unsigned int *)&cqe, sizeof(struct rxe_cqe), udata, uresp ? &uresp->mi : NULL, NULL, &cq->cq_lock); if (!err) cq->ibcq.cqe = cqe; return err; } /* caller holds reference to cq */ int rxe_cq_post(struct rxe_cq *cq, struct rxe_cqe *cqe, int solicited) { struct ib_event ev; int full; void *addr; unsigned long flags; spin_lock_irqsave(&cq->cq_lock, flags); full = queue_full(cq->queue, QUEUE_TYPE_TO_CLIENT); if (unlikely(full)) { rxe_err_cq(cq, "queue full\n"); spin_unlock_irqrestore(&cq->cq_lock, flags); if (cq->ibcq.event_handler) { ev.device = cq->ibcq.device; ev.element.cq = &cq->ibcq; ev.event = IB_EVENT_CQ_ERR; cq->ibcq.event_handler(&ev, cq->ibcq.cq_context); } return -EBUSY; } addr = queue_producer_addr(cq->queue, QUEUE_TYPE_TO_CLIENT); memcpy(addr, cqe, sizeof(*cqe)); queue_advance_producer(cq->queue, QUEUE_TYPE_TO_CLIENT); if ((cq->notify & IB_CQ_NEXT_COMP) || (cq->notify & IB_CQ_SOLICITED && solicited)) { cq->notify = 0; cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context); } spin_unlock_irqrestore(&cq->cq_lock, flags); return 0; } void rxe_cq_cleanup(struct rxe_pool_elem *elem) { struct rxe_cq *cq = container_of(elem, typeof(*cq), elem); if (cq->queue) rxe_queue_cleanup(cq->queue); } |
6 42 9 4 52 20 32 52 4 4 1 77 78 78 38 38 19 11 1 2 5 8 26 7 31 9 6 3 6 2 7 4 2 4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 
506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 | // SPDX-License-Identifier: GPL-2.0-only /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. 
INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Implementation of the Transmission Control Protocol(TCP). * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Mark Evans, <evansmp@uhura.aston.ac.uk> * Corey Minyard <wf-rch!minyard@relay.EU.net> * Florian La Roche, <flla@stud.uni-sb.de> * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> * Linus Torvalds, <torvalds@cs.helsinki.fi> * Alan Cox, <gw4pts@gw4pts.ampr.org> * Matthew Dillon, <dillon@apollo.west.oic.com> * Arnt Gulbrandsen, <agulbra@nvg.unit.no> * Jorge Cwik, <jorge@laser.satlink.net> */ #include <net/tcp.h> #include <net/xfrm.h> #include <net/busy_poll.h> #include <net/rstreason.h> static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) { if (seq == s_win) return true; if (after(end_seq, s_win) && before(seq, e_win)) return true; return seq == e_win && seq == end_seq; } static enum tcp_tw_status tcp_timewait_check_oow_rate_limit(struct inet_timewait_sock *tw, const struct sk_buff *skb, int mib_idx) { struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); if (!tcp_oow_rate_limited(twsk_net(tw), skb, mib_idx, &tcptw->tw_last_oow_ack_time)) { /* Send ACK. Note, we do not put the bucket, * it will be released by caller. */ return TCP_TW_ACK_OOW; } /* We are rate-limiting, so just release the tw sock and drop skb. */ inet_twsk_put(tw); return TCP_TW_SUCCESS; } static void twsk_rcv_nxt_update(struct tcp_timewait_sock *tcptw, u32 seq, u32 rcv_nxt) { #ifdef CONFIG_TCP_AO struct tcp_ao_info *ao; ao = rcu_dereference(tcptw->ao_info); if (unlikely(ao && seq < rcv_nxt)) WRITE_ONCE(ao->rcv_sne, ao->rcv_sne + 1); #endif WRITE_ONCE(tcptw->tw_rcv_nxt, seq); } /* * * Main purpose of TIME-WAIT state is to close connection gracefully, * when one of ends sits in LAST-ACK or CLOSING retransmitting FIN * (and, probably, tail of data) and one or more our ACKs are lost. * * What is TIME-WAIT timeout? 
It is associated with maximal packet * lifetime in the internet, which results in wrong conclusion, that * it is set to catch "old duplicate segments" wandering out of their path. * It is not quite correct. This timeout is calculated so that it exceeds * maximal retransmission timeout enough to allow to lose one (or more) * segments sent by peer and our ACKs. This time may be calculated from RTO. * * When TIME-WAIT socket receives RST, it means that another end * finally closed and we are allowed to kill TIME-WAIT too. * * Second purpose of TIME-WAIT is catching old duplicate segments. * Well, certainly it is pure paranoia, but if we load TIME-WAIT * with this semantics, we MUST NOT kill TIME-WAIT state with RSTs. * * If we invented some more clever way to catch duplicates * (f.e. based on PAWS), we could truncate TIME-WAIT to several RTOs. * * The algorithm below is based on FORMAL INTERPRETATION of RFCs. * When you compare it to RFCs, please, read section SEGMENT ARRIVES * from the very beginning. * * NOTE. With recycling (and later with fin-wait-2) TW bucket * is _not_ stateless. It means, that strictly speaking we must * spinlock it. I do not want! Well, probability of misbehaviour * is ridiculously low and, seems, we could use some mb() tricks * to avoid misread sequence numbers, states etc. 
--ANK * * We don't need to initialize tmp_out.sack_ok as we don't use the results */ enum tcp_tw_status tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, const struct tcphdr *th, u32 *tw_isn) { struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); u32 rcv_nxt = READ_ONCE(tcptw->tw_rcv_nxt); struct tcp_options_received tmp_opt; bool paws_reject = false; int ts_recent_stamp; tmp_opt.saw_tstamp = 0; ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp); if (th->doff > (sizeof(*th) >> 2) && ts_recent_stamp) { tcp_parse_options(twsk_net(tw), skb, &tmp_opt, 0, NULL); if (tmp_opt.saw_tstamp) { if (tmp_opt.rcv_tsecr) tmp_opt.rcv_tsecr -= tcptw->tw_ts_offset; tmp_opt.ts_recent = READ_ONCE(tcptw->tw_ts_recent); tmp_opt.ts_recent_stamp = ts_recent_stamp; paws_reject = tcp_paws_reject(&tmp_opt, th->rst); } } if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2) { /* Just repeat all the checks of tcp_rcv_state_process() */ /* Out of window, send ACK */ if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, rcv_nxt, rcv_nxt + tcptw->tw_rcv_wnd)) return tcp_timewait_check_oow_rate_limit( tw, skb, LINUX_MIB_TCPACKSKIPPEDFINWAIT2); if (th->rst) goto kill; if (th->syn && !before(TCP_SKB_CB(skb)->seq, rcv_nxt)) return TCP_TW_RST; /* Dup ACK? */ if (!th->ack || !after(TCP_SKB_CB(skb)->end_seq, rcv_nxt) || TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) { inet_twsk_put(tw); return TCP_TW_SUCCESS; } /* New data or FIN. If new data arrive after half-duplex close, * reset. */ if (!th->fin || TCP_SKB_CB(skb)->end_seq != rcv_nxt + 1) return TCP_TW_RST; /* FIN arrived, enter true time-wait state. 
*/ WRITE_ONCE(tw->tw_substate, TCP_TIME_WAIT); twsk_rcv_nxt_update(tcptw, TCP_SKB_CB(skb)->end_seq, rcv_nxt); if (tmp_opt.saw_tstamp) { u64 ts = tcp_clock_ms(); WRITE_ONCE(tw->tw_entry_stamp, ts); WRITE_ONCE(tcptw->tw_ts_recent_stamp, div_u64(ts, MSEC_PER_SEC)); WRITE_ONCE(tcptw->tw_ts_recent, tmp_opt.rcv_tsval); } inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); return TCP_TW_ACK; } /* * Now real TIME-WAIT state. * * RFC 1122: * "When a connection is [...] on TIME-WAIT state [...] * [a TCP] MAY accept a new SYN from the remote TCP to * reopen the connection directly, if it: * * (1) assigns its initial sequence number for the new * connection to be larger than the largest sequence * number it used on the previous connection incarnation, * and * * (2) returns to TIME-WAIT state if the SYN turns out * to be an old duplicate". */ if (!paws_reject && (TCP_SKB_CB(skb)->seq == rcv_nxt && (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) { /* In window segment, it may be only reset or bare ack. */ if (th->rst) { /* This is TIME_WAIT assassination, in two flavors. * Oh well... nobody has a sufficient solution to this * protocol bug yet. */ if (!READ_ONCE(twsk_net(tw)->ipv4.sysctl_tcp_rfc1337)) { kill: inet_twsk_deschedule_put(tw); return TCP_TW_SUCCESS; } } else { inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); } if (tmp_opt.saw_tstamp) { WRITE_ONCE(tcptw->tw_ts_recent, tmp_opt.rcv_tsval); WRITE_ONCE(tcptw->tw_ts_recent_stamp, ktime_get_seconds()); } inet_twsk_put(tw); return TCP_TW_SUCCESS; } /* Out of window segment. All the segments are ACKed immediately. The only exception is new SYN. We accept it, if it is not old duplicate and we are not in danger to be killed by delayed old duplicates. RFC check is that it has newer sequence number works at rates <40Mbit/sec. However, if paws works, it is reliable AND even more, we even may relax silly seq space cutoff. RED-PEN: we violate main RFC requirement, if this SYN will appear old duplicate (i.e. 
we receive RST in reply to SYN-ACK), we must return socket to time-wait state. It is not good, but not fatal yet. */ if (th->syn && !th->rst && !th->ack && !paws_reject && (after(TCP_SKB_CB(skb)->seq, rcv_nxt) || (tmp_opt.saw_tstamp && (s32)(READ_ONCE(tcptw->tw_ts_recent) - tmp_opt.rcv_tsval) < 0))) { u32 isn = tcptw->tw_snd_nxt + 65535 + 2; if (isn == 0) isn++; *tw_isn = isn; return TCP_TW_SYN; } if (paws_reject) __NET_INC_STATS(twsk_net(tw), LINUX_MIB_PAWSESTABREJECTED); if (!th->rst) { /* In this case we must reset the TIMEWAIT timer. * * If it is ACKless SYN it may be both old duplicate * and new good SYN with random sequence number <rcv_nxt. * Do not reschedule in the last case. */ if (paws_reject || th->ack) inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); return tcp_timewait_check_oow_rate_limit( tw, skb, LINUX_MIB_TCPACKSKIPPEDTIMEWAIT); } inet_twsk_put(tw); return TCP_TW_SUCCESS; } EXPORT_IPV6_MOD(tcp_timewait_state_process); static void tcp_time_wait_init(struct sock *sk, struct tcp_timewait_sock *tcptw) { #ifdef CONFIG_TCP_MD5SIG const struct tcp_sock *tp = tcp_sk(sk); struct tcp_md5sig_key *key; /* * The timewait bucket does not have the key DB from the * sock structure. We just make a quick copy of the * md5 key being used (if indeed we are using one) * so the timewait ack generating code has the key. */ tcptw->tw_md5_key = NULL; if (!static_branch_unlikely(&tcp_md5_needed.key)) return; key = tp->af_specific->md5_lookup(sk, sk); if (key) { tcptw->tw_md5_key = kmemdup(key, sizeof(*key), GFP_ATOMIC); if (!tcptw->tw_md5_key) return; if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) goto out_free; tcp_md5_add_sigpool(); } return; out_free: WARN_ON_ONCE(1); kfree(tcptw->tw_md5_key); tcptw->tw_md5_key = NULL; #endif } /* * Move a socket to time-wait or dead fin-wait-2 state. 
*/ void tcp_time_wait(struct sock *sk, int state, int timeo) { const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); struct inet_timewait_sock *tw; tw = inet_twsk_alloc(sk, &net->ipv4.tcp_death_row, state); if (tw) { struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1); tw->tw_transparent = inet_test_bit(TRANSPARENT, sk); tw->tw_mark = sk->sk_mark; tw->tw_priority = READ_ONCE(sk->sk_priority); tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale; /* refreshed when we enter true TIME-WAIT state */ tw->tw_entry_stamp = tcp_time_stamp_ms(tp); tcptw->tw_rcv_nxt = tp->rcv_nxt; tcptw->tw_snd_nxt = tp->snd_nxt; tcptw->tw_rcv_wnd = tcp_receive_window(tp); tcptw->tw_ts_recent = tp->rx_opt.ts_recent; tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp; tcptw->tw_ts_offset = tp->tsoffset; tw->tw_usec_ts = tp->tcp_usec_ts; tcptw->tw_last_oow_ack_time = 0; tcptw->tw_tx_delay = tp->tcp_tx_delay; tw->tw_txhash = sk->sk_txhash; tw->tw_tx_queue_mapping = sk->sk_tx_queue_mapping; #ifdef CONFIG_SOCK_RX_QUEUE_MAPPING tw->tw_rx_queue_mapping = sk->sk_rx_queue_mapping; #endif #if IS_ENABLED(CONFIG_IPV6) if (tw->tw_family == PF_INET6) { struct ipv6_pinfo *np = inet6_sk(sk); tw->tw_v6_daddr = sk->sk_v6_daddr; tw->tw_v6_rcv_saddr = sk->sk_v6_rcv_saddr; tw->tw_tclass = np->tclass; tw->tw_flowlabel = be32_to_cpu(np->flow_label & IPV6_FLOWLABEL_MASK); tw->tw_ipv6only = sk->sk_ipv6only; } #endif tcp_time_wait_init(sk, tcptw); tcp_ao_time_wait(tcptw, tp); /* Get the TIME_WAIT timeout firing. */ if (timeo < rto) timeo = rto; if (state == TCP_TIME_WAIT) timeo = TCP_TIMEWAIT_LEN; /* Linkage updates. * Note that access to tw after this point is illegal. */ inet_twsk_hashdance_schedule(tw, sk, net->ipv4.tcp_death_row.hashinfo, timeo); } else { /* Sorry, if we're out of memory, just CLOSE this * socket up. We've got bigger problems than * non-graceful socket closings. 
*/ NET_INC_STATS(net, LINUX_MIB_TCPTIMEWAITOVERFLOW); } tcp_update_metrics(sk); tcp_done(sk); } EXPORT_SYMBOL(tcp_time_wait); #ifdef CONFIG_TCP_MD5SIG static void tcp_md5_twsk_free_rcu(struct rcu_head *head) { struct tcp_md5sig_key *key; key = container_of(head, struct tcp_md5sig_key, rcu); kfree(key); static_branch_slow_dec_deferred(&tcp_md5_needed); tcp_md5_release_sigpool(); } #endif void tcp_twsk_destructor(struct sock *sk) { #ifdef CONFIG_TCP_MD5SIG if (static_branch_unlikely(&tcp_md5_needed.key)) { struct tcp_timewait_sock *twsk = tcp_twsk(sk); if (twsk->tw_md5_key) call_rcu(&twsk->tw_md5_key->rcu, tcp_md5_twsk_free_rcu); } #endif tcp_ao_destroy_sock(sk, true); } EXPORT_IPV6_MOD_GPL(tcp_twsk_destructor); void tcp_twsk_purge(struct list_head *net_exit_list) { bool purged_once = false; struct net *net; list_for_each_entry(net, net_exit_list, exit_list) { if (net->ipv4.tcp_death_row.hashinfo->pernet) { /* Even if tw_refcount == 1, we must clean up kernel reqsk */ inet_twsk_purge(net->ipv4.tcp_death_row.hashinfo); } else if (!purged_once) { inet_twsk_purge(&tcp_hashinfo); purged_once = true; } } } /* Warning : This function is called without sk_listener being locked. * Be sure to read socket fields once, as their value could change under us. */ void tcp_openreq_init_rwin(struct request_sock *req, const struct sock *sk_listener, const struct dst_entry *dst) { struct inet_request_sock *ireq = inet_rsk(req); const struct tcp_sock *tp = tcp_sk(sk_listener); int full_space = tcp_full_space(sk_listener); u32 window_clamp; __u8 rcv_wscale; u32 rcv_wnd; int mss; mss = tcp_mss_clamp(tp, dst_metric_advmss(dst)); window_clamp = READ_ONCE(tp->window_clamp); /* Set this up on the first call only */ req->rsk_window_clamp = window_clamp ? 
: dst_metric(dst, RTAX_WINDOW); /* limit the window selection if the user enforce a smaller rx buffer */ if (sk_listener->sk_userlocks & SOCK_RCVBUF_LOCK && (req->rsk_window_clamp > full_space || req->rsk_window_clamp == 0)) req->rsk_window_clamp = full_space; rcv_wnd = tcp_rwnd_init_bpf((struct sock *)req); if (rcv_wnd == 0) rcv_wnd = dst_metric(dst, RTAX_INITRWND); else if (full_space < rcv_wnd * mss) full_space = rcv_wnd * mss; /* tcp_full_space because it is guaranteed to be the first packet */ tcp_select_initial_window(sk_listener, full_space, mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), &req->rsk_rcv_wnd, &req->rsk_window_clamp, ireq->wscale_ok, &rcv_wscale, rcv_wnd); ireq->rcv_wscale = rcv_wscale; } static void tcp_ecn_openreq_child(struct tcp_sock *tp, const struct request_sock *req) { tcp_ecn_mode_set(tp, inet_rsk(req)->ecn_ok ? TCP_ECN_MODE_RFC3168 : TCP_ECN_DISABLED); } void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) { struct inet_connection_sock *icsk = inet_csk(sk); u32 ca_key = dst_metric(dst, RTAX_CC_ALGO); bool ca_got_dst = false; if (ca_key != TCP_CA_UNSPEC) { const struct tcp_congestion_ops *ca; rcu_read_lock(); ca = tcp_ca_find_key(ca_key); if (likely(ca && bpf_try_module_get(ca, ca->owner))) { icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst); icsk->icsk_ca_ops = ca; ca_got_dst = true; } rcu_read_unlock(); } /* If no valid choice made yet, assign current system default ca. 
*/
	if (!ca_got_dst &&
	    (!icsk->icsk_ca_setsockopt ||
	     !bpf_try_module_get(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner)))
		tcp_assign_congestion_control(sk);

	tcp_set_ca_state(sk, TCP_CA_Open);
}
EXPORT_IPV6_MOD_GPL(tcp_ca_openreq_child);

/* With CONFIG_SMC enabled and the tcp_have_smc static key on, clear
 * @newtp->syn_smc when @oldtp has syn_smc set but the request does not have
 * smc_ok set. Compiles to an empty function without CONFIG_SMC.
 */
static void smc_check_reset_syn_req(const struct tcp_sock *oldtp,
				    struct request_sock *req,
				    struct tcp_sock *newtp)
{
#if IS_ENABLED(CONFIG_SMC)
	struct inet_request_sock *ireq;

	if (static_branch_unlikely(&tcp_have_smc)) {
		ireq = inet_rsk(req);
		if (oldtp->syn_smc && !ireq->smc_ok)
			newtp->syn_smc = 0;
	}
#endif
}

/* This is not only more efficient than what we used to do, it eliminates
 * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
 *
 * Actually, we could save lots of memory writes here. tp of listening
 * socket contains all necessary default parameters.
 *
 * Returns the new socket, or NULL when inet_csk_clone_lock() fails.
 */
struct sock *tcp_create_openreq_child(const struct sock *sk,
				      struct request_sock *req,
				      struct sk_buff *skb)
{
	struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC);
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct tcp_request_sock *treq = tcp_rsk(req);
	struct inet_connection_sock *newicsk;
	const struct tcp_sock *oldtp;
	struct tcp_sock *newtp;
	u32 seq;

	if (!newsk)
		return NULL;

	newicsk = inet_csk(newsk);
	newtp = tcp_sk(newsk);
	oldtp = tcp_sk(sk);

	smc_check_reset_syn_req(oldtp, req, newtp);

	/* Now setup tcp_sock */
	newtp->pred_flags = 0;

	/* Receive side: every sequence variable starts at rcv_isn + 1. */
	seq = treq->rcv_isn + 1;
	newtp->rcv_wup = seq;
	WRITE_ONCE(newtp->copied_seq, seq);
	WRITE_ONCE(newtp->rcv_nxt, seq);
	newtp->segs_in = 1;

	/* Send side: every sequence variable starts at snt_isn + 1. */
	seq = treq->snt_isn + 1;
	newtp->snd_sml = newtp->snd_una = seq;
	WRITE_ONCE(newtp->snd_nxt, seq);
	newtp->snd_up = seq;

	INIT_LIST_HEAD(&newtp->tsq_node);
	INIT_LIST_HEAD(&newtp->tsorted_sent_queue);

	tcp_init_wl(newtp, treq->rcv_isn);

	minmax_reset(&newtp->rtt_min, tcp_jiffies32, ~0U);
	newicsk->icsk_ack.lrcvtime = tcp_jiffies32;

	newtp->lsndtime = tcp_jiffies32;
	newsk->sk_txhash = READ_ONCE(treq->txhash);
	newtp->total_retrans = req->num_retrans;

	tcp_init_xmit_timers(newsk);
WRITE_ONCE(newtp->write_seq, newtp->pushed_seq = treq->snt_isn + 1);

	if (sock_flag(newsk, SOCK_KEEPOPEN))
		tcp_reset_keepalive_timer(newsk,
					  keepalive_time_when(newtp));

	/* Copy the options and window values recorded in the request. */
	newtp->rx_opt.tstamp_ok = ireq->tstamp_ok;
	newtp->rx_opt.sack_ok = ireq->sack_ok;
	newtp->window_clamp = req->rsk_window_clamp;
	newtp->rcv_ssthresh = req->rsk_rcv_wnd;
	newtp->rcv_wnd = req->rsk_rcv_wnd;
	newtp->rx_opt.wscale_ok = ireq->wscale_ok;
	if (newtp->rx_opt.wscale_ok) {
		newtp->rx_opt.snd_wscale = ireq->snd_wscale;
		newtp->rx_opt.rcv_wscale = ireq->rcv_wscale;
	} else {
		/* No window scaling: zero both shifts and cap the clamp at
		 * 65535.
		 */
		newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0;
		newtp->window_clamp = min(newtp->window_clamp, 65535U);
	}
	/* The window field of this segment, scaled by snd_wscale. */
	newtp->snd_wnd = ntohs(tcp_hdr(skb)->window) << newtp->rx_opt.snd_wscale;
	newtp->max_window = newtp->snd_wnd;

	if (newtp->rx_opt.tstamp_ok) {
		newtp->tcp_usec_ts = treq->req_usec_ts;
		newtp->rx_opt.ts_recent = req->ts_recent;
		newtp->rx_opt.ts_recent_stamp = ktime_get_seconds();
		newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
	} else {
		newtp->tcp_usec_ts = 0;
		newtp->rx_opt.ts_recent_stamp = 0;
		newtp->tcp_header_len = sizeof(struct tcphdr);
	}
	/* The request timed out at least once: carry the timeout count and
	 * the SYN-ACK send time over into the child's RTO accounting.
	 */
	if (req->num_timeout) {
		newtp->total_rto = req->num_timeout;
		newtp->undo_marker = treq->snt_isn;
		if (newtp->tcp_usec_ts) {
			newtp->retrans_stamp = treq->snt_synack;
			newtp->total_rto_time = (u32)(tcp_clock_us() -
						      newtp->retrans_stamp) / USEC_PER_MSEC;
		} else {
			/* Scale snt_synack by USEC_PER_SEC / TCP_TS_HZ before
			 * comparing it with tcp_clock_ms().
			 */
			newtp->retrans_stamp = div_u64(treq->snt_synack,
						       USEC_PER_SEC / TCP_TS_HZ);
			newtp->total_rto_time = tcp_clock_ms() -
						newtp->retrans_stamp;
		}
		newtp->total_rto_recoveries = 1;
	}
	newtp->tsoffset = treq->ts_off;
#ifdef CONFIG_TCP_MD5SIG
	newtp->md5sig_info = NULL;	/*XXX*/
#endif
#ifdef CONFIG_TCP_AO
	newtp->ao_info = NULL;

	if (tcp_rsk_used_ao(req)) {
		struct tcp_ao_key *ao_key;

		/* A matching AO key enlarges the fixed header length. */
		ao_key = treq->af_specific->ao_lookup(sk, req, tcp_rsk(req)->ao_keyid, -1);
		if (ao_key)
			newtp->tcp_header_len += tcp_ao_len_aligned(ao_key);
	}
 #endif
	if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len)
		newicsk->icsk_ack.last_seg_size =
skb->len - newtp->tcp_header_len;
	newtp->rx_opt.mss_clamp = req->mss;
	tcp_ecn_openreq_child(newtp, req);
	newtp->fastopen_req = NULL;
	RCU_INIT_POINTER(newtp->fastopen_rsk, NULL);

	newtp->bpf_chg_cc_inprogress = 0;
	tcp_bpf_clone(sk, newsk);

	__TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS);

	xa_init_flags(&newsk->sk_user_frags, XA_FLAGS_ALLOC1);

	return newsk;
}
EXPORT_SYMBOL(tcp_create_openreq_child);

/*
 * Process an incoming packet for SYN_RECV sockets represented as a
 * request_sock. Normally sk is the listener socket but for TFO it
 * points to the child socket.
 *
 * XXX (TFO) - The current impl contains a special check for ack
 * validation and inside tcp_v4_reqsk_send_ack(). Can we do better?
 *
 * We don't need to initialize tmp_opt.sack_ok as we don't use the results.
 *
 * Note: If @fastopen is true, this can be called from process context.
 *    Otherwise, this is from BH context.
 *
 * Return: NULL on every drop, retransmit and reset path; @sk for a
 * non-fastopen segment whose ACK does not equal snt_isn + 1 and for an
 * accepted fastopen ACK; otherwise the child socket, either directly or as
 * returned by inet_csk_complete_hashdance().
 */

struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
			   struct request_sock *req,
			   bool fastopen, bool *req_stolen,
			   enum skb_drop_reason *drop_reason)
{
	struct tcp_options_received tmp_opt;
	struct sock *child;
	const struct tcphdr *th = tcp_hdr(skb);
	/* Only the RST, SYN and ACK flags matter to the checks below. */
	__be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
	bool tsecr_reject = false;
	bool paws_reject = false;
	bool own_req;

	tmp_opt.saw_tstamp = 0;
	/* doff counts 32-bit words: larger than a bare header means options. */
	if (th->doff > (sizeof(struct tcphdr)>>2)) {
		tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0, NULL);

		if (tmp_opt.saw_tstamp) {
			tmp_opt.ts_recent = req->ts_recent;
			if (tmp_opt.rcv_tsecr) {
				/* The echoed timestamp must lie between the
				 * first and last TSval recorded in the request.
				 */
				if (inet_rsk(req)->tstamp_ok && !fastopen)
					tsecr_reject = !between(tmp_opt.rcv_tsecr,
							tcp_rsk(req)->snt_tsval_first,
							READ_ONCE(tcp_rsk(req)->snt_tsval_last));
				tmp_opt.rcv_tsecr -= tcp_rsk(req)->ts_off;
			}
			/* We do not store true stamp, but it is not required,
			 * it can be estimated (approximately)
			 * from other data.
*/
			tmp_opt.ts_recent_stamp = ktime_get_seconds() - reqsk_timeout(req, TCP_RTO_MAX) / HZ;
			paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
		}
	}

	/* Check for pure retransmitted SYN. */
	if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn &&
	    flg == TCP_FLAG_SYN &&
	    !paws_reject) {
		/*
		 * RFC793 draws (Incorrectly! It was fixed in RFC1122)
		 * this case on figure 6 and figure 8, but formal
		 * protocol description says NOTHING.
		 * To be more exact, it says that we should send ACK,
		 * because this segment (at least, if it has no data)
		 * is out of window.
		 *
		 *  CONCLUSION: RFC793 (even with RFC1122) DOES NOT
		 *  describe SYN-RECV state. All the description
		 *  is wrong, we cannot trust it and should
		 *  rely only on common sense and implementation
		 *  experience.
		 *
		 * Enforce "SYN-ACK" according to figure 8, figure 6
		 * of RFC793, fixed by RFC1122.
		 *
		 * Note that even if there is new data in the SYN packet
		 * it will be thrown away too.
		 *
		 * Reset timer after retransmitting SYNACK, similar to
		 * the idea of fast retransmit in recovery.
		 */
		if (!tcp_oow_rate_limited(sock_net(sk), skb,
					  LINUX_MIB_TCPACKSKIPPEDSYNRECV,
					  &tcp_rsk(req)->last_oow_ack_time) &&

		    !inet_rtx_syn_ack(sk, req)) {
			unsigned long expires = jiffies;

			expires += reqsk_timeout(req, TCP_RTO_MAX);
			/* For fastopen only the expiry field is updated;
			 * otherwise the pending timer is re-armed.
			 */
			if (!fastopen)
				mod_timer_pending(&req->rsk_timer, expires);
			else
				req->rsk_timer.expires = expires;
		}
		return NULL;
	}

	/* Further reproduces section "SEGMENT ARRIVES"
	   for state SYN-RECEIVED of RFC793.
	   It is broken; however, it fails only
	   when SYNs are crossed.

	   You would think that SYN crossing is impossible here, since
	   we should have a SYN_SENT socket (from connect()) on our end,
	   but this is not true if the crossed SYNs were sent to both
	   ends by a malicious third party.  We must defend against this,
	   and to do that we first verify the ACK (as per RFC793, page
	   36) and reset if it is invalid.  Is this a true full defense?
	   To convince ourselves, let us consider a way in which the ACK
	   test can still pass in this 'malicious crossed SYNs' case.
	   Malicious sender sends identical SYNs (and thus identical sequence
	   numbers) to both A and B:

		A: gets SYN, seq=7
		B: gets SYN, seq=7

	   By our good fortune, both A and B select the same initial
	   send sequence number of seven :-)

		A: sends SYN|ACK, seq=7, ack_seq=8
		B: sends SYN|ACK, seq=7, ack_seq=8

	   So we are now A eating this SYN|ACK, ACK test passes.  So
	   does sequence test, SYN is truncated, and thus we consider
	   it a bare ACK.

	   If icsk->icsk_accept_queue.rskq_defer_accept, we silently drop this
	   bare ACK.  Otherwise, we create an established connection.  Both
	   ends (listening sockets) accept the new incoming connection and try
	   to talk to each other. 8-)

	   Note: This case is both harmless, and rare.  Possibility is about the
	   same as us discovering intelligent life on another planet tomorrow.

	   But generally, we should (RFC lies!) accept ACK
	   from SYNACK both here and in tcp_rcv_state_process().
	   tcp_rcv_state_process() does not, hence, we do not either.

	   Note that the case is absolutely generic:
	   we cannot optimize anything here without
	   violating protocol. All the checks must be made
	   before attempting to create the socket.
	 */

	/* RFC793 page 36: "If the connection is in any non-synchronized state ...
	 *                  and the incoming segment acknowledges something not yet
	 *                  sent (the segment carries an unacceptable ACK) ...
	 *                  a reset is sent."
	 *
	 * Invalid ACK: reset will be sent by listening socket.
	 * Note that the ACK validity check for a Fast Open socket is done
	 * elsewhere and is checked directly against the child socket rather
	 * than req because user data may have been sent out.
	 */
	if ((flg & TCP_FLAG_ACK) && !fastopen &&
	    (TCP_SKB_CB(skb)->ack_seq !=
	     tcp_rsk(req)->snt_isn + 1))
		return sk;

	/* RFC793: "first check sequence number". */

	if (paws_reject || tsecr_reject ||
	    !tcp_in_window(TCP_SKB_CB(skb)->seq,
			   TCP_SKB_CB(skb)->end_seq,
			   tcp_rsk(req)->rcv_nxt,
			   tcp_rsk(req)->rcv_nxt +
			   tcp_synack_window(req))) {
		/* Out of window: send ACK and drop.
*/
		if (!(flg & TCP_FLAG_RST) &&
		    !tcp_oow_rate_limited(sock_net(sk), skb,
					  LINUX_MIB_TCPACKSKIPPEDSYNRECV,
					  &tcp_rsk(req)->last_oow_ack_time))
			req->rsk_ops->send_ack(sk, skb, req);
		/* Record one drop reason; PAWS is reported ahead of TSecr,
		 * and a plain window failure is the fallback.
		 */
		if (paws_reject) {
			SKB_DR_SET(*drop_reason, TCP_RFC7323_PAWS);
			NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
		} else if (tsecr_reject) {
			SKB_DR_SET(*drop_reason, TCP_RFC7323_TSECR);
			NET_INC_STATS(sock_net(sk), LINUX_MIB_TSECRREJECTED);
		} else {
			SKB_DR_SET(*drop_reason, TCP_OVERWINDOW);
		}
		return NULL;
	}

	/* In sequence, PAWS is OK. */

	if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) {
		/* Truncate SYN, it is out of window starting
		   at tcp_rsk(req)->rcv_isn + 1. */
		flg &= ~TCP_FLAG_SYN;
	}

	/* RFC793: "second check the RST bit" and
	 *	   "fourth, check the SYN bit"
	 */
	if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN)) {
		TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
		goto embryonic_reset;
	}

	/* ACK sequence verified above, just make sure ACK is
	 * set.  If ACK not set, just silently drop the packet.
	 *
	 * XXX (TFO) - if we ever allow "data after SYN", the
	 * following check needs to be removed.
	 */
	if (!(flg & TCP_FLAG_ACK))
		return NULL;

	/* For Fast Open no more processing is needed (sk is the
	 * child socket).
	 */
	if (fastopen)
		return sk;

	/* While TCP_DEFER_ACCEPT is active, drop bare ACK. */
	if (req->num_timeout < READ_ONCE(inet_csk(sk)->icsk_accept_queue.rskq_defer_accept) &&
	    TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
		/* Remember that the peer did ACK, even though it is dropped. */
		inet_rsk(req)->acked = 1;
		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP);
		return NULL;
	}

	/* OK, ACK is valid, create big socket and
	 * feed this segment to it. It will repeat all
	 * the tests. THIS SEGMENT MUST MOVE SOCKET TO
	 * ESTABLISHED STATE. If it is dropped after the
	 * socket is created, expect trouble.
*/
	child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL,
							 req, &own_req);
	if (!child)
		goto listen_overflow;

	/* Seed the child's ts_recent from this segment's TSval when the
	 * segment does not start after rcv_nxt.
	 */
	if (own_req && tmp_opt.saw_tstamp &&
	    !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_nxt))
		tcp_sk(child)->rx_opt.ts_recent = tmp_opt.rcv_tsval;

	if (own_req && rsk_drop_req(req)) {
		reqsk_queue_removed(&inet_csk(req->rsk_listener)->icsk_accept_queue, req);
		inet_csk_reqsk_queue_drop_and_put(req->rsk_listener, req);
		return child;
	}

	sock_rps_save_rxhash(child, skb);
	tcp_synack_rtt_meas(child, req);
	*req_stolen = !own_req;
	return inet_csk_complete_hashdance(sk, child, req, own_req);

listen_overflow:
	SKB_DR_SET(*drop_reason, TCP_LISTEN_OVERFLOW);
	if (sk != req->rsk_listener)
		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE);

	/* Without tcp_abort_on_overflow the segment is dropped quietly;
	 * with it, fall through and reset.
	 */
	if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_abort_on_overflow)) {
		inet_rsk(req)->acked = 1;
		return NULL;
	}

embryonic_reset:
	if (!(flg & TCP_FLAG_RST)) {
		/* Received a bad SYN pkt - for TFO we try not to reset
		 * the local connection unless it's really necessary to
		 * avoid becoming vulnerable to outside attack aiming at
		 * resetting legit local connections.
		 */
		req->rsk_ops->send_reset(sk, skb, SK_RST_REASON_INVALID_SYN);
	} else if (fastopen) { /* received a valid RST pkt */
		reqsk_fastopen_remove(sk, req, true);
		tcp_reset(sk, skb);
	}
	if (!fastopen) {
		bool unlinked = inet_csk_reqsk_queue_drop(sk, req);

		if (unlinked)
			__NET_INC_STATS(sock_net(sk), LINUX_MIB_EMBRYONICRSTS);
		/* If this call did not unlink req, report it as stolen. */
		*req_stolen = !unlinked;
	}
	return NULL;
}
EXPORT_IPV6_MOD(tcp_check_req);

/*
 * Queue segment on the new socket if the new socket is active,
 * otherwise we just shortcircuit this and continue with
 * the new socket.
 *
 * For the vast majority of cases child->sk_state will be TCP_SYN_RECV
 * when entering. But other states are possible due to a race condition
 * where after __inet_lookup_established() fails but before the listener
 * lock is obtained, other packets cause the same connection to
 * be created.
*/

enum skb_drop_reason tcp_child_process(struct sock *parent, struct sock *child,
				       struct sk_buff *skb)
	__releases(&((child)->sk_lock.slock))
{
	enum skb_drop_reason reason = SKB_NOT_DROPPED_YET;
	/* Snapshot the state so a transition out of SYN_RECV can be seen. */
	int state = child->sk_state;

	/* record sk_napi_id and sk_rx_queue_mapping of child. */
	sk_mark_napi_id_set(child, skb);

	tcp_segs_in(tcp_sk(child), skb);
	if (!sock_owned_by_user(child)) {
		reason = tcp_rcv_state_process(child, skb);
		/* Wakeup parent, send SIGIO */
		if (state == TCP_SYN_RECV && child->sk_state != state)
			parent->sk_data_ready(parent);
	} else {
		/* Alas, it is possible again, because we do lookup
		 * in main socket hash table and lock on listening
		 * socket no longer protects us.
		 */
		__sk_add_backlog(child, skb);
	}

	/* Unlock the child and drop one reference on it before returning. */
	bh_unlock_sock(child);
	sock_put(child);
	return reason;
}
EXPORT_IPV6_MOD(tcp_child_process);
|
14315 421 990 249 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 
523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 | /* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_JIFFIES_H
#define _LINUX_JIFFIES_H

#include <linux/cache.h>
#include <linux/limits.h>
#include <linux/math64.h>
#include <linux/minmax.h>
#include <linux/types.h>
#include <linux/time.h>
#include <linux/timex.h>
#include <vdso/jiffies.h>
#include <asm/param.h>			/* for HZ */
#include <generated/timeconst.h>

/*
 * The following defines establish the engineering parameters of the PLL
 * model. The HZ variable establishes the timer interrupt frequency, 100 Hz
 * for the SunOS kernel, 256 Hz for the Ultrix kernel and 1024 Hz for the
 * OSF/1 kernel. The SHIFT_HZ define expresses the same value as the
 * nearest power of two in order to avoid hardware multiply operations.
 *
 * Each SHIFT_HZ value n below covers 0.75 * 2^n <= HZ < 1.5 * 2^n.
 */
#if HZ >= 12 && HZ < 24
# define SHIFT_HZ	4
#elif HZ >= 24 && HZ < 48
# define SHIFT_HZ	5
#elif HZ >= 48 && HZ < 96
# define SHIFT_HZ	6
#elif HZ >= 96 && HZ < 192
# define SHIFT_HZ	7
#elif HZ >= 192 && HZ < 384
# define SHIFT_HZ	8
#elif HZ >= 384 && HZ < 768
# define SHIFT_HZ	9
#elif HZ >= 768 && HZ < 1536
# define SHIFT_HZ	10
#elif HZ >= 1536 && HZ < 3072
# define SHIFT_HZ	11
#elif HZ >= 3072 && HZ < 6144
# define SHIFT_HZ	12
#elif HZ >= 6144 && HZ < 12288
# define SHIFT_HZ	13
#else
/* HZ below 12, or 12288 and above, has no table entry: fail the build. */
# error Invalid value of HZ.
#endif

/* Suppose we want to divide two numbers NOM and DEN: NOM/DEN, then we can
 * improve accuracy by shifting LSH bits, hence calculating:
 *     (NOM << LSH) / DEN
 * This however means trouble for large NOM, because (NOM << LSH) may no
 * longer fit in 32 bits.
The following way of calculating this gives us
 * some slack, under the following conditions:
 *   - (NOM / DEN) fits in (32 - LSH) bits.
 *   - (NOM % DEN) fits in (32 - LSH) bits.
 */
#define SH_DIV(NOM,DEN,LSH) (   (((NOM) / (DEN)) << (LSH))              \
                             + ((((NOM) % (DEN)) << (LSH)) + (DEN) / 2) / (DEN))

/* LATCH is used in the interval timer and ftape setup. */
#define LATCH ((CLOCK_TICK_RATE + HZ/2) / HZ)	/* For divider */

extern int register_refined_jiffies(long clock_tick_rate);

/* TICK_USEC is the time between ticks in usec at HZ ticks per second,
 * rounded to the nearest microsecond.
 */
#define TICK_USEC ((USEC_PER_SEC + HZ/2) / HZ)

/* USER_TICK_USEC is the time between ticks in usec assuming fake USER_HZ */
#define USER_TICK_USEC ((1000000UL + USER_HZ/2) / USER_HZ)

/* Empty unless the architecture supplied its own definition earlier. */
#ifndef __jiffy_arch_data
#define __jiffy_arch_data
#endif

/*
 * The 64-bit value is not atomic on 32-bit systems - you MUST NOT read it
 * without sampling the sequence number in jiffies_lock.
 * get_jiffies_64() will do this for you as appropriate.
 *
 * jiffies and jiffies_64 are at the same address for little-endian systems
 * and for 64-bit big-endian systems.
 * On 32-bit big-endian systems, jiffies is the lower 32 bits of jiffies_64
 * (i.e., at address @jiffies_64 + 4).
 * See arch/ARCH/kernel/vmlinux.lds.S
 */
extern u64 __cacheline_aligned_in_smp jiffies_64;
extern unsigned long volatile __cacheline_aligned_in_smp __jiffy_arch_data jiffies;

#if (BITS_PER_LONG < 64)
u64 get_jiffies_64(void);
#else
/**
 * get_jiffies_64 - read the 64-bit non-atomic jiffies_64 value
 *
 * When BITS_PER_LONG < 64, this uses sequence number sampling using
 * jiffies_lock to protect the 64-bit read.
 *
 * When BITS_PER_LONG is 64 or more (the inline version below), jiffies is
 * simply cast to u64.
 *
 * Return: current 64-bit jiffies value
 */
static inline u64 get_jiffies_64(void)
{
	return (u64)jiffies;
}
#endif

/**
 * DOC: General information about time_* inlines
 *
 * These inlines deal with timer wrapping correctly. You are strongly encouraged
 * to use them:
 *
 * #. Because people otherwise forget
 * #.
Because if the timer wrap changes in future you won't have to alter your
 *    driver code.
 */

/**
 * time_after - returns true if the time a is after time b.
 * @a: first comparable as unsigned long
 * @b: second comparable as unsigned long
 *
 * Do this with "<0" and ">=0" to only test the sign of the result. A
 * good compiler would generate better code (and a really good compiler
 * wouldn't care). Gcc is currently neither.
 *
 * Return: %true if time a is after time b, otherwise %false.
 */
#define time_after(a,b)		\
	(typecheck(unsigned long, a) && \
	 typecheck(unsigned long, b) && \
	 ((long)((b) - (a)) < 0))
/**
 * time_before - returns true if the time a is before time b.
 * @a: first comparable as unsigned long
 * @b: second comparable as unsigned long
 *
 * Return: %true if time a is before time b, otherwise %false.
 */
#define time_before(a,b)	time_after(b,a)

/**
 * time_after_eq - returns true if the time a is after or the same as time b.
 * @a: first comparable as unsigned long
 * @b: second comparable as unsigned long
 *
 * Return: %true if time a is after or the same as time b, otherwise %false.
 */
#define time_after_eq(a,b)	\
	(typecheck(unsigned long, a) && \
	 typecheck(unsigned long, b) && \
	 ((long)((a) - (b)) >= 0))
/**
 * time_before_eq - returns true if the time a is before or the same as time b.
 * @a: first comparable as unsigned long
 * @b: second comparable as unsigned long
 *
 * Return: %true if time a is before or the same as time b, otherwise %false.
 */
#define time_before_eq(a,b)	time_after_eq(b,a)

/**
 * time_in_range - Calculate whether a is in the range of [b, c].
 * @a: time to test
 * @b: beginning of the range
 * @c: end of the range
 *
 * Return: %true if time a is in the range [b, c], otherwise %false.
 */
#define time_in_range(a,b,c) \
	(time_after_eq(a,b) && \
	 time_before_eq(a,c))

/**
 * time_in_range_open - Calculate whether a is in the range of [b, c).
 * @a: time to test
 * @b: beginning of the range
 * @c: end of the range
 *
 * Return: %true if time a is in the range [b, c), otherwise %false.
 */
#define time_in_range_open(a,b,c) \
	(time_after_eq(a,b) && \
	 time_before(a,c))

/* Same as above, but does so with platform independent 64bit types.
 * These must be used when utilizing jiffies_64 (i.e. return value of
 * get_jiffies_64()). */

/**
 * time_after64 - returns true if the time a is after time b.
 * @a: first comparable as __u64
 * @b: second comparable as __u64
 *
 * This must be used when utilizing jiffies_64 (i.e. return value of
 * get_jiffies_64()).
 *
 * Return: %true if time a is after time b, otherwise %false.
 */
#define time_after64(a,b)	\
	(typecheck(__u64, a) &&	\
	 typecheck(__u64, b) && \
	 ((__s64)((b) - (a)) < 0))
/**
 * time_before64 - returns true if the time a is before time b.
 * @a: first comparable as __u64
 * @b: second comparable as __u64
 *
 * This must be used when utilizing jiffies_64 (i.e. return value of
 * get_jiffies_64()).
 *
 * Return: %true if time a is before time b, otherwise %false.
 */
#define time_before64(a,b)	time_after64(b,a)

/**
 * time_after_eq64 - returns true if the time a is after or the same as time b.
 * @a: first comparable as __u64
 * @b: second comparable as __u64
 *
 * This must be used when utilizing jiffies_64 (i.e. return value of
 * get_jiffies_64()).
 *
 * Return: %true if time a is after or the same as time b, otherwise %false.
 */
#define time_after_eq64(a,b)	\
	(typecheck(__u64, a) && \
	 typecheck(__u64, b) && \
	 ((__s64)((a) - (b)) >= 0))
/**
 * time_before_eq64 - returns true if the time a is before or the same as time b.
 * @a: first comparable as __u64
 * @b: second comparable as __u64
 *
 * This must be used when utilizing jiffies_64 (i.e. return value of
 * get_jiffies_64()).
 *
 * Return: %true if time a is before or the same as time b, otherwise %false.
 */
#define time_before_eq64(a,b)	time_after_eq64(b,a)

/**
 * time_in_range64 - Calculate whether a is in the range of [b, c].
 * @a: time to test
 * @b: beginning of the range
 * @c: end of the range
 *
 * Return: %true if time a is in the range [b, c], otherwise %false.
 */
#define time_in_range64(a, b, c) \
	(time_after_eq64(a, b) && \
	 time_before_eq64(a, c))

/*
 * These eight macros compare jiffies[_64] and 'a' for convenience.
 */

/**
 * time_is_before_jiffies - return true if a is before jiffies
 * @a: time (unsigned long) to compare to jiffies
 *
 * Return: %true if time a is before jiffies, otherwise %false.
 */
#define time_is_before_jiffies(a) time_after(jiffies, a)
/**
 * time_is_before_jiffies64 - return true if a is before jiffies_64
 * @a: time (__u64) to compare to jiffies_64
 *
 * Return: %true if time a is before jiffies_64, otherwise %false.
 */
#define time_is_before_jiffies64(a) time_after64(get_jiffies_64(), a)

/**
 * time_is_after_jiffies - return true if a is after jiffies
 * @a: time (unsigned long) to compare to jiffies
 *
 * Return: %true if time a is after jiffies, otherwise %false.
 */
#define time_is_after_jiffies(a) time_before(jiffies, a)
/**
 * time_is_after_jiffies64 - return true if a is after jiffies_64
 * @a: time (__u64) to compare to jiffies_64
 *
 * Return: %true if time a is after jiffies_64, otherwise %false.
 */
#define time_is_after_jiffies64(a) time_before64(get_jiffies_64(), a)

/**
 * time_is_before_eq_jiffies - return true if a is before or equal to jiffies
 * @a: time (unsigned long) to compare to jiffies
 *
 * Return: %true if time a is before or the same as jiffies, otherwise %false.
 */
#define time_is_before_eq_jiffies(a) time_after_eq(jiffies, a)
/**
 * time_is_before_eq_jiffies64 - return true if a is before or equal to jiffies_64
 * @a: time (__u64) to compare to jiffies_64
 *
 * Return: %true if time a is before or the same as jiffies_64, otherwise %false.
*/
#define time_is_before_eq_jiffies64(a) time_after_eq64(get_jiffies_64(), a)

/**
 * time_is_after_eq_jiffies - return true if a is after or equal to jiffies
 * @a: time (unsigned long) to compare to jiffies
 *
 * Return: %true if time a is after or the same as jiffies, otherwise %false.
 */
#define time_is_after_eq_jiffies(a) time_before_eq(jiffies, a)
/**
 * time_is_after_eq_jiffies64 - return true if a is after or equal to jiffies_64
 * @a: time (__u64) to compare to jiffies_64
 *
 * Return: %true if time a is after or the same as jiffies_64, otherwise %false.
 */
#define time_is_after_eq_jiffies64(a) time_before_eq64(get_jiffies_64(), a)

/*
 * Have the 32-bit jiffies value wrap 5 minutes after boot
 * so jiffies wrap bugs show up earlier.
 */
#define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-300*HZ))

/*
 * Change timeval to jiffies, trying to avoid the
 * most obvious overflows..
 *
 * And some not so obvious.
 *
 * Note that we don't want to return LONG_MAX, because
 * for various timeout reasons we often end up having
 * to wait "jiffies+1" in order to guarantee that we wait
 * at _least_ "jiffies" - so "jiffies+1" had better still
 * be positive.
 */
#define MAX_JIFFY_OFFSET ((LONG_MAX >> 1)-1)

extern unsigned long preset_lpj;

/*
 * We want to do realistic conversions of time so we need to use the same
 * values the update wall clock code uses as the jiffies size.  This value
 * is: TICK_NSEC (which is defined in timex.h).  This
 * is a constant and is in nanoseconds.  We will use scaled math
 * with a set of scales defined here as SEC_JIFFIE_SC,  USEC_JIFFIE_SC and
 * NSEC_JIFFIE_SC.  Note that these defines contain nothing but
 * constants and so are computed at compile time.  SHIFT_HZ (computed
 * earlier in this file) adjusts the scaling for different HZ values.

 * Scaled math???  What is that?
 *
 * Scaled math is a way to do integer math on values that would,
 * otherwise, either overflow, underflow, or cause undesired div
 * instructions to appear in the execution path.  In short, we "scale"
 * up the operands so they take more bits (more precision, less
 * underflow), do the desired operation and then "scale" the result back
 * by the same amount.  If we do the scaling by shifting we avoid the
 * costly mpy and the dastardly div instructions.

 * Suppose, for example, we want to convert from seconds to jiffies
 * where jiffies is defined in nanoseconds as NSEC_PER_JIFFIE.  The
 * simple math is: jiff = (sec * NSEC_PER_SEC) / NSEC_PER_JIFFIE; We
 * observe that (NSEC_PER_SEC / NSEC_PER_JIFFIE) is a constant which we
 * might calculate at compile time, however, the result will only have
 * about 3-4 bits of precision (less for smaller values of HZ).
 *
 * So, we scale as follows:
 * jiff = (sec) * (NSEC_PER_SEC / NSEC_PER_JIFFIE);
 * jiff = ((sec) * ((NSEC_PER_SEC * SCALE)/ NSEC_PER_JIFFIE)) / SCALE;
 * Then we make SCALE a power of two so:
 * jiff = ((sec) * ((NSEC_PER_SEC << SCALE)/ NSEC_PER_JIFFIE)) >> SCALE;
 * Now we define:
 * #define SEC_CONV ((NSEC_PER_SEC << SCALE) / NSEC_PER_JIFFIE)
 * jiff = (sec * SEC_CONV) >> SCALE;
 *
 * Often the math we use will expand beyond 32-bits so we tell C how to
 * do this and pass the 64-bit result of the mpy through the ">> SCALE"
 * which should take the result back to 32-bits.  We want this expansion
 * to capture as much precision as possible.  At the same time we don't
 * want to overflow so we pick the SCALE to avoid this.  In this file,
 * that means using a different scale for each range of HZ values (the
 * SHIFT_HZ ranges defined earlier in this file).

 * For those who want to know, gcc will give a 64-bit result from a "*"
 * operator if the result is a long long AND at least one of the
 * operands is cast to long long (usually just prior to the "*" so as
 * not to confuse it into thinking it really has a 64-bit operand,
 * which, by the way, it can do, but it takes more code and at least 2
 * mpys).
* We also need to be aware that one second in nanoseconds is only a
 * couple of bits away from overflowing a 32-bit word, so we MUST use
 * 64-bits to get the full range time in nanoseconds.

 */

/*
 * Here are the scales we will use.  One for seconds, nanoseconds and
 * microseconds.
 *
 * Within the limits of cpp we do a rough cut at the SEC_JIFFIE_SC and
 * check if the sign bit is set.  If not, we bump the shift count by 1.
 * (Gets an extra bit of precision where we can use it.)
 * We know it is set for HZ = 1024 and HZ = 100 not for 1000.
 * Haven't tested others.

 * Limits of cpp (for #if expressions): only long (no long long), but
 * then we only need the most significant bit.
 */

#define SEC_JIFFIE_SC (31 - SHIFT_HZ)
/* Bit 31 clear in the rough estimate: there is room for one more shift. */
#if !((((NSEC_PER_SEC << 2) / TICK_NSEC) << (SEC_JIFFIE_SC - 2)) & 0x80000000)
#undef SEC_JIFFIE_SC
#define SEC_JIFFIE_SC (32 - SHIFT_HZ)
#endif
#define NSEC_JIFFIE_SC (SEC_JIFFIE_SC + 29)
/* Both conversion factors add TICK_NSEC - 1 before dividing, so the
 * quotient is rounded up.
 */
#define SEC_CONVERSION ((unsigned long)((((u64)NSEC_PER_SEC << SEC_JIFFIE_SC) +\
                                TICK_NSEC -1) / (u64)TICK_NSEC))

#define NSEC_CONVERSION ((unsigned long)((((u64)1 << NSEC_JIFFIE_SC) +\
                                        TICK_NSEC -1) / (u64)TICK_NSEC))
/*
 * The maximum jiffy value is (MAX_INT >> 1).  Here we translate that
 * into seconds.  The 64-bit case will overflow if we are not careful,
 * so use the messy SH_DIV macro to do it.  Still all constants.
*/
#if BITS_PER_LONG < 64
# define MAX_SEC_IN_JIFFIES \
	(long)((u64)((u64)MAX_JIFFY_OFFSET * TICK_NSEC) / NSEC_PER_SEC)
#else	/* take care of overflow on 64-bit machines */
# define MAX_SEC_IN_JIFFIES \
	(SH_DIV((MAX_JIFFY_OFFSET >> SEC_JIFFIE_SC) * TICK_NSEC, NSEC_PER_SEC, 1) - 1)

#endif

/*
 * Convert various time units to each other:
 */
extern unsigned int jiffies_to_msecs(const unsigned long j);
extern unsigned int jiffies_to_usecs(const unsigned long j);

/**
 * jiffies_to_nsecs - Convert jiffies to nanoseconds
 * @j: jiffies value
 *
 * The conversion goes through jiffies_to_usecs(), whose result is an
 * unsigned int, so the result is limited to what that intermediate value
 * can hold, multiplied by NSEC_PER_USEC.
 *
 * Return: nanoseconds value
 */
static inline u64 jiffies_to_nsecs(const unsigned long j)
{
	return (u64)jiffies_to_usecs(j) * NSEC_PER_USEC;
}

extern u64 jiffies64_to_nsecs(u64 j);
extern u64 jiffies64_to_msecs(u64 j);

extern unsigned long __msecs_to_jiffies(const unsigned int m);
#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
/*
 * HZ is equal to or smaller than 1000, and 1000 is a nice round
 * multiple of HZ, divide with the factor between them, but round
 * upwards:
 */
static inline unsigned long _msecs_to_jiffies(const unsigned int m)
{
	return (m + (MSEC_PER_SEC / HZ) - 1) / (MSEC_PER_SEC / HZ);
}
#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC)
/*
 * HZ is larger than 1000, and HZ is a nice round multiple of 1000 -
 * simply multiply with the factor between them.
 *
 * But first make sure the multiplication result cannot overflow:
 */
static inline unsigned long _msecs_to_jiffies(const unsigned int m)
{
	/* Inputs beyond the msec value of MAX_JIFFY_OFFSET saturate. */
	if (m > jiffies_to_msecs(MAX_JIFFY_OFFSET))
		return MAX_JIFFY_OFFSET;
	return m * (HZ / MSEC_PER_SEC);
}
#else
/*
 * Generic case - multiply, round and divide.
But first check that if * we are doing a net multiplication, that we wouldn't overflow: */ static inline unsigned long _msecs_to_jiffies(const unsigned int m) { if (HZ > MSEC_PER_SEC && m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) return MAX_JIFFY_OFFSET; return (MSEC_TO_HZ_MUL32 * m + MSEC_TO_HZ_ADJ32) >> MSEC_TO_HZ_SHR32; } #endif /** * msecs_to_jiffies: - convert milliseconds to jiffies * @m: time in milliseconds * * conversion is done as follows: * * - negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET) * * - 'too large' values [that would result in larger than * MAX_JIFFY_OFFSET values] mean 'infinite timeout' too. * * - all other values are converted to jiffies by either multiplying * the input value by a factor or dividing it with a factor and * handling any 32-bit overflows. * for the details see _msecs_to_jiffies() * * msecs_to_jiffies() checks for the passed in value being a constant * via __builtin_constant_p() allowing gcc to eliminate most of the * code. __msecs_to_jiffies() is called if the value passed does not * allow constant folding and the actual conversion must be done at * runtime. * The HZ range specific helpers _msecs_to_jiffies() are called both * directly here and from __msecs_to_jiffies() in the case where * constant folding is not possible. * * Return: jiffies value */ static __always_inline unsigned long msecs_to_jiffies(const unsigned int m) { if (__builtin_constant_p(m)) { if ((int)m < 0) return MAX_JIFFY_OFFSET; return _msecs_to_jiffies(m); } else { return __msecs_to_jiffies(m); } } /** * secs_to_jiffies: - convert seconds to jiffies * @_secs: time in seconds * * Conversion is done by simple multiplication with HZ * * secs_to_jiffies() is defined as a macro rather than a static inline * function so it can be used in static initializers. 
* * Return: jiffies value */ #define secs_to_jiffies(_secs) (unsigned long)((_secs) * HZ) extern unsigned long __usecs_to_jiffies(const unsigned int u); #if !(USEC_PER_SEC % HZ) static inline unsigned long _usecs_to_jiffies(const unsigned int u) { return (u + (USEC_PER_SEC / HZ) - 1) / (USEC_PER_SEC / HZ); } #else static inline unsigned long _usecs_to_jiffies(const unsigned int u) { return (USEC_TO_HZ_MUL32 * u + USEC_TO_HZ_ADJ32) >> USEC_TO_HZ_SHR32; } #endif /** * usecs_to_jiffies: - convert microseconds to jiffies * @u: time in microseconds * * conversion is done as follows: * * - 'too large' values [that would result in larger than * MAX_JIFFY_OFFSET values] mean 'infinite timeout' too. * * - all other values are converted to jiffies by either multiplying * the input value by a factor or dividing it with a factor and * handling any 32-bit overflows as for msecs_to_jiffies. * * usecs_to_jiffies() checks for the passed in value being a constant * via __builtin_constant_p() allowing gcc to eliminate most of the * code. __usecs_to_jiffies() is called if the value passed does not * allow constant folding and the actual conversion must be done at * runtime. * The HZ range specific helpers _usecs_to_jiffies() are called both * directly here and from __msecs_to_jiffies() in the case where * constant folding is not possible. 
* * Return: jiffies value */ static __always_inline unsigned long usecs_to_jiffies(const unsigned int u) { if (__builtin_constant_p(u)) { if (u > jiffies_to_usecs(MAX_JIFFY_OFFSET)) return MAX_JIFFY_OFFSET; return _usecs_to_jiffies(u); } else { return __usecs_to_jiffies(u); } } extern unsigned long timespec64_to_jiffies(const struct timespec64 *value); extern void jiffies_to_timespec64(const unsigned long jiffies, struct timespec64 *value); extern clock_t jiffies_to_clock_t(unsigned long x); static inline clock_t jiffies_delta_to_clock_t(long delta) { return jiffies_to_clock_t(max(0L, delta)); } static inline unsigned int jiffies_delta_to_msecs(long delta) { return jiffies_to_msecs(max(0L, delta)); } extern unsigned long clock_t_to_jiffies(unsigned long x); extern u64 jiffies_64_to_clock_t(u64 x); extern u64 nsec_to_clock_t(u64 x); extern u64 nsecs_to_jiffies64(u64 n); extern unsigned long nsecs_to_jiffies(u64 n); #define TIMESTAMP_SIZE 30 #endif |
46 3 43 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 | // SPDX-License-Identifier: GPL-2.0 /* * Handling of different ABIs (personalities). * * We group personalities into execution domains which have their * own handlers for kernel entry points, signal mapping, etc... * * 2001-05-06 Complete rewrite, Christoph Hellwig (hch@infradead.org) */ #include <linux/init.h> #include <linux/kernel.h> #include <linux/kmod.h> #include <linux/module.h> #include <linux/personality.h> #include <linux/proc_fs.h> #include <linux/sched.h> #include <linux/seq_file.h> #include <linux/syscalls.h> #include <linux/sysctl.h> #include <linux/types.h> #ifdef CONFIG_PROC_FS static int execdomains_proc_show(struct seq_file *m, void *v) { seq_puts(m, "0-0\tLinux \t[kernel]\n"); return 0; } static int __init proc_execdomains_init(void) { proc_create_single("execdomains", 0, NULL, execdomains_proc_show); return 0; } module_init(proc_execdomains_init); #endif SYSCALL_DEFINE1(personality, unsigned int, personality) { unsigned int old = current->personality; if (personality != 0xffffffff) set_personality(personality); return old; } |
11816 11943 11937 12 2489 681 9879 252 9756 240 1465 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 | #ifndef _LINUX_JHASH_H #define _LINUX_JHASH_H /* jhash.h: Jenkins hash support. * * Copyright (C) 2006. Bob Jenkins (bob_jenkins@burtleburtle.net) * * https://burtleburtle.net/bob/hash/ * * These are the credits from Bob's sources: * * lookup3.c, by Bob Jenkins, May 2006, Public Domain. * * These are functions for producing 32-bit hashes for hash table lookup. * hashword(), hashlittle(), hashlittle2(), hashbig(), mix(), and final() * are externally useful functions. Routines to test the hash are included * if SELF_TEST is defined. You can use this free for any purpose. It's in * the public domain. It has no warranty. * * Copyright (C) 2009-2010 Jozsef Kadlecsik (kadlec@netfilter.org) * * I've modified Bob's hash to be useful in the Linux kernel, and * any bugs present are my fault. * Jozsef */ #include <linux/bitops.h> #include <linux/unaligned/packed_struct.h> /* Best hash sizes are of power of two */ #define jhash_size(n) ((u32)1<<(n)) /* Mask the hash value, i.e (value & jhash_mask(n)) instead of (value % n) */ #define jhash_mask(n) (jhash_size(n)-1) /* __jhash_mix - mix 3 32-bit values reversibly. 
*/ #define __jhash_mix(a, b, c) \ { \ a -= c; a ^= rol32(c, 4); c += b; \ b -= a; b ^= rol32(a, 6); a += c; \ c -= b; c ^= rol32(b, 8); b += a; \ a -= c; a ^= rol32(c, 16); c += b; \ b -= a; b ^= rol32(a, 19); a += c; \ c -= b; c ^= rol32(b, 4); b += a; \ } /* __jhash_final - final mixing of 3 32-bit values (a,b,c) into c */ #define __jhash_final(a, b, c) \ { \ c ^= b; c -= rol32(b, 14); \ a ^= c; a -= rol32(c, 11); \ b ^= a; b -= rol32(a, 25); \ c ^= b; c -= rol32(b, 16); \ a ^= c; a -= rol32(c, 4); \ b ^= a; b -= rol32(a, 14); \ c ^= b; c -= rol32(b, 24); \ } /* An arbitrary initial parameter */ #define JHASH_INITVAL 0xdeadbeef /* jhash - hash an arbitrary key * @k: sequence of bytes as key * @length: the length of the key * @initval: the previous hash, or an arbitrary value * * The generic version, hashes an arbitrary sequence of bytes. * No alignment or length assumptions are made about the input key. * * Returns the hash value of the key. The result depends on endianness. */ static inline u32 jhash(const void *key, u32 length, u32 initval) { u32 a, b, c; const u8 *k = key; /* Set up the internal state */ a = b = c = JHASH_INITVAL + length + initval; /* All but the last block: affect some 32 bits of (a,b,c) */ while (length > 12) { a += __get_unaligned_cpu32(k); b += __get_unaligned_cpu32(k + 4); c += __get_unaligned_cpu32(k + 8); __jhash_mix(a, b, c); length -= 12; k += 12; } /* Last block: affect all 32 bits of (c) */ switch (length) { case 12: c += (u32)k[11]<<24; fallthrough; case 11: c += (u32)k[10]<<16; fallthrough; case 10: c += (u32)k[9]<<8; fallthrough; case 9: c += k[8]; fallthrough; case 8: b += (u32)k[7]<<24; fallthrough; case 7: b += (u32)k[6]<<16; fallthrough; case 6: b += (u32)k[5]<<8; fallthrough; case 5: b += k[4]; fallthrough; case 4: a += (u32)k[3]<<24; fallthrough; case 3: a += (u32)k[2]<<16; fallthrough; case 2: a += (u32)k[1]<<8; fallthrough; case 1: a += k[0]; __jhash_final(a, b, c); break; case 0: /* Nothing left to add */ break; } 
return c; } /* jhash2 - hash an array of u32's * @k: the key which must be an array of u32's * @length: the number of u32's in the key * @initval: the previous hash, or an arbitrary value * * Returns the hash value of the key. */ static inline u32 jhash2(const u32 *k, u32 length, u32 initval) { u32 a, b, c; /* Set up the internal state */ a = b = c = JHASH_INITVAL + (length<<2) + initval; /* Handle most of the key */ while (length > 3) { a += k[0]; b += k[1]; c += k[2]; __jhash_mix(a, b, c); length -= 3; k += 3; } /* Handle the last 3 u32's */ switch (length) { case 3: c += k[2]; fallthrough; case 2: b += k[1]; fallthrough; case 1: a += k[0]; __jhash_final(a, b, c); break; case 0: /* Nothing left to add */ break; } return c; } /* __jhash_nwords - hash exactly 3, 2 or 1 word(s) */ static inline u32 __jhash_nwords(u32 a, u32 b, u32 c, u32 initval) { a += initval; b += initval; c += initval; __jhash_final(a, b, c); return c; } static inline u32 jhash_3words(u32 a, u32 b, u32 c, u32 initval) { return __jhash_nwords(a, b, c, initval + JHASH_INITVAL + (3 << 2)); } static inline u32 jhash_2words(u32 a, u32 b, u32 initval) { return __jhash_nwords(a, b, 0, initval + JHASH_INITVAL + (2 << 2)); } static inline u32 jhash_1word(u32 a, u32 initval) { return __jhash_nwords(a, 0, 0, initval + JHASH_INITVAL + (1 << 2)); } #endif /* _LINUX_JHASH_H */ |
2 3218 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 | /* SPDX-License-Identifier: GPL-2.0-or-later */ #ifndef _NET_RPS_H #define _NET_RPS_H #include <linux/types.h> #include <linux/static_key.h> #include <net/sock.h> #include <net/hotdata.h> #ifdef CONFIG_RPS extern struct static_key_false rps_needed; extern struct static_key_false rfs_needed; /* * This structure holds an RPS map which can be of variable length. The * map is an array of CPUs. */ struct rps_map { unsigned int len; struct rcu_head rcu; u16 cpus[]; }; #define RPS_MAP_SIZE(_num) (sizeof(struct rps_map) + ((_num) * sizeof(u16))) /* * The rps_dev_flow structure contains the mapping of a flow to a CPU, the * tail pointer for that CPU's input queue at the time of last enqueue, and * a hardware filter index. */ struct rps_dev_flow { u16 cpu; u16 filter; unsigned int last_qtail; }; #define RPS_NO_FILTER 0xffff /* * The rps_dev_flow_table structure contains a table of flow mappings. */ struct rps_dev_flow_table { u8 log; struct rcu_head rcu; struct rps_dev_flow flows[]; }; #define RPS_DEV_FLOW_TABLE_SIZE(_num) (sizeof(struct rps_dev_flow_table) + \ ((_num) * sizeof(struct rps_dev_flow))) /* * The rps_sock_flow_table contains mappings of flows to the last CPU * on which they were processed by the application (set in recvmsg). * Each entry is a 32bit value. Upper part is the high-order bits * of flow hash, lower part is CPU number. 
* rps_cpu_mask is used to partition the space, depending on number of * possible CPUs : rps_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1 * For example, if 64 CPUs are possible, rps_cpu_mask = 0x3f, * meaning we use 32-6=26 bits for the hash. */ struct rps_sock_flow_table { u32 mask; u32 ents[] ____cacheline_aligned_in_smp; }; #define RPS_SOCK_FLOW_TABLE_SIZE(_num) (offsetof(struct rps_sock_flow_table, ents[_num])) #define RPS_NO_CPU 0xffff static inline void rps_record_sock_flow(struct rps_sock_flow_table *table, u32 hash) { unsigned int index = hash & table->mask; u32 val = hash & ~net_hotdata.rps_cpu_mask; /* We only give a hint, preemption can change CPU under us */ val |= raw_smp_processor_id(); /* The following WRITE_ONCE() is paired with the READ_ONCE() * here, and another one in get_rps_cpu(). */ if (READ_ONCE(table->ents[index]) != val) WRITE_ONCE(table->ents[index], val); } #endif /* CONFIG_RPS */ static inline void sock_rps_record_flow_hash(__u32 hash) { #ifdef CONFIG_RPS struct rps_sock_flow_table *sock_flow_table; if (!hash) return; rcu_read_lock(); sock_flow_table = rcu_dereference(net_hotdata.rps_sock_flow_table); if (sock_flow_table) rps_record_sock_flow(sock_flow_table, hash); rcu_read_unlock(); #endif } static inline void sock_rps_record_flow(const struct sock *sk) { #ifdef CONFIG_RPS if (static_branch_unlikely(&rfs_needed)) { /* Reading sk->sk_rxhash might incur an expensive cache line * miss. * * TCP_ESTABLISHED does cover almost all states where RFS * might be useful, and is cheaper [1] than testing : * IPv4: inet_sk(sk)->inet_daddr * IPv6: ipv6_addr_any(&sk->sk_v6_daddr) * OR an additional socket flag * [1] : sk_state and sk_prot are in the same cache line. */ if (sk->sk_state == TCP_ESTABLISHED) { /* This READ_ONCE() is paired with the WRITE_ONCE() * from sock_rps_save_rxhash() and sock_rps_reset_rxhash(). 
*/ sock_rps_record_flow_hash(READ_ONCE(sk->sk_rxhash)); } } #endif } static inline u32 rps_input_queue_tail_incr(struct softnet_data *sd) { #ifdef CONFIG_RPS return ++sd->input_queue_tail; #else return 0; #endif } static inline void rps_input_queue_tail_save(u32 *dest, u32 tail) { #ifdef CONFIG_RPS WRITE_ONCE(*dest, tail); #endif } static inline void rps_input_queue_head_add(struct softnet_data *sd, int val) { #ifdef CONFIG_RPS WRITE_ONCE(sd->input_queue_head, sd->input_queue_head + val); #endif } static inline void rps_input_queue_head_incr(struct softnet_data *sd) { rps_input_queue_head_add(sd, 1); } #endif /* _NET_RPS_H */ |
2140 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_UNWIND_H #define _ASM_X86_UNWIND_H #include <linux/sched.h> #include <linux/ftrace.h> #include <linux/rethook.h> #include <asm/ptrace.h> #include <asm/stacktrace.h> #define IRET_FRAME_OFFSET (offsetof(struct pt_regs, ip)) #define IRET_FRAME_SIZE (sizeof(struct pt_regs) - IRET_FRAME_OFFSET) struct unwind_state { struct stack_info stack_info; unsigned long stack_mask; struct task_struct *task; int graph_idx; #if defined(CONFIG_RETHOOK) struct llist_node *kr_cur; #endif bool error; #if defined(CONFIG_UNWINDER_ORC) bool signal, full_regs; unsigned long sp, bp, ip; struct pt_regs *regs, *prev_regs; #elif defined(CONFIG_UNWINDER_FRAME_POINTER) bool got_irq; unsigned long *bp, *orig_sp, ip; /* * If non-NULL: The current frame is incomplete and doesn't contain a * valid BP. When looking for the next frame, use this instead of the * non-existent saved BP. 
*/ unsigned long *next_bp; struct pt_regs *regs; #else unsigned long *sp; #endif }; void __unwind_start(struct unwind_state *state, struct task_struct *task, struct pt_regs *regs, unsigned long *first_frame); bool unwind_next_frame(struct unwind_state *state); unsigned long unwind_get_return_address(struct unwind_state *state); unsigned long *unwind_get_return_address_ptr(struct unwind_state *state); static inline bool unwind_done(struct unwind_state *state) { return state->stack_info.type == STACK_TYPE_UNKNOWN; } static inline bool unwind_error(struct unwind_state *state) { return state->error; } static inline void unwind_start(struct unwind_state *state, struct task_struct *task, struct pt_regs *regs, unsigned long *first_frame) { first_frame = first_frame ? : get_stack_pointer(task, regs); __unwind_start(state, task, regs, first_frame); } #if defined(CONFIG_UNWINDER_ORC) || defined(CONFIG_UNWINDER_FRAME_POINTER) /* * If 'partial' returns true, only the iret frame registers are valid. 
*/ static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state, bool *partial) { if (unwind_done(state)) return NULL; if (partial) { #ifdef CONFIG_UNWINDER_ORC *partial = !state->full_regs; #else *partial = false; #endif } return state->regs; } #else static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state, bool *partial) { return NULL; } #endif #ifdef CONFIG_UNWINDER_ORC void unwind_init(void); void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size, void *orc, size_t orc_size); #else static inline void unwind_init(void) {} static inline void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size, void *orc, size_t orc_size) {} #endif static inline unsigned long unwind_recover_rethook(struct unwind_state *state, unsigned long addr, unsigned long *addr_p) { #ifdef CONFIG_RETHOOK if (is_rethook_trampoline(addr)) return rethook_find_ret_addr(state->task, (unsigned long)addr_p, &state->kr_cur); #endif return addr; } /* Recover the return address modified by rethook and ftrace_graph. */ static inline unsigned long unwind_recover_ret_addr(struct unwind_state *state, unsigned long addr, unsigned long *addr_p) { unsigned long ret; ret = ftrace_graph_ret_addr(state->task, &state->graph_idx, addr, addr_p); return unwind_recover_rethook(state, ret, addr_p); } /* * This disables KASAN checking when reading a value from another task's stack, * since the other task could be running on another CPU and could have poisoned * the stack in the meantime. */ #define READ_ONCE_TASK_STACK(task, x) \ ({ \ unsigned long val; \ if (task == current) \ val = READ_ONCE(x); \ else \ val = READ_ONCE_NOCHECK(x); \ val; \ }) static inline bool task_on_another_cpu(struct task_struct *task) { #ifdef CONFIG_SMP return task != current && task->on_cpu; #else return false; #endif } #endif /* _ASM_X86_UNWIND_H */ |
63 282 3388 694 30 28 19 5 33 84 25 172 6021 7149 95 18037 2352 7 2905 117 4115 131 1 733 799 57 93 39 41 363 363 427 321 107 95 5 35 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 
494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 
994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 
1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* internal.h: mm/ internal definitions * * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #ifndef __MM_INTERNAL_H #define __MM_INTERNAL_H #include <linux/fs.h> #include <linux/khugepaged.h> #include <linux/mm.h> #include <linux/mm_inline.h> #include <linux/pagemap.h> #include <linux/pagewalk.h> #include <linux/rmap.h> #include <linux/swap.h> #include <linux/swapops.h> #include <linux/swap_cgroup.h> #include <linux/tracepoint-defs.h> /* Internal core VMA manipulation functions. */ #include "vma.h" struct folio_batch; /* * Maintains state across a page table move. The operation assumes both source * and destination VMAs already exist and are specified by the user. 
* * Partial moves are permitted, but the old and new ranges must both reside * within a VMA. * * mmap lock must be held in write and VMA write locks must be held on any VMA * that is visible. * * Use the PAGETABLE_MOVE() macro to initialise this struct. * * The old_addr and new_addr fields are updated as the page table move is * executed. * * NOTE: The page table move is affected by reading from [old_addr, old_end), * and old_addr may be updated for better page table alignment, so len_in * represents the length of the range being copied as specified by the user. */ struct pagetable_move_control { struct vm_area_struct *old; /* Source VMA. */ struct vm_area_struct *new; /* Destination VMA. */ unsigned long old_addr; /* Address from which the move begins. */ unsigned long old_end; /* Exclusive address at which old range ends. */ unsigned long new_addr; /* Address to move page tables to. */ unsigned long len_in; /* Bytes to remap specified by user. */ bool need_rmap_locks; /* Do rmap locks need to be taken? */ bool for_stack; /* Is this an early temp stack being moved? */ }; #define PAGETABLE_MOVE(name, old_, new_, old_addr_, new_addr_, len_) \ struct pagetable_move_control name = { \ .old = old_, \ .new = new_, \ .old_addr = old_addr_, \ .old_end = (old_addr_) + (len_), \ .new_addr = new_addr_, \ .len_in = len_, \ } /* * The set of flags that only affect watermark checking and reclaim * behaviour. This is used by the MM to obey the caller constraints * about IO, FS and watermark checking while ignoring placement * hints such as HIGHMEM usage. 
*/ #define GFP_RECLAIM_MASK (__GFP_RECLAIM|__GFP_HIGH|__GFP_IO|__GFP_FS|\ __GFP_NOWARN|__GFP_RETRY_MAYFAIL|__GFP_NOFAIL|\ __GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC|\ __GFP_NOLOCKDEP) /* The GFP flags allowed during early boot */ #define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_RECLAIM|__GFP_IO|__GFP_FS)) /* Control allocation cpuset and node placement constraints */ #define GFP_CONSTRAINT_MASK (__GFP_HARDWALL|__GFP_THISNODE) /* Do not use these with a slab allocator */ #define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK) /* * Different from WARN_ON_ONCE(), no warning will be issued * when we specify __GFP_NOWARN. */ #define WARN_ON_ONCE_GFP(cond, gfp) ({ \ static bool __section(".data..once") __warned; \ int __ret_warn_once = !!(cond); \ \ if (unlikely(!(gfp & __GFP_NOWARN) && __ret_warn_once && !__warned)) { \ __warned = true; \ WARN_ON(1); \ } \ unlikely(__ret_warn_once); \ }) void page_writeback_init(void); /* * If a 16GB hugetlb folio were mapped by PTEs of all of its 4kB pages, * its nr_pages_mapped would be 0x400000: choose the ENTIRELY_MAPPED bit * above that range, instead of 2*(PMD_SIZE/PAGE_SIZE). Hugetlb currently * leaves nr_pages_mapped at 0, but avoid surprise if it participates later. */ #define ENTIRELY_MAPPED 0x800000 #define FOLIO_PAGES_MAPPED (ENTIRELY_MAPPED - 1) /* * Flags passed to __show_mem() and show_free_areas() to suppress output in * various contexts. */ #define SHOW_MEM_FILTER_NODES (0x0001u) /* disallowed nodes */ /* * How many individual pages have an elevated _mapcount. Excludes * the folio's entire_mapcount. * * Don't use this function outside of debugging code. */ static inline int folio_nr_pages_mapped(const struct folio *folio) { if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) return -1; return atomic_read(&folio->_nr_pages_mapped) & FOLIO_PAGES_MAPPED; } /* * Retrieve the first entry of a folio based on a provided entry within the * folio. 
We cannot rely on folio->swap as there is no guarantee that it has * been initialized. Used for calling arch_swap_restore() */ static inline swp_entry_t folio_swap(swp_entry_t entry, const struct folio *folio) { swp_entry_t swap = { .val = ALIGN_DOWN(entry.val, folio_nr_pages(folio)), }; return swap; } static inline void *folio_raw_mapping(const struct folio *folio) { unsigned long mapping = (unsigned long)folio->mapping; return (void *)(mapping & ~PAGE_MAPPING_FLAGS); } /* * This is a file-backed mapping, and is about to be memory mapped - invoke its * mmap hook and safely handle error conditions. On error, VMA hooks will be * mutated. * * @file: File which backs the mapping. * @vma: VMA which we are mapping. * * Returns: 0 if success, error otherwise. */ static inline int mmap_file(struct file *file, struct vm_area_struct *vma) { int err = call_mmap(file, vma); if (likely(!err)) return 0; /* * OK, we tried to call the file hook for mmap(), but an error * arose. The mapping is in an inconsistent state and we most not invoke * any further hooks on it. */ vma->vm_ops = &vma_dummy_vm_ops; return err; } /* * If the VMA has a close hook then close it, and since closing it might leave * it in an inconsistent state which makes the use of any hooks suspect, clear * them down by installing dummy empty hooks. */ static inline void vma_close(struct vm_area_struct *vma) { if (vma->vm_ops && vma->vm_ops->close) { vma->vm_ops->close(vma); /* * The mapping is in an inconsistent state, and no further hooks * may be invoked upon it. */ vma->vm_ops = &vma_dummy_vm_ops; } } #ifdef CONFIG_MMU /* Flags for folio_pte_batch(). */ typedef int __bitwise fpb_t; /* Compare PTEs after pte_mkclean(), ignoring the dirty bit. */ #define FPB_IGNORE_DIRTY ((__force fpb_t)BIT(0)) /* Compare PTEs after pte_clear_soft_dirty(), ignoring the soft-dirty bit. 
*/
#define FPB_IGNORE_SOFT_DIRTY		((__force fpb_t)BIT(1))

/*
 * Normalize @pte for batch comparison: the dirty and soft-dirty bits are
 * cleared only when the matching FPB_IGNORE_* flag is set in @flags, while
 * the accessed and write bits are always cleared.
 */
static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags)
{
	pte_t stripped = pte;

	if (flags & FPB_IGNORE_DIRTY)
		stripped = pte_mkclean(stripped);
	if (likely(flags & FPB_IGNORE_SOFT_DIRTY))
		stripped = pte_clear_soft_dirty(stripped);

	/* Accessed and write bits never take part in the comparison. */
	stripped = pte_mkold(stripped);
	return pte_wrprotect(stripped);
}

/**
 * folio_pte_batch - detect a PTE batch for a large folio
 * @folio: The large folio to detect a PTE batch for.
 * @addr: The user virtual address the first page is mapped at.
 * @start_ptep: Page table pointer for the first entry.
 * @pte: Page table entry for the first page.
 * @max_nr: The maximum number of table entries to consider.
 * @flags: Flags to modify the PTE batch semantics.
 * @any_writable: Optional pointer to indicate whether any entry except the
 *		  first one is writable.
 * @any_young: Optional pointer to indicate whether any entry except the
 *		  first one is young.
 * @any_dirty: Optional pointer to indicate whether any entry except the
 *		  first one is dirty.
 *
 * Detect a PTE batch: consecutive (present) PTEs that map consecutive
 * pages of the same large folio.
 *
 * All PTEs inside a PTE batch have the same PTE bits set, excluding the PFN,
 * the accessed bit, writable bit, dirty bit (with FPB_IGNORE_DIRTY) and
 * soft-dirty bit (with FPB_IGNORE_SOFT_DIRTY).
 *
 * start_ptep must map any page of the folio. max_nr must be at least one and
 * must be limited by the caller so scanning cannot exceed a single page table.
 *
 * Return: the number of table entries in the batch.
*/
static inline int folio_pte_batch(struct folio *folio, unsigned long addr,
		pte_t *start_ptep, pte_t pte, int max_nr, fpb_t flags,
		bool *any_writable, bool *any_young, bool *any_dirty)
{
	pte_t expected_pte, *ptep;
	/* Each flag is only assigned and read when its any_* pointer is non-NULL. */
	bool writable, young, dirty;
	int nr, cur_nr;

	/* Optional outputs start out false and are OR-ed in as entries match. */
	if (any_writable)
		*any_writable = false;
	if (any_young)
		*any_young = false;
	if (any_dirty)
		*any_dirty = false;

	VM_WARN_ON_FOLIO(!pte_present(pte), folio);
	VM_WARN_ON_FOLIO(!folio_test_large(folio) || max_nr < 1, folio);
	VM_WARN_ON_FOLIO(page_folio(pfn_to_page(pte_pfn(pte))) != folio, folio);

	/* Limit max_nr to the actual remaining PFNs in the folio we could batch. */
	max_nr = min_t(unsigned long, max_nr,
		       folio_pfn(folio) + folio_nr_pages(folio) - pte_pfn(pte));

	/* Skip the entries that pte_batch_hint() reports for the first PTE. */
	nr = pte_batch_hint(start_ptep, pte);
	expected_pte = __pte_batch_clear_ignored(pte_advance_pfn(pte, nr), flags);
	ptep = start_ptep + nr;

	while (nr < max_nr) {
		pte = ptep_get(ptep);
		/* Sample the bits before they are stripped for the comparison. */
		if (any_writable)
			writable = !!pte_write(pte);
		if (any_young)
			young = !!pte_young(pte);
		if (any_dirty)
			dirty = !!pte_dirty(pte);
		pte = __pte_batch_clear_ignored(pte, flags);

		if (!pte_same(pte, expected_pte))
			break;

		/* The entry belongs to the batch: fold its bits into the outputs. */
		if (any_writable)
			*any_writable |= writable;
		if (any_young)
			*any_young |= young;
		if (any_dirty)
			*any_dirty |= dirty;

		cur_nr = pte_batch_hint(ptep, pte);
		expected_pte = pte_advance_pfn(expected_pte, cur_nr);
		ptep += cur_nr;
		nr += cur_nr;
	}

	/* nr advances in steps of cur_nr and may overshoot; clamp to max_nr. */
	return min(nr, max_nr);
}

/**
 * pte_move_swp_offset - Move the swap entry offset field of a swap pte
 *	 forward or backward by delta
 * @pte: The initial pte state; is_swap_pte(pte) must be true and
 *	 non_swap_entry() must be false.
 * @delta: The direction and the offset we are moving; forward if delta
 *	 is positive; backward if delta is negative
 *
 * Moves the swap offset, while maintaining all other fields, including
 * swap type, and any swp pte bits. The resulting pte is returned.
*/
static inline pte_t pte_move_swp_offset(pte_t pte, long delta)
{
	swp_entry_t old_entry = pte_to_swp_entry(pte);
	pte_t moved;

	/* Rebuild the pte from the same swap type and the shifted offset. */
	moved = __swp_entry_to_pte(__swp_entry(swp_type(old_entry),
					       (swp_offset(old_entry) + delta)));

	/* Carry over each swp pte bit that was set on the source pte. */
	if (pte_swp_soft_dirty(pte))
		moved = pte_swp_mksoft_dirty(moved);
	if (pte_swp_exclusive(pte))
		moved = pte_swp_mkexclusive(moved);
	if (pte_swp_uffd_wp(pte))
		moved = pte_swp_mkuffd_wp(moved);

	return moved;
}


/**
 * pte_next_swp_offset - Increment the swap entry offset field of a swap pte.
 * @pte: The initial pte state; is_swap_pte(pte) must be true and
 *	 non_swap_entry() must be false.
 *
 * Equivalent to pte_move_swp_offset() with a delta of one: the swap offset is
 * incremented while all other fields, including swap type and any swp pte
 * bits, are maintained. The resulting pte is returned.
 */
static inline pte_t pte_next_swp_offset(pte_t pte)
{
	return pte_move_swp_offset(pte, 1);
}

/**
 * swap_pte_batch - detect a PTE batch for a set of contiguous swap entries
 * @start_ptep: Page table pointer for the first entry.
 * @max_nr: The maximum number of table entries to consider.
 * @pte: Page table entry for the first entry.
 *
 * Detect a batch of contiguous swap entries: consecutive (non-present) PTEs
 * containing swap entries all with consecutive offsets and targeting the same
 * swap type, all with matching swp pte bits.
 *
 * max_nr must be at least one and must be limited by the caller so scanning
 * cannot exceed a single page table.
 *
 * Return: the number of table entries in the batch.
*/ static inline int swap_pte_batch(pte_t *start_ptep, int max_nr, pte_t pte) { pte_t expected_pte = pte_next_swp_offset(pte); const pte_t *end_ptep = start_ptep + max_nr; swp_entry_t entry = pte_to_swp_entry(pte); pte_t *ptep = start_ptep + 1; unsigned short cgroup_id; VM_WARN_ON(max_nr < 1); VM_WARN_ON(!is_swap_pte(pte)); VM_WARN_ON(non_swap_entry(entry)); cgroup_id = lookup_swap_cgroup_id(entry); while (ptep < end_ptep) { pte = ptep_get(ptep); if (!pte_same(pte, expected_pte)) break; if (lookup_swap_cgroup_id(pte_to_swp_entry(pte)) != cgroup_id) break; expected_pte = pte_next_swp_offset(expected_pte); ptep++; } return ptep - start_ptep; } #endif /* CONFIG_MMU */ void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio, int nr_throttled); static inline void acct_reclaim_writeback(struct folio *folio) { pg_data_t *pgdat = folio_pgdat(folio); int nr_throttled = atomic_read(&pgdat->nr_writeback_throttled); if (nr_throttled) __acct_reclaim_writeback(pgdat, folio, nr_throttled); } static inline void wake_throttle_isolated(pg_data_t *pgdat) { wait_queue_head_t *wqh; wqh = &pgdat->reclaim_wait[VMSCAN_THROTTLE_ISOLATED]; if (waitqueue_active(wqh)) wake_up(wqh); } vm_fault_t __vmf_anon_prepare(struct vm_fault *vmf); static inline vm_fault_t vmf_anon_prepare(struct vm_fault *vmf) { vm_fault_t ret = __vmf_anon_prepare(vmf); if (unlikely(ret & VM_FAULT_RETRY)) vma_end_read(vmf->vma); return ret; } vm_fault_t do_swap_page(struct vm_fault *vmf); void folio_rotate_reclaimable(struct folio *folio); bool __folio_end_writeback(struct folio *folio); void deactivate_file_folio(struct folio *folio); void folio_activate(struct folio *folio); void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas, struct vm_area_struct *start_vma, unsigned long floor, unsigned long ceiling, bool mm_wr_locked); void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte); struct zap_details; void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long 
addr, unsigned long end, struct zap_details *details); int folio_unmap_invalidate(struct address_space *mapping, struct folio *folio, gfp_t gfp); void page_cache_ra_order(struct readahead_control *, struct file_ra_state *, unsigned int order); void force_page_cache_ra(struct readahead_control *, unsigned long nr); static inline void force_page_cache_readahead(struct address_space *mapping, struct file *file, pgoff_t index, unsigned long nr_to_read) { DEFINE_READAHEAD(ractl, file, &file->f_ra, mapping, index); force_page_cache_ra(&ractl, nr_to_read); } unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices); unsigned find_get_entries(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices); void filemap_free_folio(struct address_space *mapping, struct folio *folio); int truncate_inode_folio(struct address_space *mapping, struct folio *folio); bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end); long mapping_evict_folio(struct address_space *mapping, struct folio *folio); unsigned long mapping_try_invalidate(struct address_space *mapping, pgoff_t start, pgoff_t end, unsigned long *nr_failed); /** * folio_evictable - Test whether a folio is evictable. * @folio: The folio to test. * * Test whether @folio is evictable -- i.e., should be placed on * active/inactive lists vs unevictable list. * * Reasons folio might not be evictable: * 1. folio's mapping marked unevictable * 2. One of the pages in the folio is part of an mlocked VMA */ static inline bool folio_evictable(struct folio *folio) { bool ret; /* Prevent address_space of inode and swap cache from being freed */ rcu_read_lock(); ret = !mapping_unevictable(folio_mapping(folio)) && !folio_test_mlocked(folio); rcu_read_unlock(); return ret; } /* * Turn a non-refcounted page (->_refcount == 0) into refcounted with * a count of one. 
*/
static inline void set_page_refcounted(struct page *page)
{
	/* The page must not be a tail page and must currently have a zero refcount. */
	VM_BUG_ON_PAGE(PageTail(page), page);
	VM_BUG_ON_PAGE(page_ref_count(page), page);
	set_page_count(page, 1);
}

/*
 * Return true if a folio needs ->release_folio() calling upon it.
 */
static inline bool folio_needs_release(struct folio *folio)
{
	struct address_space *mapping = folio_mapping(folio);

	return folio_has_private(folio) ||
		(mapping && mapping_release_always(mapping));
}

extern unsigned long highest_memmap_pfn;

/*
 * Maximum number of reclaim retries without progress before the OOM
 * killer is considered the only way forward.
 */
#define MAX_RECLAIM_RETRIES 16

/*
 * in mm/vmscan.c:
 */
bool folio_isolate_lru(struct folio *folio);
void folio_putback_lru(struct folio *folio);
extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason);

/*
 * in mm/rmap.c:
 */
pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);

/*
 * in mm/page_alloc.c
 */
/* Shift a page count left by (PAGE_SHIFT - 10), i.e. express it in KiB. */
#define K(x) ((x) << (PAGE_SHIFT-10))

extern char * const zone_names[MAX_NR_ZONES];

/* perform sanity checks on struct pages being allocated or freed */
DECLARE_STATIC_KEY_MAYBE(CONFIG_DEBUG_VM, check_pages_enabled);

extern int min_free_kbytes;
extern int defrag_mode;

void setup_per_zone_wmarks(void);
void calculate_min_free_kbytes(void);
int __meminit init_per_zone_wmark_min(void);
void page_alloc_sysctl_init(void);

/*
 * Structure for holding the mostly immutable allocation parameters passed
 * between functions involved in allocations, including the alloc_pages*
 * family of functions.
 *
 * nodemask, migratetype and highest_zoneidx are initialized only once in
 * __alloc_pages() and then never change.
 *
 * zonelist, preferred_zone and highest_zoneidx are set first in
 * __alloc_pages() for the fast path, and might be later changed
 * in __alloc_pages_slowpath(). All other functions pass the whole structure
 * by a const pointer.
*/
struct alloc_context {
	struct zonelist *zonelist;
	nodemask_t *nodemask;
	struct zoneref *preferred_zoneref;
	int migratetype;

	/*
	 * highest_zoneidx represents highest usable zone index of
	 * the allocation request. Due to the nature of the zone,
	 * memory on lower zone than the highest_zoneidx will be
	 * protected by lowmem_reserve[highest_zoneidx].
	 *
	 * highest_zoneidx is also used by reclaim/compaction to limit
	 * the target zone since higher zone than this index cannot be
	 * usable for this allocation request.
	 */
	enum zone_type highest_zoneidx;
	bool spread_dirty_pages;
};

/*
 * This function returns the order of a free page in the buddy system. In
 * general, page_zone(page)->lock must be held by the caller to prevent the
 * page from being allocated in parallel and returning garbage as the order.
 * If a caller does not hold page_zone(page)->lock, it must guarantee that the
 * page cannot be allocated or merged in parallel. Alternatively, it must
 * handle invalid values gracefully, and use buddy_order_unsafe() below.
 */
static inline unsigned int buddy_order(struct page *page)
{
	/* PageBuddy() must be checked by the caller */
	return page_private(page);
}

/*
 * Like buddy_order(), but for callers who cannot afford to hold the zone lock.
 * PageBuddy() should be checked first by the caller to minimize race window,
 * and invalid values must be handled gracefully.
 *
 * READ_ONCE is used so that if the caller assigns the result into a local
 * variable and e.g. tests it for valid range before using, the compiler cannot
 * decide to remove the variable and inline the page_private(page) multiple
 * times, potentially observing different values in the tests and the actual
 * use of the result.
 */
#define buddy_order_unsafe(page)	READ_ONCE(page_private(page))

/*
 * This function checks whether a page is free && is the buddy
 * we can coalesce a page and its buddy if
 * (a) the buddy is not in a hole (check before calling!)
&& * (b) the buddy is in the buddy system && * (c) a page and its buddy have the same order && * (d) a page and its buddy are in the same zone. * * For recording whether a page is in the buddy system, we set PageBuddy. * Setting, clearing, and testing PageBuddy is serialized by zone->lock. * * For recording page's order, we use page_private(page). */ static inline bool page_is_buddy(struct page *page, struct page *buddy, unsigned int order) { if (!page_is_guard(buddy) && !PageBuddy(buddy)) return false; if (buddy_order(buddy) != order) return false; /* * zone check is done late to avoid uselessly calculating * zone/node ids for pages that could never merge. */ if (page_zone_id(page) != page_zone_id(buddy)) return false; VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); return true; } /* * Locate the struct page for both the matching buddy in our * pair (buddy1) and the combined O(n+1) page they form (page). * * 1) Any buddy B1 will have an order O twin B2 which satisfies * the following equation: * B2 = B1 ^ (1 << O) * For example, if the starting buddy (buddy2) is #8 its order * 1 buddy is #10: * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10 * * 2) Any buddy B will have an order O+1 parent P which * satisfies the following equation: * P = B & ~(1 << O) * * Assumption: *_mem_map is contiguous at least up to MAX_PAGE_ORDER */ static inline unsigned long __find_buddy_pfn(unsigned long page_pfn, unsigned int order) { return page_pfn ^ (1 << order); } /* * Find the buddy of @page and validate it. * @page: The input page * @pfn: The pfn of the page, it saves a call to page_to_pfn() when the * function is used in the performance-critical __free_one_page(). * @order: The order of the page * @buddy_pfn: The output pointer to the buddy pfn, it also saves a call to * page_to_pfn(). * * The found buddy can be a non PageBuddy, out of @page's zone, or its order is * not the same as @page. The validation is necessary before use it. * * Return: the found buddy page or NULL if not found. 
*/ static inline struct page *find_buddy_page_pfn(struct page *page, unsigned long pfn, unsigned int order, unsigned long *buddy_pfn) { unsigned long __buddy_pfn = __find_buddy_pfn(pfn, order); struct page *buddy; buddy = page + (__buddy_pfn - pfn); if (buddy_pfn) *buddy_pfn = __buddy_pfn; if (page_is_buddy(page, buddy, order)) return buddy; return NULL; } extern struct page *__pageblock_pfn_to_page(unsigned long start_pfn, unsigned long end_pfn, struct zone *zone); static inline struct page *pageblock_pfn_to_page(unsigned long start_pfn, unsigned long end_pfn, struct zone *zone) { if (zone->contiguous) return pfn_to_page(start_pfn); return __pageblock_pfn_to_page(start_pfn, end_pfn, zone); } void set_zone_contiguous(struct zone *zone); bool pfn_range_intersects_zones(int nid, unsigned long start_pfn, unsigned long nr_pages); static inline void clear_zone_contiguous(struct zone *zone) { zone->contiguous = false; } extern int __isolate_free_page(struct page *page, unsigned int order); extern void __putback_isolated_page(struct page *page, unsigned int order, int mt); extern void memblock_free_pages(struct page *page, unsigned long pfn, unsigned int order); extern void __free_pages_core(struct page *page, unsigned int order, enum meminit_context context); /* * This will have no effect, other than possibly generating a warning, if the * caller passes in a non-large folio. */ static inline void folio_set_order(struct folio *folio, unsigned int order) { if (WARN_ON_ONCE(!order || !folio_test_large(folio))) return; folio->_flags_1 = (folio->_flags_1 & ~0xffUL) | order; #ifdef NR_PAGES_IN_LARGE_FOLIO folio->_nr_pages = 1U << order; #endif } bool __folio_unqueue_deferred_split(struct folio *folio); static inline bool folio_unqueue_deferred_split(struct folio *folio) { if (folio_order(folio) <= 1 || !folio_test_large_rmappable(folio)) return false; /* * At this point, there is no one trying to add the folio to * deferred_list. 
If folio is not in deferred_list, it's safe * to check without acquiring the split_queue_lock. */ if (data_race(list_empty(&folio->_deferred_list))) return false; return __folio_unqueue_deferred_split(folio); } static inline struct folio *page_rmappable_folio(struct page *page) { struct folio *folio = (struct folio *)page; if (folio && folio_test_large(folio)) folio_set_large_rmappable(folio); return folio; } static inline void prep_compound_head(struct page *page, unsigned int order) { struct folio *folio = (struct folio *)page; folio_set_order(folio, order); atomic_set(&folio->_large_mapcount, -1); if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT)) atomic_set(&folio->_nr_pages_mapped, 0); if (IS_ENABLED(CONFIG_MM_ID)) { folio->_mm_ids = 0; folio->_mm_id_mapcount[0] = -1; folio->_mm_id_mapcount[1] = -1; } if (IS_ENABLED(CONFIG_64BIT) || order > 1) { atomic_set(&folio->_pincount, 0); atomic_set(&folio->_entire_mapcount, -1); } if (order > 1) INIT_LIST_HEAD(&folio->_deferred_list); } static inline void prep_compound_tail(struct page *head, int tail_idx) { struct page *p = head + tail_idx; p->mapping = TAIL_MAPPING; set_compound_head(p, head); set_page_private(p, 0); } void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags); extern bool free_pages_prepare(struct page *page, unsigned int order); extern int user_min_free_kbytes; struct page *__alloc_frozen_pages_noprof(gfp_t, unsigned int order, int nid, nodemask_t *); #define __alloc_frozen_pages(...) \ alloc_hooks(__alloc_frozen_pages_noprof(__VA_ARGS__)) void free_frozen_pages(struct page *page, unsigned int order); void free_unref_folios(struct folio_batch *fbatch); #ifdef CONFIG_NUMA struct page *alloc_frozen_pages_noprof(gfp_t, unsigned int order); #else static inline struct page *alloc_frozen_pages_noprof(gfp_t gfp, unsigned int order) { return __alloc_frozen_pages_noprof(gfp, order, numa_node_id(), NULL); } #endif #define alloc_frozen_pages(...) 
\ alloc_hooks(alloc_frozen_pages_noprof(__VA_ARGS__)) extern void zone_pcp_reset(struct zone *zone); extern void zone_pcp_disable(struct zone *zone); extern void zone_pcp_enable(struct zone *zone); extern void zone_pcp_init(struct zone *zone); extern void *memmap_alloc(phys_addr_t size, phys_addr_t align, phys_addr_t min_addr, int nid, bool exact_nid); void memmap_init_range(unsigned long, int, unsigned long, unsigned long, unsigned long, enum meminit_context, struct vmem_altmap *, int); #if defined CONFIG_COMPACTION || defined CONFIG_CMA /* * in mm/compaction.c */ /* * compact_control is used to track pages being migrated and the free pages * they are being migrated to during memory compaction. The free_pfn starts * at the end of a zone and migrate_pfn begins at the start. Movable pages * are moved to the end of a zone during a compaction run and the run * completes when free_pfn <= migrate_pfn */ struct compact_control { struct list_head freepages[NR_PAGE_ORDERS]; /* List of free pages to migrate to */ struct list_head migratepages; /* List of pages being migrated */ unsigned int nr_freepages; /* Number of isolated free pages */ unsigned int nr_migratepages; /* Number of pages to migrate */ unsigned long free_pfn; /* isolate_freepages search base */ /* * Acts as an in/out parameter to page isolation for migration. * isolate_migratepages uses it as a search base. * isolate_migratepages_block will update the value to the next pfn * after the last isolated one. 
*/ unsigned long migrate_pfn; unsigned long fast_start_pfn; /* a pfn to start linear scan from */ struct zone *zone; unsigned long total_migrate_scanned; unsigned long total_free_scanned; unsigned short fast_search_fail;/* failures to use free list searches */ short search_order; /* order to start a fast search at */ const gfp_t gfp_mask; /* gfp mask of a direct compactor */ int order; /* order a direct compactor needs */ int migratetype; /* migratetype of direct compactor */ const unsigned int alloc_flags; /* alloc flags of a direct compactor */ const int highest_zoneidx; /* zone index of a direct compactor */ enum migrate_mode mode; /* Async or sync migration mode */ bool ignore_skip_hint; /* Scan blocks even if marked skip */ bool no_set_skip_hint; /* Don't mark blocks for skipping */ bool ignore_block_suitable; /* Scan blocks considered unsuitable */ bool direct_compaction; /* False from kcompactd or /proc/... */ bool proactive_compaction; /* kcompactd proactive compaction */ bool whole_zone; /* Whole zone should/has been scanned */ bool contended; /* Signal lock contention */ bool finish_pageblock; /* Scan the remainder of a pageblock. Used * when there are potentially transient * isolation or migration failures to * ensure forward progress. */ bool alloc_contig; /* alloc_contig_range allocation */ }; /* * Used in direct compaction when a page should be taken from the freelists * immediately when one is created during the free path. */ struct capture_control { struct compact_control *cc; struct page *page; }; unsigned long isolate_freepages_range(struct compact_control *cc, unsigned long start_pfn, unsigned long end_pfn); int isolate_migratepages_range(struct compact_control *cc, unsigned long low_pfn, unsigned long end_pfn); /* Free whole pageblock and set its migration type to MIGRATE_CMA. 
*/
void init_cma_reserved_pageblock(struct page *page);

#endif /* CONFIG_COMPACTION || CONFIG_CMA */

struct cma;

#ifdef CONFIG_CMA
void *cma_reserve_early(struct cma *cma, unsigned long size);
void init_cma_pageblock(struct page *page);
#else
/* !CONFIG_CMA stub: always returns NULL. */
static inline void *cma_reserve_early(struct cma *cma, unsigned long size)
{
	return NULL;
}
/* !CONFIG_CMA stub: does nothing. */
static inline void init_cma_pageblock(struct page *page)
{
}
#endif


int find_suitable_fallback(struct free_area *area, unsigned int order,
			   int migratetype, bool claim_only, bool *claim_block);

/* Return true if @area's free list for @migratetype is empty. */
static inline bool free_area_empty(struct free_area *area, int migratetype)
{
	return list_empty(&area->free_list[migratetype]);
}

/* mm/util.c */
struct anon_vma *folio_anon_vma(const struct folio *folio);

#ifdef CONFIG_MMU
void unmap_mapping_folio(struct folio *folio);
extern long populate_vma_page_range(struct vm_area_struct *vma,
		unsigned long start, unsigned long end, int *locked);
extern long faultin_page_range(struct mm_struct *mm, unsigned long start,
		unsigned long end, bool write, int *locked);
extern bool mlock_future_ok(struct mm_struct *mm, unsigned long flags,
			       unsigned long bytes);

/*
 * NOTE: This function can't tell whether the folio is "fully mapped" in the
 * range.
 * "fully mapped" means all the pages of folio is associated with the page
 * table of range while this function just check whether the folio range is
 * within the range [start, end). Function caller needs to do page table
 * check if it cares about the page table association.
 *
 * Typical usage (like mlock or madvise) is:
 * Caller knows at least 1 page of folio is associated with page table of VMA
 * and the range [start, end) is intersect with the VMA range. Caller wants
 * to know whether the folio is fully associated with the range. It calls
 * this function to check whether the folio is in the range first. Then checks
 * the page table to know whether the folio is fully mapped to the range.
*/ static inline bool folio_within_range(struct folio *folio, struct vm_area_struct *vma, unsigned long start, unsigned long end) { pgoff_t pgoff, addr; unsigned long vma_pglen = vma_pages(vma); VM_WARN_ON_FOLIO(folio_test_ksm(folio), folio); if (start > end) return false; if (start < vma->vm_start) start = vma->vm_start; if (end > vma->vm_end) end = vma->vm_end; pgoff = folio_pgoff(folio); /* if folio start address is not in vma range */ if (!in_range(pgoff, vma->vm_pgoff, vma_pglen)) return false; addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); return !(addr < start || end - addr < folio_size(folio)); } static inline bool folio_within_vma(struct folio *folio, struct vm_area_struct *vma) { return folio_within_range(folio, vma, vma->vm_start, vma->vm_end); } /* * mlock_vma_folio() and munlock_vma_folio(): * should be called with vma's mmap_lock held for read or write, * under page table lock for the pte/pmd being added or removed. * * mlock is usually called at the end of folio_add_*_rmap_*(), munlock at * the end of folio_remove_rmap_*(); but new anon folios are managed by * folio_add_lru_vma() calling mlock_new_folio(). */ void mlock_folio(struct folio *folio); static inline void mlock_vma_folio(struct folio *folio, struct vm_area_struct *vma) { /* * The VM_SPECIAL check here serves two purposes. * 1) VM_IO check prevents migration from double-counting during mlock. * 2) Although mmap_region() and mlock_fixup() take care that VM_LOCKED * is never left set on a VM_SPECIAL vma, there is an interval while * file->f_op->mmap() is using vm_insert_page(s), when VM_LOCKED may * still be set while VM_SPECIAL bits are added: so ignore it then. */ if (unlikely((vma->vm_flags & (VM_LOCKED|VM_SPECIAL)) == VM_LOCKED)) mlock_folio(folio); } void munlock_folio(struct folio *folio); static inline void munlock_vma_folio(struct folio *folio, struct vm_area_struct *vma) { /* * munlock if the function is called. 
Ideally, we should only * do munlock if any page of folio is unmapped from VMA and * cause folio not fully mapped to VMA. * * But it's not easy to confirm that's the situation. So we * always munlock the folio and page reclaim will correct it * if it's wrong. */ if (unlikely(vma->vm_flags & VM_LOCKED)) munlock_folio(folio); } void mlock_new_folio(struct folio *folio); bool need_mlock_drain(int cpu); void mlock_drain_local(void); void mlock_drain_remote(int cpu); extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma); /** * vma_address - Find the virtual address a page range is mapped at * @vma: The vma which maps this object. * @pgoff: The page offset within its object. * @nr_pages: The number of pages to consider. * * If any page in this range is mapped by this VMA, return the first address * where any of these pages appear. Otherwise, return -EFAULT. */ static inline unsigned long vma_address(const struct vm_area_struct *vma, pgoff_t pgoff, unsigned long nr_pages) { unsigned long address; if (pgoff >= vma->vm_pgoff) { address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); /* Check for address beyond vma (or wrapped through 0?) */ if (address < vma->vm_start || address >= vma->vm_end) address = -EFAULT; } else if (pgoff + nr_pages - 1 >= vma->vm_pgoff) { /* Test above avoids possibility of wrap to 0 on 32-bit */ address = vma->vm_start; } else { address = -EFAULT; } return address; } /* * Then at what user virtual address will none of the range be found in vma? * Assumes that vma_address() already returned a good starting address. 
*/
static inline unsigned long vma_address_end(struct page_vma_mapped_walk *pvmw)
{
	struct vm_area_struct *vma = pvmw->vma;
	pgoff_t pgoff;
	unsigned long address;

	/* Common case, plus ->pgoff is invalid for KSM */
	if (pvmw->nr_pages == 1)
		return pvmw->address + PAGE_SIZE;

	/* pgoff one past the last page of the range. */
	pgoff = pvmw->pgoff + pvmw->nr_pages;
	address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
	/* Check for address beyond vma (or wrapped through 0?) */
	if (address < vma->vm_start || address > vma->vm_end)
		address = vma->vm_end;
	return address;
}

/*
 * Returns @fpin unchanged if it is already set. Otherwise, when the fault may
 * be retried and FAULT_FLAG_RETRY_NOWAIT is clear, takes a reference on the
 * VMA's file, releases the fault lock and returns that file; else NULL.
 */
static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf,
						    struct file *fpin)
{
	int flags = vmf->flags;

	if (fpin)
		return fpin;

	/*
	 * FAULT_FLAG_RETRY_NOWAIT means we don't want to wait on page locks or
	 * anything, so we only pin the file and drop the mmap_lock if only
	 * FAULT_FLAG_ALLOW_RETRY is set, while this is the first attempt.
	 */
	if (fault_flag_allow_retry_first(flags) &&
	    !(flags & FAULT_FLAG_RETRY_NOWAIT)) {
		fpin = get_file(vmf->vma->vm_file);
		release_fault_lock(vmf);
	}
	return fpin;
}
#else /* !CONFIG_MMU */
/* Empty stubs for configurations without an MMU. */
static inline void unmap_mapping_folio(struct folio *folio) { }
static inline void mlock_new_folio(struct folio *folio) { }
static inline bool need_mlock_drain(int cpu) { return false; }
static inline void mlock_drain_local(void) { }
static inline void mlock_drain_remote(int cpu) { }
static inline void vunmap_range_noflush(unsigned long start, unsigned long end)
{
}
#endif /* !CONFIG_MMU */

/* Memory initialisation debug and verification */
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
DECLARE_STATIC_KEY_TRUE(deferred_pages);

bool __init deferred_grow_zone(struct zone *zone, unsigned int order);
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */

enum mminit_level {
	MMINIT_WARNING,
	MMINIT_VERIFY,
	MMINIT_TRACE
};

#ifdef CONFIG_DEBUG_MEMORY_INIT

extern int mminit_loglevel;

/*
 * Print only when @level is below mminit_loglevel: messages at or below
 * MMINIT_WARNING go through pr_warn(), the rest through printk(KERN_DEBUG).
 */
#define mminit_dprintk(level, prefix, fmt, arg...) \
do { \
	if (level < mminit_loglevel) { \
		if (level <= MMINIT_WARNING) \
			pr_warn("mminit::" prefix " " fmt, ##arg);	\
		else \
			printk(KERN_DEBUG "mminit::" prefix " " fmt, ##arg); \
	} \
} while (0)

extern void mminit_verify_pageflags_layout(void);
extern void mminit_verify_zonelist(void);
#else

/* !CONFIG_DEBUG_MEMORY_INIT: the debug helpers compile to nothing. */
static inline void mminit_dprintk(enum mminit_level level,
				const char *prefix, const char *fmt, ...)
{
}

static inline void mminit_verify_pageflags_layout(void)
{
}

static inline void mminit_verify_zonelist(void)
{
}
#endif /* CONFIG_DEBUG_MEMORY_INIT */

#define NODE_RECLAIM_NOSCAN	-2
#define NODE_RECLAIM_FULL	-1
#define NODE_RECLAIM_SOME	0
#define NODE_RECLAIM_SUCCESS	1

#ifdef CONFIG_NUMA
extern int node_reclaim_mode;

extern int node_reclaim(struct pglist_data *, gfp_t, unsigned int);
extern int find_next_best_node(int node, nodemask_t *used_node_mask);
#else
#define node_reclaim_mode 0

/* !CONFIG_NUMA stub: always reports NODE_RECLAIM_NOSCAN. */
static inline int node_reclaim(struct pglist_data *pgdat, gfp_t mask,
				unsigned int order)
{
	return NODE_RECLAIM_NOSCAN;
}
/* !CONFIG_NUMA stub: always reports NUMA_NO_NODE. */
static inline int find_next_best_node(int node, nodemask_t *used_node_mask)
{
	return NUMA_NO_NODE;
}
#endif

static inline bool node_reclaim_enabled(void)
{
	/* Is any node_reclaim_mode bit set? */
	return node_reclaim_mode & (RECLAIM_ZONE|RECLAIM_WRITE|RECLAIM_UNMAP);
}

/*
 * mm/memory-failure.c
 */
#ifdef CONFIG_MEMORY_FAILURE
int unmap_poisoned_folio(struct folio *folio, unsigned long pfn, bool must_kill);
void shake_folio(struct folio *folio);
extern int hwpoison_filter(struct page *p);

extern u32 hwpoison_filter_dev_major;
extern u32 hwpoison_filter_dev_minor;
extern u64 hwpoison_filter_flags_mask;
extern u64 hwpoison_filter_flags_value;
extern u64 hwpoison_filter_memcg;
extern u32 hwpoison_filter_enable;
#define MAGIC_HWPOISON	0x48575053U	/* HWPS */
void SetPageHWPoisonTakenOff(struct page *page);
void ClearPageHWPoisonTakenOff(struct page *page);
bool take_page_off_buddy(struct page *page);
bool put_page_back_buddy(struct page *page);
struct task_struct *task_early_kill(struct task_struct *tsk, int force_early);
void add_to_kill_ksm(struct task_struct *tsk, const struct page *p,
		     struct vm_area_struct *vma, struct list_head *to_kill,
		     unsigned long ksm_addr);
unsigned long page_mapped_in_vma(const struct page *page,
		struct vm_area_struct *vma);

#else
/* !CONFIG_MEMORY_FAILURE stub: always fails with -EBUSY. */
static inline int unmap_poisoned_folio(struct folio *folio, unsigned long pfn, bool must_kill)
{
	return -EBUSY;
}
#endif

extern unsigned long  __must_check vm_mmap_pgoff(struct file *, unsigned long,
        unsigned long, unsigned long,
        unsigned long, unsigned long);

extern void set_pageblock_order(void);
struct folio *alloc_migrate_folio(struct folio *src, unsigned long private);
unsigned long reclaim_pages(struct list_head *folio_list);
unsigned int reclaim_clean_pages_from_list(struct zone *zone,
					    struct list_head *folio_list);
/* The ALLOC_WMARK bits are used as an index to zone->watermark */
#define ALLOC_WMARK_MIN		WMARK_MIN
#define ALLOC_WMARK_LOW		WMARK_LOW
#define ALLOC_WMARK_HIGH	WMARK_HIGH
#define ALLOC_NO_WATERMARKS	0x04 /* don't check watermarks at all */

/* Mask to get the watermark bits */
#define ALLOC_WMARK_MASK	(ALLOC_NO_WATERMARKS-1)

/*
 * Only MMU archs have async oom victim reclaim - aka oom_reaper
/*
 * Batched-unmap TLB flush hooks. With
 * CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH the functions are only declared
 * here and implemented elsewhere; without it they are empty inline stubs, so
 * callers need no #ifdef of their own.
 */
#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
void try_to_unmap_flush(void);
void try_to_unmap_flush_dirty(void);
void flush_tlb_batched_pending(struct mm_struct *mm);
#else
/* No-op: nothing is batched in this configuration. */
static inline void try_to_unmap_flush(void)
{
}
/* No-op: nothing is batched in this configuration. */
static inline void try_to_unmap_flush_dirty(void)
{
}
/* No-op: @mm is ignored. */
static inline void flush_tlb_batched_pending(struct mm_struct *mm)
{
}
#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
/*
 * mm/vmalloc.c
 *
 * With CONFIG_MMU these are real functions implemented elsewhere. Without it
 * vmalloc_init() is an empty stub and vmap_pages_range_noflush() fails with
 * -EINVAL.
 */
#ifdef CONFIG_MMU
void __init vmalloc_init(void);
int __must_check vmap_pages_range_noflush(unsigned long addr, unsigned long end,
                pgprot_t prot, struct page **pages, unsigned int page_shift);
unsigned int get_vm_area_page_order(struct vm_struct *vm);
#else
/* !CONFIG_MMU: nothing to initialise. */
static inline void vmalloc_init(void)
{
}

/* !CONFIG_MMU: mapping is not possible, always returns -EINVAL. */
static inline
int __must_check vmap_pages_range_noflush(unsigned long addr, unsigned long end,
                pgprot_t prot, struct page **pages, unsigned int page_shift)
{
	return -EINVAL;
}
#endif
*/ static inline int get_order_from_str(const char *size_str, unsigned long valid_orders) { unsigned long size; char *endptr; int order; size = memparse(size_str, &endptr); if (!is_power_of_2(size)) return -EINVAL; order = get_order(size); if (BIT(order) & ~valid_orders) return -EINVAL; return order; } enum { /* mark page accessed */ FOLL_TOUCH = 1 << 16, /* a retry, previous pass started an IO */ FOLL_TRIED = 1 << 17, /* we are working on non-current tsk/mm */ FOLL_REMOTE = 1 << 18, /* pages must be released via unpin_user_page */ FOLL_PIN = 1 << 19, /* gup_fast: prevent fall-back to slow gup */ FOLL_FAST_ONLY = 1 << 20, /* allow unlocking the mmap lock */ FOLL_UNLOCKABLE = 1 << 21, /* VMA lookup+checks compatible with MADV_POPULATE_(READ|WRITE) */ FOLL_MADV_POPULATE = 1 << 22, }; #define INTERNAL_GUP_FLAGS (FOLL_TOUCH | FOLL_TRIED | FOLL_REMOTE | FOLL_PIN | \ FOLL_FAST_ONLY | FOLL_UNLOCKABLE | \ FOLL_MADV_POPULATE) /* * Indicates for which pages that are write-protected in the page table, * whether GUP has to trigger unsharing via FAULT_FLAG_UNSHARE such that the * GUP pin will remain consistent with the pages mapped into the page tables * of the MM. * * Temporary unmapping of PageAnonExclusive() pages or clearing of * PageAnonExclusive() has to protect against concurrent GUP: * * Ordinary GUP: Using the PT lock * * GUP-fast and fork(): mm->write_protect_seq * * GUP-fast and KSM or temporary unmapping (swap, migration): see * folio_try_share_anon_rmap_*() * * Must be called with the (sub)page that's actually referenced via the * page table entry, which might not necessarily be the head page for a * PTE-mapped THP. * * If the vma is NULL, we're coming from the GUP-fast path and might have * to fallback to the slow path just to lookup the vma. 
*/ static inline bool gup_must_unshare(struct vm_area_struct *vma, unsigned int flags, struct page *page) { /* * FOLL_WRITE is implicitly handled correctly as the page table entry * has to be writable -- and if it references (part of) an anonymous * folio, that part is required to be marked exclusive. */ if ((flags & (FOLL_WRITE | FOLL_PIN)) != FOLL_PIN) return false; /* * Note: PageAnon(page) is stable until the page is actually getting * freed. */ if (!PageAnon(page)) { /* * We only care about R/O long-term pining: R/O short-term * pinning does not have the semantics to observe successive * changes through the process page tables. */ if (!(flags & FOLL_LONGTERM)) return false; /* We really need the vma ... */ if (!vma) return true; /* * ... because we only care about writable private ("COW") * mappings where we have to break COW early. */ return is_cow_mapping(vma->vm_flags); } /* Paired with a memory barrier in folio_try_share_anon_rmap_*(). */ if (IS_ENABLED(CONFIG_HAVE_GUP_FAST)) smp_rmb(); /* * Note that KSM pages cannot be exclusive, and consequently, * cannot get pinned. */ return !PageAnonExclusive(page); } extern bool mirrored_kernelcore; bool memblock_has_mirror(void); void memblock_free_all(void); static __always_inline void vma_set_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, pgoff_t pgoff) { vma->vm_start = start; vma->vm_end = end; vma->vm_pgoff = pgoff; } static inline bool vma_soft_dirty_enabled(struct vm_area_struct *vma) { /* * NOTE: we must check this before VM_SOFTDIRTY on soft-dirty * enablements, because when without soft-dirty being compiled in, * VM_SOFTDIRTY is defined as 0x0, then !(vm_flags & VM_SOFTDIRTY) * will be constantly true. */ if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY)) return false; /* * Soft-dirty is kind of special: its tracking is enabled when the * vma flags not set. 
/*
 * Shrinker debugfs helpers. With CONFIG_SHRINKER_DEBUG the name helpers are
 * implemented inline and the debugfs add/detach/remove functions are declared
 * extern. Without it every helper collapses into a stub.
 */
#ifdef CONFIG_SHRINKER_DEBUG
/*
 * Format the shrinker's name from @fmt/@ap via kvasprintf_const() and store
 * it in shrinker->name. Returns 0 on success, -ENOMEM if no name was produced.
 */
static inline __printf(2, 0) int shrinker_debugfs_name_alloc(
			struct shrinker *shrinker, const char *fmt, va_list ap)
{
	shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap);

	return shrinker->name ? 0 : -ENOMEM;
}

/* Release the stored name and clear the pointer so it cannot be reused. */
static inline void shrinker_debugfs_name_free(struct shrinker *shrinker)
{
	kfree_const(shrinker->name);
	shrinker->name = NULL;
}

extern int shrinker_debugfs_add(struct shrinker *shrinker);
extern struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker,
					      int *debugfs_id);
extern void shrinker_debugfs_remove(struct dentry *debugfs_entry,
				    int debugfs_id);
#else /* CONFIG_SHRINKER_DEBUG */
/* Stub: nothing to register, reports success. */
static inline int shrinker_debugfs_add(struct shrinker *shrinker)
{
	return 0;
}
/* Stub: no name is stored, reports success. */
static inline int shrinker_debugfs_name_alloc(struct shrinker *shrinker,
					      const char *fmt, va_list ap)
{
	return 0;
}
/* Stub: nothing was allocated, nothing to free. */
static inline void shrinker_debugfs_name_free(struct shrinker *shrinker)
{
}
/* Stub: writes -1 to *@debugfs_id and returns NULL. */
static inline struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker,
						     int *debugfs_id)
{
	*debugfs_id = -1;
	return NULL;
}
/* Stub: nothing to remove. */
static inline void shrinker_debugfs_remove(struct dentry *debugfs_entry,
					   int debugfs_id)
{
}
#endif /* CONFIG_SHRINKER_DEBUG */
/* pt_reclaim.c */
bool try_get_and_clear_pmd(struct mm_struct *mm, pmd_t *pmd, pmd_t *pmdval);
void free_pte(struct mm_struct *mm, unsigned long addr, struct mmu_gather *tlb,
	      pmd_t pmdval);
void try_to_free_pte(struct mm_struct *mm, pmd_t *pmd, unsigned long addr,
		     struct mmu_gather *tlb);

/*
 * reclaim_pt_is_enabled() is a real function only with CONFIG_PT_RECLAIM;
 * otherwise the inline stub below ignores its arguments and returns false.
 */
#ifdef CONFIG_PT_RECLAIM
bool reclaim_pt_is_enabled(unsigned long start, unsigned long end,
			   struct zap_details *details);
#else
static inline bool reclaim_pt_is_enabled(unsigned long start, unsigned long end,
					 struct zap_details *details)
{
	return false;
}
#endif /* CONFIG_PT_RECLAIM */
2 2 1 26 26 2 21 21 2 10 2 20 12 9 6 3 3 30 28 28 2 2 5 5 5 5 1 1 1 10 2 11 65 65 220 220 86 86 86 86 86 86 65 65 65 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 
498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 
998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2011 Instituto Nokia de Tecnologia * * Authors: * Lauro Ramos Venancio <lauro.venancio@openbossa.org> * Aloisio Almeida Jr <aloisio.almeida@openbossa.org> */ #define pr_fmt(fmt) KBUILD_MODNAME ": %s: " fmt, __func__ #include <linux/init.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/rfkill.h> #include <linux/nfc.h> #include <net/genetlink.h> #include "nfc.h" #define VERSION "0.1" #define NFC_CHECK_PRES_FREQ_MS 2000 int nfc_devlist_generation; DEFINE_MUTEX(nfc_devlist_mutex); /* NFC device ID bitmap */ static DEFINE_IDA(nfc_index_ida); int nfc_fw_download(struct nfc_dev *dev, const char *firmware_name) { int rc = 0; pr_debug("%s 
/**
 * nfc_fw_download_done - inform that a firmware download was completed
 *
 * @dev: The nfc device to which firmware was downloaded
 * @firmware_name: The firmware filename
 * @result: The positive value of a standard errno value
 *
 * Clears the device's fw_download_in_progress flag and forwards the outcome
 * to nfc_genl_fw_download_done().
 *
 * Return: the value returned by nfc_genl_fw_download_done().
 */
int nfc_fw_download_done(struct nfc_dev *dev, const char *firmware_name,
			 u32 result)
{
	/*
	 * NOTE(review): the flag is cleared here without taking device_lock(),
	 * unlike the places that test it -- confirm callers serialise this.
	 */
	dev->fw_download_in_progress = false;

	return nfc_genl_fw_download_done(dev, firmware_name, result);
}
EXPORT_SYMBOL(nfc_fw_download_done);
*/ int nfc_dev_up(struct nfc_dev *dev) { int rc = 0; pr_debug("dev_name=%s\n", dev_name(&dev->dev)); device_lock(&dev->dev); if (dev->shutting_down) { rc = -ENODEV; goto error; } if (dev->rfkill && rfkill_blocked(dev->rfkill)) { rc = -ERFKILL; goto error; } if (dev->fw_download_in_progress) { rc = -EBUSY; goto error; } if (dev->dev_up) { rc = -EALREADY; goto error; } if (dev->ops->dev_up) rc = dev->ops->dev_up(dev); if (!rc) dev->dev_up = true; /* We have to enable the device before discovering SEs */ if (dev->ops->discover_se && dev->ops->discover_se(dev)) pr_err("SE discovery failed\n"); error: device_unlock(&dev->dev); return rc; } /** * nfc_dev_down - turn off the NFC device * * @dev: The nfc device to be turned off */ int nfc_dev_down(struct nfc_dev *dev) { int rc = 0; pr_debug("dev_name=%s\n", dev_name(&dev->dev)); device_lock(&dev->dev); if (dev->shutting_down) { rc = -ENODEV; goto error; } if (!dev->dev_up) { rc = -EALREADY; goto error; } if (dev->polling || dev->active_target) { rc = -EBUSY; goto error; } if (dev->ops->dev_down) dev->ops->dev_down(dev); dev->dev_up = false; error: device_unlock(&dev->dev); return rc; } static int nfc_rfkill_set_block(void *data, bool blocked) { struct nfc_dev *dev = data; pr_debug("%s blocked %d", dev_name(&dev->dev), blocked); if (!blocked) return 0; nfc_dev_down(dev); return 0; } static const struct rfkill_ops nfc_rfkill_ops = { .set_block = nfc_rfkill_set_block, }; /** * nfc_start_poll - start polling for nfc targets * * @dev: The nfc device that must start polling * @im_protocols: bitset of nfc initiator protocols to be used for polling * @tm_protocols: bitset of nfc transport protocols to be used for polling * * The device remains polling for targets until a target is found or * the nfc_stop_poll function is called. 
*/ int nfc_start_poll(struct nfc_dev *dev, u32 im_protocols, u32 tm_protocols) { int rc; pr_debug("dev_name %s initiator protocols 0x%x target protocols 0x%x\n", dev_name(&dev->dev), im_protocols, tm_protocols); if (!im_protocols && !tm_protocols) return -EINVAL; device_lock(&dev->dev); if (dev->shutting_down) { rc = -ENODEV; goto error; } if (!dev->dev_up) { rc = -ENODEV; goto error; } if (dev->polling) { rc = -EBUSY; goto error; } rc = dev->ops->start_poll(dev, im_protocols, tm_protocols); if (!rc) { dev->polling = true; dev->rf_mode = NFC_RF_NONE; } error: device_unlock(&dev->dev); return rc; } /** * nfc_stop_poll - stop polling for nfc targets * * @dev: The nfc device that must stop polling */ int nfc_stop_poll(struct nfc_dev *dev) { int rc = 0; pr_debug("dev_name=%s\n", dev_name(&dev->dev)); device_lock(&dev->dev); if (dev->shutting_down) { rc = -ENODEV; goto error; } if (!dev->polling) { rc = -EINVAL; goto error; } dev->ops->stop_poll(dev); dev->polling = false; dev->rf_mode = NFC_RF_NONE; error: device_unlock(&dev->dev); return rc; } static struct nfc_target *nfc_find_target(struct nfc_dev *dev, u32 target_idx) { int i; for (i = 0; i < dev->n_targets; i++) { if (dev->targets[i].idx == target_idx) return &dev->targets[i]; } return NULL; } int nfc_dep_link_up(struct nfc_dev *dev, int target_index, u8 comm_mode) { int rc = 0; u8 *gb; size_t gb_len; struct nfc_target *target; pr_debug("dev_name=%s comm %d\n", dev_name(&dev->dev), comm_mode); if (!dev->ops->dep_link_up) return -EOPNOTSUPP; device_lock(&dev->dev); if (dev->shutting_down) { rc = -ENODEV; goto error; } if (dev->dep_link_up == true) { rc = -EALREADY; goto error; } gb = nfc_llcp_general_bytes(dev, &gb_len); if (gb_len > NFC_MAX_GT_LEN) { rc = -EINVAL; goto error; } target = nfc_find_target(dev, target_index); if (target == NULL) { rc = -ENOTCONN; goto error; } rc = dev->ops->dep_link_up(dev, target, comm_mode, gb, gb_len); if (!rc) { dev->active_target = target; dev->rf_mode = NFC_RF_INITIATOR; } 
error: device_unlock(&dev->dev); return rc; } int nfc_dep_link_down(struct nfc_dev *dev) { int rc = 0; pr_debug("dev_name=%s\n", dev_name(&dev->dev)); if (!dev->ops->dep_link_down) return -EOPNOTSUPP; device_lock(&dev->dev); if (dev->shutting_down) { rc = -ENODEV; goto error; } if (dev->dep_link_up == false) { rc = -EALREADY; goto error; } rc = dev->ops->dep_link_down(dev); if (!rc) { dev->dep_link_up = false; dev->active_target = NULL; dev->rf_mode = NFC_RF_NONE; nfc_llcp_mac_is_down(dev); nfc_genl_dep_link_down_event(dev); } error: device_unlock(&dev->dev); return rc; } int nfc_dep_link_is_up(struct nfc_dev *dev, u32 target_idx, u8 comm_mode, u8 rf_mode) { dev->dep_link_up = true; if (!dev->active_target && rf_mode == NFC_RF_INITIATOR) { struct nfc_target *target; target = nfc_find_target(dev, target_idx); if (target == NULL) return -ENOTCONN; dev->active_target = target; } dev->polling = false; dev->rf_mode = rf_mode; nfc_llcp_mac_is_up(dev, target_idx, comm_mode, rf_mode); return nfc_genl_dep_link_up_event(dev, target_idx, comm_mode, rf_mode); } EXPORT_SYMBOL(nfc_dep_link_is_up); /** * nfc_activate_target - prepare the target for data exchange * * @dev: The nfc device that found the target * @target_idx: index of the target that must be activated * @protocol: nfc protocol that will be used for data exchange */ int nfc_activate_target(struct nfc_dev *dev, u32 target_idx, u32 protocol) { int rc; struct nfc_target *target; pr_debug("dev_name=%s target_idx=%u protocol=%u\n", dev_name(&dev->dev), target_idx, protocol); device_lock(&dev->dev); if (dev->shutting_down) { rc = -ENODEV; goto error; } if (dev->active_target) { rc = -EBUSY; goto error; } target = nfc_find_target(dev, target_idx); if (target == NULL) { rc = -ENOTCONN; goto error; } rc = dev->ops->activate_target(dev, target, protocol); if (!rc) { dev->active_target = target; dev->rf_mode = NFC_RF_INITIATOR; if (dev->ops->check_presence && !dev->shutting_down) mod_timer(&dev->check_pres_timer, jiffies + 
msecs_to_jiffies(NFC_CHECK_PRES_FREQ_MS)); } error: device_unlock(&dev->dev); return rc; } /** * nfc_deactivate_target - deactivate a nfc target * * @dev: The nfc device that found the target * @target_idx: index of the target that must be deactivated * @mode: idle or sleep? */ int nfc_deactivate_target(struct nfc_dev *dev, u32 target_idx, u8 mode) { int rc = 0; pr_debug("dev_name=%s target_idx=%u\n", dev_name(&dev->dev), target_idx); device_lock(&dev->dev); if (dev->shutting_down) { rc = -ENODEV; goto error; } if (dev->active_target == NULL) { rc = -ENOTCONN; goto error; } if (dev->active_target->idx != target_idx) { rc = -ENOTCONN; goto error; } if (dev->ops->check_presence) timer_delete_sync(&dev->check_pres_timer); dev->ops->deactivate_target(dev, dev->active_target, mode); dev->active_target = NULL; error: device_unlock(&dev->dev); return rc; } /** * nfc_data_exchange - transceive data * * @dev: The nfc device that found the target * @target_idx: index of the target * @skb: data to be sent * @cb: callback called when the response is received * @cb_context: parameter for the callback function * * The user must wait for the callback before calling this function again. 
*/ int nfc_data_exchange(struct nfc_dev *dev, u32 target_idx, struct sk_buff *skb, data_exchange_cb_t cb, void *cb_context) { int rc; pr_debug("dev_name=%s target_idx=%u skb->len=%u\n", dev_name(&dev->dev), target_idx, skb->len); device_lock(&dev->dev); if (dev->shutting_down) { rc = -ENODEV; kfree_skb(skb); goto error; } if (dev->rf_mode == NFC_RF_INITIATOR && dev->active_target != NULL) { if (dev->active_target->idx != target_idx) { rc = -EADDRNOTAVAIL; kfree_skb(skb); goto error; } if (dev->ops->check_presence) timer_delete_sync(&dev->check_pres_timer); rc = dev->ops->im_transceive(dev, dev->active_target, skb, cb, cb_context); if (!rc && dev->ops->check_presence && !dev->shutting_down) mod_timer(&dev->check_pres_timer, jiffies + msecs_to_jiffies(NFC_CHECK_PRES_FREQ_MS)); } else if (dev->rf_mode == NFC_RF_TARGET && dev->ops->tm_send != NULL) { rc = dev->ops->tm_send(dev, skb); } else { rc = -ENOTCONN; kfree_skb(skb); goto error; } error: device_unlock(&dev->dev); return rc; } struct nfc_se *nfc_find_se(struct nfc_dev *dev, u32 se_idx) { struct nfc_se *se; list_for_each_entry(se, &dev->secure_elements, list) if (se->idx == se_idx) return se; return NULL; } EXPORT_SYMBOL(nfc_find_se); int nfc_enable_se(struct nfc_dev *dev, u32 se_idx) { struct nfc_se *se; int rc; pr_debug("%s se index %d\n", dev_name(&dev->dev), se_idx); device_lock(&dev->dev); if (dev->shutting_down) { rc = -ENODEV; goto error; } if (!dev->dev_up) { rc = -ENODEV; goto error; } if (dev->polling) { rc = -EBUSY; goto error; } if (!dev->ops->enable_se || !dev->ops->disable_se) { rc = -EOPNOTSUPP; goto error; } se = nfc_find_se(dev, se_idx); if (!se) { rc = -EINVAL; goto error; } if (se->state == NFC_SE_ENABLED) { rc = -EALREADY; goto error; } rc = dev->ops->enable_se(dev, se_idx); if (rc >= 0) se->state = NFC_SE_ENABLED; error: device_unlock(&dev->dev); return rc; } int nfc_disable_se(struct nfc_dev *dev, u32 se_idx) { struct nfc_se *se; int rc; pr_debug("%s se index %d\n", dev_name(&dev->dev), 
se_idx); device_lock(&dev->dev); if (dev->shutting_down) { rc = -ENODEV; goto error; } if (!dev->dev_up) { rc = -ENODEV; goto error; } if (!dev->ops->enable_se || !dev->ops->disable_se) { rc = -EOPNOTSUPP; goto error; } se = nfc_find_se(dev, se_idx); if (!se) { rc = -EINVAL; goto error; } if (se->state == NFC_SE_DISABLED) { rc = -EALREADY; goto error; } rc = dev->ops->disable_se(dev, se_idx); if (rc >= 0) se->state = NFC_SE_DISABLED; error: device_unlock(&dev->dev); return rc; } int nfc_set_remote_general_bytes(struct nfc_dev *dev, const u8 *gb, u8 gb_len) { pr_debug("dev_name=%s gb_len=%d\n", dev_name(&dev->dev), gb_len); return nfc_llcp_set_remote_gb(dev, gb, gb_len); } EXPORT_SYMBOL(nfc_set_remote_general_bytes); u8 *nfc_get_local_general_bytes(struct nfc_dev *dev, size_t *gb_len) { pr_debug("dev_name=%s\n", dev_name(&dev->dev)); return nfc_llcp_general_bytes(dev, gb_len); } EXPORT_SYMBOL(nfc_get_local_general_bytes); int nfc_tm_data_received(struct nfc_dev *dev, struct sk_buff *skb) { /* Only LLCP target mode for now */ if (dev->dep_link_up == false) { kfree_skb(skb); return -ENOLINK; } return nfc_llcp_data_received(dev, skb); } EXPORT_SYMBOL(nfc_tm_data_received); int nfc_tm_activated(struct nfc_dev *dev, u32 protocol, u8 comm_mode, const u8 *gb, size_t gb_len) { int rc; device_lock(&dev->dev); dev->polling = false; if (gb != NULL) { rc = nfc_set_remote_general_bytes(dev, gb, gb_len); if (rc < 0) goto out; } dev->rf_mode = NFC_RF_TARGET; if (protocol == NFC_PROTO_NFC_DEP_MASK) nfc_dep_link_is_up(dev, 0, comm_mode, NFC_RF_TARGET); rc = nfc_genl_tm_activated(dev, protocol); out: device_unlock(&dev->dev); return rc; } EXPORT_SYMBOL(nfc_tm_activated); int nfc_tm_deactivated(struct nfc_dev *dev) { dev->dep_link_up = false; dev->rf_mode = NFC_RF_NONE; return nfc_genl_tm_deactivated(dev); } EXPORT_SYMBOL(nfc_tm_deactivated); /** * nfc_alloc_send_skb - allocate a skb for data exchange responses * * @dev: device sending the response * @sk: socket sending the response * 
@flags: MSG_DONTWAIT flag
 * @size: size to allocate
 * @err: pointer to memory to store the error code
 *
 * The buffer is sized for @size plus the device tx headroom, tx tailroom and
 * NFC_HEADER_SIZE, and the headroom plus header size is reserved at the
 * front. Only the MSG_DONTWAIT bit of @flags is forwarded to
 * sock_alloc_send_skb().
 *
 * Returns the new skb, or NULL when sock_alloc_send_skb() returns NULL.
 */
struct sk_buff *nfc_alloc_send_skb(struct nfc_dev *dev, struct sock *sk,
				   unsigned int flags, unsigned int size,
				   unsigned int *err)
{
	struct sk_buff *skb;
	unsigned int total_size;

	total_size = size +
		dev->tx_headroom + dev->tx_tailroom + NFC_HEADER_SIZE;

	skb = sock_alloc_send_skb(sk, total_size, flags & MSG_DONTWAIT, err);
	if (skb)
		skb_reserve(skb, dev->tx_headroom + NFC_HEADER_SIZE);

	return skb;
}

/**
 * nfc_alloc_recv_skb - allocate a skb for received data
 *
 * @size: size to allocate
 * @gfp: gfp flags
 *
 * Allocates @size + 1 bytes and reserves the first byte as headroom.
 * Returns the new skb, or NULL when alloc_skb() fails.
 */
struct sk_buff *nfc_alloc_recv_skb(unsigned int size, gfp_t gfp)
{
	struct sk_buff *skb;
	unsigned int total_size;

	/* One extra byte, reserved below as headroom. */
	total_size = size + 1;
	skb = alloc_skb(total_size, gfp);

	if (skb)
		skb_reserve(skb, 1);

	return skb;
}
EXPORT_SYMBOL(nfc_alloc_recv_skb);

/**
 * nfc_targets_found - inform that targets were found
 *
 * @dev: The nfc device that found the targets
 * @targets: array of nfc targets found
 * @n_targets: targets array size
 *
 * The device driver must call this function when one or many nfc targets
 * are found. After calling this function, the device driver must stop
 * polling for targets.
 * NOTE: This function can be called with targets=NULL and n_targets=0 to
 * notify a driver error, meaning that the polling operation cannot complete.
 * IMPORTANT: this function must not be called from an atomic context.
 * In addition, it must also not be called from a context that would prevent
 * the NFC Core from calling other nfc ops entry points concurrently.
*/
int nfc_targets_found(struct nfc_dev *dev,
		      struct nfc_target *targets, int n_targets)
{
	int i;

	pr_debug("dev_name=%s n_targets=%d\n", dev_name(&dev->dev), n_targets);

	/*
	 * Indices are handed out before the device lock is taken and before
	 * the polling check, so target_next_idx advances even when the call
	 * is ignored below.
	 * NOTE(review): @targets is dereferenced here whenever n_targets > 0;
	 * the NULL test only happens further down.
	 */
	for (i = 0; i < n_targets; i++)
		targets[i].idx = dev->target_next_idx++;

	device_lock(&dev->dev);

	/* Not polling: nothing is recorded and no event is sent. */
	if (dev->polling == false) {
		device_unlock(&dev->dev);
		return 0;
	}

	dev->polling = false;

	dev->targets_generation++;

	/* Drop the previous target list before installing the new copy. */
	kfree(dev->targets);
	dev->targets = NULL;

	if (targets) {
		dev->targets = kmemdup(targets,
				       n_targets * sizeof(struct nfc_target),
				       GFP_ATOMIC);

		if (!dev->targets) {
			dev->n_targets = 0;
			device_unlock(&dev->dev);
			return -ENOMEM;
		}
	}

	dev->n_targets = n_targets;
	device_unlock(&dev->dev);

	/* The netlink notification is sent after the lock has been released. */
	nfc_genl_targets_found(dev);

	return 0;
}
EXPORT_SYMBOL(nfc_targets_found);

/**
 * nfc_target_lost - inform that an activated target went out of field
 *
 * @dev: The nfc device that had the activated target in field
 * @target_idx: the nfc index of the target
 *
 * The device driver must call this function when the activated target
 * goes out of the field.
 * IMPORTANT: this function must not be called from an atomic context.
 * In addition, it must also not be called from a context that would prevent
 * the NFC Core from calling other nfc ops entry points concurrently.
*/ int nfc_target_lost(struct nfc_dev *dev, u32 target_idx) { const struct nfc_target *tg; int i; pr_debug("dev_name %s n_target %d\n", dev_name(&dev->dev), target_idx); device_lock(&dev->dev); for (i = 0; i < dev->n_targets; i++) { tg = &dev->targets[i]; if (tg->idx == target_idx) break; } if (i == dev->n_targets) { device_unlock(&dev->dev); return -EINVAL; } dev->targets_generation++; dev->n_targets--; dev->active_target = NULL; if (dev->n_targets) { memcpy(&dev->targets[i], &dev->targets[i + 1], (dev->n_targets - i) * sizeof(struct nfc_target)); } else { kfree(dev->targets); dev->targets = NULL; } device_unlock(&dev->dev); nfc_genl_target_lost(dev, target_idx); return 0; } EXPORT_SYMBOL(nfc_target_lost); inline void nfc_driver_failure(struct nfc_dev *dev, int err) { nfc_targets_found(dev, NULL, 0); } EXPORT_SYMBOL(nfc_driver_failure); int nfc_add_se(struct nfc_dev *dev, u32 se_idx, u16 type) { struct nfc_se *se; int rc; pr_debug("%s se index %d\n", dev_name(&dev->dev), se_idx); se = nfc_find_se(dev, se_idx); if (se) return -EALREADY; se = kzalloc(sizeof(struct nfc_se), GFP_KERNEL); if (!se) return -ENOMEM; se->idx = se_idx; se->type = type; se->state = NFC_SE_DISABLED; INIT_LIST_HEAD(&se->list); list_add(&se->list, &dev->secure_elements); rc = nfc_genl_se_added(dev, se_idx, type); if (rc < 0) { list_del(&se->list); kfree(se); return rc; } return 0; } EXPORT_SYMBOL(nfc_add_se); int nfc_remove_se(struct nfc_dev *dev, u32 se_idx) { struct nfc_se *se, *n; int rc; pr_debug("%s se index %d\n", dev_name(&dev->dev), se_idx); list_for_each_entry_safe(se, n, &dev->secure_elements, list) if (se->idx == se_idx) { rc = nfc_genl_se_removed(dev, se_idx); if (rc < 0) return rc; list_del(&se->list); kfree(se); return 0; } return -EINVAL; } EXPORT_SYMBOL(nfc_remove_se); int nfc_se_transaction(struct nfc_dev *dev, u8 se_idx, struct nfc_evt_transaction *evt_transaction) { int rc; pr_debug("transaction: %x\n", se_idx); device_lock(&dev->dev); if (!evt_transaction) { rc = -EPROTO; 
goto out; } rc = nfc_genl_se_transaction(dev, se_idx, evt_transaction); out: device_unlock(&dev->dev); return rc; } EXPORT_SYMBOL(nfc_se_transaction); int nfc_se_connectivity(struct nfc_dev *dev, u8 se_idx) { int rc; pr_debug("connectivity: %x\n", se_idx); device_lock(&dev->dev); rc = nfc_genl_se_connectivity(dev, se_idx); device_unlock(&dev->dev); return rc; } EXPORT_SYMBOL(nfc_se_connectivity); static void nfc_release(struct device *d) { struct nfc_dev *dev = to_nfc_dev(d); struct nfc_se *se, *n; pr_debug("dev_name=%s\n", dev_name(&dev->dev)); nfc_genl_data_exit(&dev->genl_data); kfree(dev->targets); list_for_each_entry_safe(se, n, &dev->secure_elements, list) { nfc_genl_se_removed(dev, se->idx); list_del(&se->list); kfree(se); } ida_free(&nfc_index_ida, dev->idx); kfree(dev); } static void nfc_check_pres_work(struct work_struct *work) { struct nfc_dev *dev = container_of(work, struct nfc_dev, check_pres_work); int rc; device_lock(&dev->dev); if (dev->active_target && timer_pending(&dev->check_pres_timer) == 0) { rc = dev->ops->check_presence(dev, dev->active_target); if (rc == -EOPNOTSUPP) goto exit; if (rc) { u32 active_target_idx = dev->active_target->idx; device_unlock(&dev->dev); nfc_target_lost(dev, active_target_idx); return; } if (!dev->shutting_down) mod_timer(&dev->check_pres_timer, jiffies + msecs_to_jiffies(NFC_CHECK_PRES_FREQ_MS)); } exit: device_unlock(&dev->dev); } static void nfc_check_pres_timeout(struct timer_list *t) { struct nfc_dev *dev = from_timer(dev, t, check_pres_timer); schedule_work(&dev->check_pres_work); } const struct class nfc_class = { .name = "nfc", .dev_release = nfc_release, }; EXPORT_SYMBOL(nfc_class); static int match_idx(struct device *d, const void *data) { struct nfc_dev *dev = to_nfc_dev(d); const unsigned int *idx = data; return dev->idx == *idx; } struct nfc_dev *nfc_get_device(unsigned int idx) { struct device *d; d = class_find_device(&nfc_class, NULL, &idx, match_idx); if (!d) return NULL; return to_nfc_dev(d); } /** 
* nfc_allocate_device - allocate a new nfc device
 *
 * @ops: device operations
 * @supported_protocols: NFC protocols supported by the device
 * @tx_headroom: reserved space at beginning of skb
 * @tx_tailroom: reserved space at end of skb
 *
 * Returns the new device, or NULL when a mandatory operation is missing from
 * @ops, @supported_protocols is 0, or an allocation fails.
 */
struct nfc_dev *nfc_allocate_device(const struct nfc_ops *ops,
				    u32 supported_protocols,
				    int tx_headroom, int tx_tailroom)
{
	struct nfc_dev *dev;
	int rc;

	/* These five callbacks are mandatory. */
	if (!ops->start_poll || !ops->stop_poll || !ops->activate_target ||
	    !ops->deactivate_target || !ops->im_transceive)
		return NULL;

	if (!supported_protocols)
		return NULL;

	dev = kzalloc(sizeof(struct nfc_dev), GFP_KERNEL);
	if (!dev)
		return NULL;

	/* The allocated id becomes both dev->idx and the "nfc%d" name suffix. */
	rc = ida_alloc(&nfc_index_ida, GFP_KERNEL);
	if (rc < 0)
		goto err_free_dev;
	dev->idx = rc;

	dev->dev.class = &nfc_class;
	dev_set_name(&dev->dev, "nfc%d", dev->idx);
	device_initialize(&dev->dev);

	dev->ops = ops;
	dev->supported_protocols = supported_protocols;
	dev->tx_headroom = tx_headroom;
	dev->tx_tailroom = tx_tailroom;
	INIT_LIST_HEAD(&dev->secure_elements);

	nfc_genl_data_init(&dev->genl_data);

	dev->rf_mode = NFC_RF_NONE;

	/* first generation must not be 0 */
	dev->targets_generation = 1;

	/* Presence-check timer and work exist only when the driver supports it. */
	if (ops->check_presence) {
		timer_setup(&dev->check_pres_timer, nfc_check_pres_timeout, 0);
		INIT_WORK(&dev->check_pres_work, nfc_check_pres_work);
	}

	return dev;

err_free_dev:
	kfree(dev);

	return NULL;
}
EXPORT_SYMBOL(nfc_allocate_device);

/**
 * nfc_register_device - register a nfc device in the nfc subsystem
 *
 * @dev: The nfc device to register
 *
 * Returns the negative result of device_add() on failure, 0 otherwise.
 * Failures of LLCP registration, rfkill registration and the netlink
 * notification are logged or ignored and do not fail the registration.
 */
int nfc_register_device(struct nfc_dev *dev)
{
	int rc;

	pr_debug("dev_name=%s\n", dev_name(&dev->dev));

	/* The device list generation is bumped together with device_add(). */
	mutex_lock(&nfc_devlist_mutex);
	nfc_devlist_generation++;
	rc = device_add(&dev->dev);
	mutex_unlock(&nfc_devlist_mutex);

	if (rc < 0)
		return rc;

	rc = nfc_llcp_register_device(dev);
	if (rc)
		pr_err("Could not register llcp device\n");

	device_lock(&dev->dev);
	dev->rfkill = rfkill_alloc(dev_name(&dev->dev), &dev->dev,
				   RFKILL_TYPE_NFC, &nfc_rfkill_ops, dev);
	if (dev->rfkill) {
		/* rfkill is optional: on failure just run without it. */
		if (rfkill_register(dev->rfkill) < 0) {
			rfkill_destroy(dev->rfkill);
			dev->rfkill = NULL;
		}
	}
	dev->shutting_down = false;
	device_unlock(&dev->dev);

	rc = nfc_genl_device_added(dev);
	if (rc)
		pr_debug("The userspace won't be notified that the device %s was added\n",
			 dev_name(&dev->dev));

	return 0;
}
EXPORT_SYMBOL(nfc_register_device);

/**
 * nfc_unregister_device - unregister a nfc device in the nfc subsystem
 *
 * @dev: The nfc device to unregister
 *
 * Sends the removal notification, tears down rfkill and sets
 * dev->shutting_down under the device lock, stops the presence-check timer
 * and work, unregisters LLCP and finally removes the device.
 */
void nfc_unregister_device(struct nfc_dev *dev)
{
	int rc;

	pr_debug("dev_name=%s\n", dev_name(&dev->dev));

	rc = nfc_genl_device_removed(dev);
	if (rc)
		pr_debug("The userspace won't be notified that the device %s "
			 "was removed\n", dev_name(&dev->dev));

	device_lock(&dev->dev);
	if (dev->rfkill) {
		rfkill_unregister(dev->rfkill);
		rfkill_destroy(dev->rfkill);
		dev->rfkill = NULL;
	}
	/* nfc_check_pres_work() tests this flag before re-arming the timer. */
	dev->shutting_down = true;
	device_unlock(&dev->dev);

	if (dev->ops->check_presence) {
		timer_delete_sync(&dev->check_pres_timer);
		cancel_work_sync(&dev->check_pres_work);
	}

	nfc_llcp_unregister_device(dev);

	mutex_lock(&nfc_devlist_mutex);
	nfc_devlist_generation++;
	device_del(&dev->dev);
	mutex_unlock(&nfc_devlist_mutex);
}
EXPORT_SYMBOL(nfc_unregister_device);

/*
 * Module init: registers the class, then brings up generic netlink, raw
 * sockets, LLCP and the address family in that order. A failure undoes the
 * steps already completed, in reverse order, and returns the error.
 */
static int __init nfc_init(void)
{
	int rc;

	pr_info("NFC Core ver %s\n", VERSION);

	rc = class_register(&nfc_class);
	if (rc)
		return rc;

	rc = nfc_genl_init();
	if (rc)
		goto err_genl;

	/* the first generation must not be 0 */
	nfc_devlist_generation = 1;

	rc = rawsock_init();
	if (rc)
		goto err_rawsock;

	rc = nfc_llcp_init();
	if (rc)
		goto err_llcp_sock;

	rc = af_nfc_init();
	if (rc)
		goto err_af_nfc;

	return 0;

err_af_nfc:
	nfc_llcp_exit();
err_llcp_sock:
	rawsock_exit();
err_rawsock:
	nfc_genl_exit();
err_genl:
	class_unregister(&nfc_class);
	return rc;
}

/* Module exit: tears everything down in the reverse order of nfc_init(). */
static void __exit nfc_exit(void)
{
	af_nfc_exit();
	nfc_llcp_exit();
	rawsock_exit();
	nfc_genl_exit();
	class_unregister(&nfc_class);
}

subsys_initcall(nfc_init);
module_exit(nfc_exit);

MODULE_AUTHOR("Lauro Ramos Venancio <lauro.venancio@openbossa.org>");
MODULE_DESCRIPTION("NFC Core ver " VERSION);
MODULE_VERSION(VERSION); MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(PF_NFC); MODULE_ALIAS_GENL_FAMILY(NFC_GENL_NAME); |
1 4 7 1 2 5 5 5 4 8 4 15 4 5 9 14 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 | // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2020 Cloudflare Ltd https://cloudflare.com */ #include <linux/skmsg.h> #include <net/sock.h> #include <net/udp.h> #include <net/inet_common.h> #include "udp_impl.h" static struct proto *udpv6_prot_saved __read_mostly; static int sk_udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len) { #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) return udpv6_prot_saved->recvmsg(sk, msg, len, flags, addr_len); #endif return udp_prot.recvmsg(sk, msg, len, flags, addr_len); } static bool udp_sk_has_data(struct sock *sk) { return !skb_queue_empty(&udp_sk(sk)->reader_queue) || !skb_queue_empty(&sk->sk_receive_queue); } static bool psock_has_data(struct sk_psock *psock) { return !skb_queue_empty(&psock->ingress_skb) || !sk_psock_queue_empty(psock); } #define udp_msg_has_data(__sk, __psock) \ ({ udp_sk_has_data(__sk) || psock_has_data(__psock); }) static int udp_msg_wait_data(struct sock *sk, struct sk_psock *psock, long timeo) { DEFINE_WAIT_FUNC(wait, woken_wake_function); int ret = 0; if (sk->sk_shutdown & RCV_SHUTDOWN) return 1; if (!timeo) return ret; add_wait_queue(sk_sleep(sk), &wait); sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); ret = udp_msg_has_data(sk, psock); if (!ret) { wait_woken(&wait, TASK_INTERRUPTIBLE, timeo); ret = udp_msg_has_data(sk, psock); } sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); remove_wait_queue(sk_sleep(sk), &wait); 
return ret;
}

/*
 * recvmsg handler installed by udp_bpf_rebuild_protos().
 *
 * MSG_ERRQUEUE requests go to inet_recv_error() and a zero @len returns 0.
 * Without a psock, or when the psock has no data queued, the call falls back
 * to the regular UDP recvmsg via sk_udp_recvmsg(). Otherwise data is read
 * with sk_msg_recvmsg(); if that yields nothing, the function waits (subject
 * to the socket receive timeout) and then either retries the psock, falls
 * back to UDP, or returns -EAGAIN.
 */
static int udp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
			   int flags, int *addr_len)
{
	struct sk_psock *psock;
	int copied, ret;

	if (unlikely(flags & MSG_ERRQUEUE))
		return inet_recv_error(sk, msg, len, addr_len);

	if (!len)
		return 0;

	psock = sk_psock_get(sk);
	if (unlikely(!psock))
		return sk_udp_recvmsg(sk, msg, len, flags, addr_len);

	if (!psock_has_data(psock)) {
		ret = sk_udp_recvmsg(sk, msg, len, flags, addr_len);
		goto out;
	}

msg_bytes_ready:
	copied = sk_msg_recvmsg(sk, psock, msg, len, flags);
	if (!copied) {
		long timeo;
		int data;

		timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
		data = udp_msg_wait_data(sk, psock, timeo);
		if (data) {
			/* Data showed up: prefer the psock, else plain UDP. */
			if (psock_has_data(psock))
				goto msg_bytes_ready;
			ret = sk_udp_recvmsg(sk, msg, len, flags, addr_len);
			goto out;
		}
		copied = -EAGAIN;
	}
	ret = copied;
out:
	/* Pairs with sk_psock_get() above. */
	sk_psock_put(sk, psock);
	return ret;
}

/* Indices into udp_bpf_prots[]. */
enum {
	UDP_BPF_IPV4,
	UDP_BPF_IPV6,
	UDP_BPF_NUM_PROTS,
};

/* Taken in udp_bpf_check_v6_needs_rebuild() around the IPv6 rebuild. */
static DEFINE_SPINLOCK(udpv6_prot_lock);
static struct proto udp_bpf_prots[UDP_BPF_NUM_PROTS];

/*
 * Initialise @prot as a copy of @base with close, recvmsg and
 * sock_is_readable replaced by the sockmap/psock-aware versions.
 */
static void udp_bpf_rebuild_protos(struct proto *prot, const struct proto *base)
{
	*prot        = *base;
	prot->close  = sock_map_close;
	prot->recvmsg = udp_bpf_recvmsg;
	prot->sock_is_readable = sk_msg_is_readable;
}

/*
 * Rebuild the IPv6 entry of udp_bpf_prots[] when @ops differs from the proto
 * saved in udpv6_prot_saved. The pointer is first compared without the lock
 * (acquire load) and compared again with udpv6_prot_lock held before the
 * rebuild; the new pointer is published with a release store.
 */
static void udp_bpf_check_v6_needs_rebuild(struct proto *ops)
{
	if (unlikely(ops != smp_load_acquire(&udpv6_prot_saved))) {
		spin_lock_bh(&udpv6_prot_lock);
		if (likely(ops != udpv6_prot_saved)) {
			udp_bpf_rebuild_protos(&udp_bpf_prots[UDP_BPF_IPV6], ops);
			smp_store_release(&udpv6_prot_saved, ops);
		}
		spin_unlock_bh(&udpv6_prot_lock);
	}
}

/* Builds the IPv4 entry of udp_bpf_prots[] from udp_prot at late_initcall time. */
static int __init udp_bpf_v4_build_proto(void)
{
	udp_bpf_rebuild_protos(&udp_bpf_prots[UDP_BPF_IPV4], &udp_prot);
	return 0;
}
late_initcall(udp_bpf_v4_build_proto);

/*
 * Switch @sk between its saved proto and the matching udp_bpf_prots[] entry.
 *
 * With @restore set, the saved write_space callback and the proto stored in
 * @psock are put back. Otherwise the entry selected by the socket family is
 * installed; for AF_INET6 sockets the IPv6 entry is rebuilt first if needed.
 * Always returns 0.
 */
int udp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore)
{
	int family = sk->sk_family == AF_INET ? UDP_BPF_IPV4 : UDP_BPF_IPV6;

	if (restore) {
		sk->sk_write_space = psock->saved_write_space;
		sock_replace_proto(sk, psock->sk_proto);
		return 0;
	}

	if (sk->sk_family == AF_INET6)
		udp_bpf_check_v6_needs_rebuild(psock->sk_proto);

	sock_replace_proto(sk, &udp_bpf_prots[family]);
	return 0;
}
EXPORT_SYMBOL_GPL(udp_bpf_update_proto);
/* The '|' below is a column separator left by the listing this text was extracted from; kept as found. */
|
38 38 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_DEBUGREG_H #define _ASM_X86_DEBUGREG_H #include <linux/bug.h> #include <linux/percpu.h> #include <uapi/asm/debugreg.h> #include <asm/cpufeature.h> #include <asm/msr.h> DECLARE_PER_CPU(unsigned long, cpu_dr7); #ifndef CONFIG_PARAVIRT_XXL /* * These special macros can be used to get or set a debugging register */ #define get_debugreg(var, register) \ (var) = native_get_debugreg(register) #define set_debugreg(value, register) \ native_set_debugreg(register, value) #endif static __always_inline unsigned long native_get_debugreg(int regno) { unsigned long val = 0; /* Damn you, gcc! */ switch (regno) { case 0: asm("mov %%db0, %0" :"=r" (val)); break; case 1: asm("mov %%db1, %0" :"=r" (val)); break; case 2: asm("mov %%db2, %0" :"=r" (val)); break; case 3: asm("mov %%db3, %0" :"=r" (val)); break; case 6: asm("mov %%db6, %0" :"=r" (val)); break; case 7: /* * Apply __FORCE_ORDER to DR7 reads to forbid re-ordering them * with other code. * * This is needed because a DR7 access can cause a #VC exception * when running under SEV-ES. Taking a #VC exception is not a * safe thing to do just anywhere in the entry code and * re-ordering might place the access into an unsafe location. 
*
		 * This happened in the NMI handler, where the DR7 read was
		 * re-ordered to happen before the call to sev_es_ist_enter(),
		 * causing stack recursion.
		 */
		asm volatile("mov %%db7, %0" : "=r" (val) : __FORCE_ORDER);
		break;
	default:
		BUG();
	}
	return val;
}

/*
 * Write @value to debug register @regno (0-3, 6 or 7). Any other register
 * number hits BUG().
 */
static __always_inline void native_set_debugreg(int regno, unsigned long value)
{
	switch (regno) {
	case 0:
		asm("mov %0, %%db0"	::"r" (value));
		break;
	case 1:
		asm("mov %0, %%db1"	::"r" (value));
		break;
	case 2:
		asm("mov %0, %%db2"	::"r" (value));
		break;
	case 3:
		asm("mov %0, %%db3"	::"r" (value));
		break;
	case 6:
		asm("mov %0, %%db6"	::"r" (value));
		break;
	case 7:
		/*
		 * Apply __FORCE_ORDER to DR7 writes to forbid re-ordering them
		 * with other code.
		 *
		 * While it didn't happen with a DR7 write (see the DR7 read
		 * comment above which explains where it happened), add the
		 * __FORCE_ORDER here too to avoid similar problems in the
		 * future.
		 */
		asm volatile("mov %0, %%db7"	::"r" (value), __FORCE_ORDER);
		break;
	default:
		BUG();
	}
}

/* Clear DR7 first, then the four breakpoint address registers DR0-DR3. */
static inline void hw_breakpoint_disable(void)
{
	/* Zero the control register for HW Breakpoint */
	set_debugreg(0UL, 7);

	/* Zero-out the individual HW breakpoint address registers */
	set_debugreg(0UL, 0);
	set_debugreg(0UL, 1);
	set_debugreg(0UL, 2);
	set_debugreg(0UL, 3);
}

/* True when the per-CPU cpu_dr7 value has any DR_GLOBAL_ENABLE_MASK bit set. */
static __always_inline bool hw_breakpoint_active(void)
{
	return __this_cpu_read(cpu_dr7) & DR_GLOBAL_ENABLE_MASK;
}

extern void hw_breakpoint_restore(void);

/*
 * Disable hardware breakpoints and return the previous DR7 value (with bit
 * 0x400 masked off) so that local_db_restore() can re-enable them.
 *
 * Returns 0 without touching DR7 when X86_FEATURE_HYPERVISOR is set and
 * cpu_dr7 shows no active breakpoint.
 */
static __always_inline unsigned long local_db_save(void)
{
	unsigned long dr7;

	if (static_cpu_has(X86_FEATURE_HYPERVISOR) && !hw_breakpoint_active())
		return 0;

	get_debugreg(dr7, 7);
	dr7 &= ~0x400; /* architecturally set bit */
	if (dr7)
		set_debugreg(0, 7);
	/*
	 * Ensure the compiler doesn't lower the above statements into
	 * the critical section; disabling breakpoints late would not
	 * be good.
	 */
	barrier();

	return dr7;
}

/*
 * Counterpart of local_db_save(): write @dr7 back to DR7 if it is non-zero.
 */
static __always_inline void local_db_restore(unsigned long dr7)
{
	/*
	 * Ensure the compiler doesn't raise this statement into
	 * the critical section; enabling breakpoints early would
	 * not be good.
	 */
	barrier();
	if (dr7)
		set_debugreg(dr7, 7);
}

/* Without CONFIG_CPU_SUP_AMD the address-mask helpers are no-op stubs. */
#ifdef CONFIG_CPU_SUP_AMD
extern void amd_set_dr_addr_mask(unsigned long mask, unsigned int dr);
extern unsigned long amd_get_dr_addr_mask(unsigned int dr);
#else
static inline void amd_set_dr_addr_mask(unsigned long mask, unsigned int dr) { }
static inline unsigned long amd_get_dr_addr_mask(unsigned int dr)
{
	return 0;
}
#endif

/*
 * Read MSR_IA32_DEBUGCTLMSR. Without CONFIG_X86_DEBUGCTLMSR the read is
 * skipped and 0 is returned when boot_cpu_data.x86 is below 6.
 */
static inline unsigned long get_debugctlmsr(void)
{
	unsigned long debugctlmsr = 0;

#ifndef CONFIG_X86_DEBUGCTLMSR
	if (boot_cpu_data.x86 < 6)
		return 0;
#endif
	rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr);

	return debugctlmsr;
}

/*
 * Write MSR_IA32_DEBUGCTLMSR. Without CONFIG_X86_DEBUGCTLMSR the write is
 * skipped when boot_cpu_data.x86 is below 6.
 */
static inline void update_debugctlmsr(unsigned long debugctlmsr)
{
#ifndef CONFIG_X86_DEBUGCTLMSR
	if (boot_cpu_data.x86 < 6)
		return;
#endif
	wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr);
}

#endif /* _ASM_X86_DEBUGREG_H */
/* The '|' below is a column separator left by the listing this text was extracted from; kept as found. */
|
24 24 24 24 24 22 22 22 22 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 | /* * llc_if.c - Defines LLC interface to upper layer * * Copyright (c) 1997 by Procom Technology, Inc. * 2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br> * * This program can be redistributed or modified under the terms of the * GNU General Public License as published by the Free Software Foundation. * This program is distributed without any warranty or implied warranty * of merchantability or fitness for a particular purpose. * * See the GNU General Public License for more details. */ #include <linux/gfp.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/errno.h> #include <net/llc_if.h> #include <net/llc_sap.h> #include <net/llc_s_ev.h> #include <net/llc_conn.h> #include <net/sock.h> #include <net/llc_c_ev.h> #include <net/llc_c_ac.h> #include <net/llc_c_st.h> #include <net/tcp_states.h> /** * llc_build_and_send_pkt - Connection data sending for upper layers. * @sk: connection * @skb: packet to send * * This function is called when upper layer wants to send data using * connection oriented communication mode. During sending data, connection * will be locked and received frames and expired timers will be queued. * Returns 0 for success, -ECONNABORTED when the connection already * closed and -EBUSY when sending data is not permitted in this state or * LLC has send an I pdu with p bit set to 1 and is waiting for it's * response. 
*
 *	This function always consumes a reference to the skb.
 */
int llc_build_and_send_pkt(struct sock *sk, struct sk_buff *skb)
{
	struct llc_conn_state_ev *ev;
	int rc = -ECONNABORTED;
	struct llc_sock *llc = llc_sk(sk);

	/* Connection is in the ADM state: drop the skb, -ECONNABORTED. */
	if (unlikely(llc->state == LLC_CONN_STATE_ADM))
		goto out_free;
	rc = -EBUSY;
	if (unlikely(llc_data_accept_state(llc->state) || /* data_conn_refuse */
		     llc->p_flag)) {
		/* Remember that a data request was refused. */
		llc->failed_data_req = 1;
		goto out_free;
	}
	/* Tag the skb as a DATA request primitive for the state machine. */
	ev = llc_conn_ev(skb);
	ev->type      = LLC_CONN_EV_TYPE_PRIM;
	ev->prim      = LLC_DATA_PRIM;
	ev->prim_type = LLC_PRIM_TYPE_REQ;
	skb->dev      = llc->dev;
	return llc_conn_state_process(sk, skb);

out_free:
	kfree_skb(skb);
	return rc;
}

/**
 *	llc_establish_connection - Called by upper layer to establish a conn
 *	@sk: connection
 *	@lmac: local mac address
 *	@dmac: destination mac address
 *	@dsap: destination sap
 *
 *	Upper layer calls this to establish an LLC connection with a remote
 *	machine. This function packages a proper event and sends it to the
 *	connection component state machine. Success or failure of connection
 *	establishment is reported to the upper layer by calling its confirm
 *	function and passing the proper information.
*/ int llc_establish_connection(struct sock *sk, const u8 *lmac, u8 *dmac, u8 dsap) { int rc = -EISCONN; struct llc_addr laddr, daddr; struct sk_buff *skb; struct llc_sock *llc = llc_sk(sk); struct sock *existing; laddr.lsap = llc->sap->laddr.lsap; daddr.lsap = dsap; memcpy(daddr.mac, dmac, sizeof(daddr.mac)); memcpy(laddr.mac, lmac, sizeof(laddr.mac)); existing = llc_lookup_established(llc->sap, &daddr, &laddr, sock_net(sk)); if (existing) { if (existing->sk_state == TCP_ESTABLISHED) { sk = existing; goto out_put; } else sock_put(existing); } sock_hold(sk); rc = -ENOMEM; skb = alloc_skb(0, GFP_ATOMIC); if (skb) { struct llc_conn_state_ev *ev = llc_conn_ev(skb); ev->type = LLC_CONN_EV_TYPE_PRIM; ev->prim = LLC_CONN_PRIM; ev->prim_type = LLC_PRIM_TYPE_REQ; skb_set_owner_w(skb, sk); rc = llc_conn_state_process(sk, skb); } out_put: sock_put(sk); return rc; } /** * llc_send_disc - Called by upper layer to close a connection * @sk: connection to be closed * * Upper layer calls this when it wants to close an established LLC * connection with a remote machine. This function packages a proper event * and sends it to connection component state machine. Returns 0 for * success, 1 otherwise. */ int llc_send_disc(struct sock *sk) { u16 rc = 1; struct llc_conn_state_ev *ev; struct sk_buff *skb; sock_hold(sk); if (sk->sk_type != SOCK_STREAM || sk->sk_state != TCP_ESTABLISHED || llc_sk(sk)->state == LLC_CONN_STATE_ADM || llc_sk(sk)->state == LLC_CONN_OUT_OF_SVC) goto out; /* * Postpone unassigning the connection from its SAP and returning the * connection until all ACTIONs have been completely executed */ skb = alloc_skb(0, GFP_ATOMIC); if (!skb) goto out; skb_set_owner_w(skb, sk); sk->sk_state = TCP_CLOSING; ev = llc_conn_ev(skb); ev->type = LLC_CONN_EV_TYPE_PRIM; ev->prim = LLC_DISC_PRIM; ev->prim_type = LLC_PRIM_TYPE_REQ; rc = llc_conn_state_process(sk, skb); out: sock_put(sk); return rc; } |
15 15 343 640 640 630 131 93 93 57 8 100 20 20 159 20 200 193 9 134 49 103 135 126 13 50 50 106 106 106 8 20 5 1 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 
498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 | /* SPDX-License-Identifier: GPL-1.0+ */ /* * Bond several ethernet interfaces into a Cisco, running 'Etherchannel'. * * Portions are (c) Copyright 1995 Simon "Guru Aleph-Null" Janes * NCM: Network and Communications Management, Inc. 
* * BUT, I'm the one who modified it for ethernet, so: * (c) Copyright 1999, Thomas Davis, tadavis@lbl.gov * */ #ifndef _NET_BONDING_H #define _NET_BONDING_H #include <linux/timer.h> #include <linux/proc_fs.h> #include <linux/if_bonding.h> #include <linux/cpumask.h> #include <linux/in6.h> #include <linux/netpoll.h> #include <linux/inetdevice.h> #include <linux/etherdevice.h> #include <linux/reciprocal_div.h> #include <linux/if_link.h> #include <net/bond_3ad.h> #include <net/bond_alb.h> #include <net/bond_options.h> #include <net/ipv6.h> #include <net/addrconf.h> #define BOND_MAX_ARP_TARGETS 16 #define BOND_MAX_NS_TARGETS BOND_MAX_ARP_TARGETS #define BOND_DEFAULT_MIIMON 100 #ifndef __long_aligned #define __long_aligned __attribute__((aligned((sizeof(long))))) #endif #define slave_info(bond_dev, slave_dev, fmt, ...) \ netdev_info(bond_dev, "(slave %s): " fmt, (slave_dev)->name, ##__VA_ARGS__) #define slave_warn(bond_dev, slave_dev, fmt, ...) \ netdev_warn(bond_dev, "(slave %s): " fmt, (slave_dev)->name, ##__VA_ARGS__) #define slave_dbg(bond_dev, slave_dev, fmt, ...) \ netdev_dbg(bond_dev, "(slave %s): " fmt, (slave_dev)->name, ##__VA_ARGS__) #define slave_err(bond_dev, slave_dev, fmt, ...) \ netdev_err(bond_dev, "(slave %s): " fmt, (slave_dev)->name, ##__VA_ARGS__) #define BOND_MODE(bond) ((bond)->params.mode) /* slave list primitives */ #define bond_slave_list(bond) (&(bond)->dev->adj_list.lower) #define bond_has_slaves(bond) !list_empty(bond_slave_list(bond)) /* IMPORTANT: bond_first/last_slave can return NULL in case of an empty list */ #define bond_first_slave(bond) \ (bond_has_slaves(bond) ? \ netdev_adjacent_get_private(bond_slave_list(bond)->next) : \ NULL) #define bond_last_slave(bond) \ (bond_has_slaves(bond) ? 
\ netdev_adjacent_get_private(bond_slave_list(bond)->prev) : \ NULL) /* Caller must have rcu_read_lock */ #define bond_first_slave_rcu(bond) \ netdev_lower_get_first_private_rcu(bond->dev) #define bond_is_first_slave(bond, pos) (pos == bond_first_slave(bond)) #define bond_is_last_slave(bond, pos) (pos == bond_last_slave(bond)) /** * bond_for_each_slave - iterate over all slaves * @bond: the bond holding this list * @pos: current slave * @iter: list_head * iterator * * Caller must hold RTNL */ #define bond_for_each_slave(bond, pos, iter) \ netdev_for_each_lower_private((bond)->dev, pos, iter) /* Caller must have rcu_read_lock */ #define bond_for_each_slave_rcu(bond, pos, iter) \ netdev_for_each_lower_private_rcu((bond)->dev, pos, iter) #define BOND_XFRM_FEATURES (NETIF_F_HW_ESP | NETIF_F_HW_ESP_TX_CSUM | \ NETIF_F_GSO_ESP) #ifdef CONFIG_NET_POLL_CONTROLLER extern atomic_t netpoll_block_tx; static inline void block_netpoll_tx(void) { atomic_inc(&netpoll_block_tx); } static inline void unblock_netpoll_tx(void) { atomic_dec(&netpoll_block_tx); } static inline int is_netpoll_tx_blocked(struct net_device *dev) { if (unlikely(netpoll_tx_running(dev))) return atomic_read(&netpoll_block_tx); return 0; } #else #define block_netpoll_tx() #define unblock_netpoll_tx() #define is_netpoll_tx_blocked(dev) (0) #endif struct bond_params { int mode; int xmit_policy; int miimon; u8 num_peer_notif; u8 missed_max; int arp_interval; int arp_validate; int arp_all_targets; int use_carrier; int fail_over_mac; int updelay; int downdelay; int peer_notif_delay; int lacp_active; int lacp_fast; unsigned int min_links; int ad_select; char primary[IFNAMSIZ]; int primary_reselect; __be32 arp_targets[BOND_MAX_ARP_TARGETS]; int tx_queues; int all_slaves_active; int resend_igmp; int lp_interval; int packets_per_slave; int tlb_dynamic_lb; struct reciprocal_value reciprocal_packets_per_slave; u16 ad_actor_sys_prio; u16 ad_user_port_key; #if IS_ENABLED(CONFIG_IPV6) struct in6_addr 
ns_targets[BOND_MAX_NS_TARGETS];
#endif
	int coupled_control;

	/* 2 bytes of padding : see ether_addr_equal_64bits() */
	u8 ad_actor_system[ETH_ALEN + 2];
};

/* Per-slave state kept by the bonding driver for one enslaved net_device. */
struct slave {
	struct net_device *dev; /* first - useful for panic debug */
	struct bonding *bond; /* our master */
	int    delay;
	/* all 4 in jiffies */
	unsigned long last_link_up;
	unsigned long last_tx;
	unsigned long last_rx;
	unsigned long target_last_arp_rx[BOND_MAX_ARP_TARGETS];
	s8     link;		/* one of BOND_LINK_XXXX */
	s8     link_new_state;	/* one of BOND_LINK_XXXX */
	u8     backup:1,   /* indicates backup slave. Value corresponds with
			      BOND_STATE_ACTIVE and BOND_STATE_BACKUP */
	       inactive:1, /* indicates inactive slave */
	       rx_disabled:1, /* indicates whether slave's Rx is disabled */
	       should_notify:1, /* indicates whether the state changed */
	       should_notify_link:1; /* indicates whether the link changed */
	u8     duplex;
	u32    original_mtu;
	u32    link_failure_count;
	u32    speed;
	u16    queue_id;
	u8     perm_hwaddr[MAX_ADDR_LEN];
	int    prio;
	struct ad_slave_info *ad_info;
	struct tlb_slave_info tlb_info;
#ifdef CONFIG_NET_POLL_CONTROLLER
	struct netpoll *np;
#endif
	struct delayed_work notify_work;
	struct kobject kobj;
	struct rtnl_link_stats64 slave_stats;
};

/* Map an embedded kobject back to the struct slave that contains it. */
static inline struct slave *to_slave(struct kobject *kobj)
{
	return container_of(kobj, struct slave, kobj);
}

/*
 * Array of slave pointers with a flexible array member.
 * NOTE(review): presumably @count is the number of valid entries in @arr and
 * @rcu is used for deferred freeing - confirm against the users of this type.
 */
struct bond_up_slave {
	unsigned int	count;
	struct rcu_head rcu;
	struct slave	*arr[];
};

/*
 * Link pseudo-state only used internally by monitors
 */
#define BOND_LINK_NOCHANGE -1

/* List node pairing an xfrm_state pointer with its list linkage. */
struct bond_ipsec {
	struct list_head list;
	struct xfrm_state *xs;
};

/*
 * Here are the locking policies for the two bonding locks:
 * Get rcu_read_lock when reading or RTNL when writing slave list.
*/ struct bonding { struct net_device *dev; /* first - useful for panic debug */ struct slave __rcu *curr_active_slave; struct slave __rcu *current_arp_slave; struct slave __rcu *primary_slave; struct bond_up_slave __rcu *usable_slaves; struct bond_up_slave __rcu *all_slaves; bool force_primary; bool notifier_ctx; s32 slave_cnt; /* never change this value outside the attach/detach wrappers */ int (*recv_probe)(const struct sk_buff *, struct bonding *, struct slave *); /* mode_lock is used for mode-specific locking needs, currently used by: * 3ad mode (4) - protect against running bond_3ad_unbind_slave() and * bond_3ad_state_machine_handler() concurrently and also * the access to the state machine shared variables. * TLB mode (5) - to sync the use and modifications of its hash table * ALB mode (6) - to sync the use and modifications of its hash table */ spinlock_t mode_lock; spinlock_t stats_lock; u32 send_peer_notif; u8 igmp_retrans; #ifdef CONFIG_PROC_FS struct proc_dir_entry *proc_entry; char proc_file_name[IFNAMSIZ]; #endif /* CONFIG_PROC_FS */ struct list_head bond_list; u32 __percpu *rr_tx_counter; struct ad_bond_info ad_info; struct alb_bond_info alb_info; struct bond_params params; struct workqueue_struct *wq; struct delayed_work mii_work; struct delayed_work arp_work; struct delayed_work alb_work; struct delayed_work ad_work; struct delayed_work mcast_work; struct delayed_work slave_arr_work; #ifdef CONFIG_DEBUG_FS /* debugging support via debugfs */ struct dentry *debug_dir; #endif /* CONFIG_DEBUG_FS */ struct rtnl_link_stats64 bond_stats; #ifdef CONFIG_XFRM_OFFLOAD struct list_head ipsec_list; /* protecting ipsec_list */ struct mutex ipsec_lock; #endif /* CONFIG_XFRM_OFFLOAD */ struct bpf_prog *xdp_prog; }; #define bond_slave_get_rcu(dev) \ ((struct slave *) rcu_dereference(dev->rx_handler_data)) #define bond_slave_get_rtnl(dev) \ ((struct slave *) rtnl_dereference(dev->rx_handler_data)) void bond_queue_slave_event(struct slave *slave); void 
bond_lower_state_changed(struct slave *slave); struct bond_vlan_tag { __be16 vlan_proto; unsigned short vlan_id; }; /* * Returns NULL if the net_device does not belong to any of the bond's slaves * * Caller must hold bond lock for read */ static inline struct slave *bond_get_slave_by_dev(struct bonding *bond, struct net_device *slave_dev) { return netdev_lower_dev_get_private(bond->dev, slave_dev); } static inline struct bonding *bond_get_bond_by_slave(struct slave *slave) { return slave->bond; } static inline bool bond_should_override_tx_queue(struct bonding *bond) { return BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP || BOND_MODE(bond) == BOND_MODE_ROUNDROBIN; } static inline bool bond_is_lb(const struct bonding *bond) { return BOND_MODE(bond) == BOND_MODE_TLB || BOND_MODE(bond) == BOND_MODE_ALB; } static inline bool bond_needs_speed_duplex(const struct bonding *bond) { return BOND_MODE(bond) == BOND_MODE_8023AD || bond_is_lb(bond); } static inline bool bond_is_nondyn_tlb(const struct bonding *bond) { return (bond_is_lb(bond) && bond->params.tlb_dynamic_lb == 0); } static inline bool bond_mode_can_use_xmit_hash(const struct bonding *bond) { return (BOND_MODE(bond) == BOND_MODE_8023AD || BOND_MODE(bond) == BOND_MODE_XOR || BOND_MODE(bond) == BOND_MODE_TLB || BOND_MODE(bond) == BOND_MODE_ALB); } static inline bool bond_mode_uses_xmit_hash(const struct bonding *bond) { return (BOND_MODE(bond) == BOND_MODE_8023AD || BOND_MODE(bond) == BOND_MODE_XOR || bond_is_nondyn_tlb(bond)); } static inline bool bond_mode_uses_arp(int mode) { return mode != BOND_MODE_8023AD && mode != BOND_MODE_TLB && mode != BOND_MODE_ALB; } static inline bool bond_mode_uses_primary(int mode) { return mode == BOND_MODE_ACTIVEBACKUP || mode == BOND_MODE_TLB || mode == BOND_MODE_ALB; } static inline bool bond_uses_primary(struct bonding *bond) { return bond_mode_uses_primary(BOND_MODE(bond)); } static inline struct net_device *bond_option_active_slave_get_rcu(struct bonding *bond) { struct slave 
*slave = rcu_dereference_rtnl(bond->curr_active_slave); return bond_uses_primary(bond) && slave ? slave->dev : NULL; } static inline bool bond_slave_is_up(struct slave *slave) { return netif_running(slave->dev) && netif_carrier_ok(slave->dev); } static inline void bond_set_active_slave(struct slave *slave) { if (slave->backup) { slave->backup = 0; bond_queue_slave_event(slave); bond_lower_state_changed(slave); } } static inline void bond_set_backup_slave(struct slave *slave) { if (!slave->backup) { slave->backup = 1; bond_queue_slave_event(slave); bond_lower_state_changed(slave); } } static inline void bond_set_slave_state(struct slave *slave, int slave_state, bool notify) { if (slave->backup == slave_state) return; slave->backup = slave_state; if (notify) { bond_lower_state_changed(slave); bond_queue_slave_event(slave); slave->should_notify = 0; } else { if (slave->should_notify) slave->should_notify = 0; else slave->should_notify = 1; } } static inline void bond_slave_state_change(struct bonding *bond) { struct list_head *iter; struct slave *tmp; bond_for_each_slave(bond, tmp, iter) { if (tmp->link == BOND_LINK_UP) bond_set_active_slave(tmp); else if (tmp->link == BOND_LINK_DOWN) bond_set_backup_slave(tmp); } } static inline void bond_slave_state_notify(struct bonding *bond) { struct list_head *iter; struct slave *tmp; bond_for_each_slave(bond, tmp, iter) { if (tmp->should_notify) { bond_lower_state_changed(tmp); tmp->should_notify = 0; } } } static inline int bond_slave_state(struct slave *slave) { return slave->backup; } static inline bool bond_is_active_slave(struct slave *slave) { return !bond_slave_state(slave); } static inline bool bond_slave_can_tx(struct slave *slave) { return bond_slave_is_up(slave) && slave->link == BOND_LINK_UP && bond_is_active_slave(slave); } static inline bool bond_is_active_slave_dev(const struct net_device *slave_dev) { struct slave *slave; bool active; rcu_read_lock(); slave = bond_slave_get_rcu(slave_dev); active = 
bond_is_active_slave(slave); rcu_read_unlock(); return active; } static inline void bond_hw_addr_copy(u8 *dst, const u8 *src, unsigned int len) { if (len == ETH_ALEN) { ether_addr_copy(dst, src); return; } memcpy(dst, src, len); } #define BOND_PRI_RESELECT_ALWAYS 0 #define BOND_PRI_RESELECT_BETTER 1 #define BOND_PRI_RESELECT_FAILURE 2 #define BOND_FOM_NONE 0 #define BOND_FOM_ACTIVE 1 #define BOND_FOM_FOLLOW 2 #define BOND_ARP_TARGETS_ANY 0 #define BOND_ARP_TARGETS_ALL 1 #define BOND_ARP_VALIDATE_NONE 0 #define BOND_ARP_VALIDATE_ACTIVE (1 << BOND_STATE_ACTIVE) #define BOND_ARP_VALIDATE_BACKUP (1 << BOND_STATE_BACKUP) #define BOND_ARP_VALIDATE_ALL (BOND_ARP_VALIDATE_ACTIVE | \ BOND_ARP_VALIDATE_BACKUP) #define BOND_ARP_FILTER (BOND_ARP_VALIDATE_ALL + 1) #define BOND_ARP_FILTER_ACTIVE (BOND_ARP_VALIDATE_ACTIVE | \ BOND_ARP_FILTER) #define BOND_ARP_FILTER_BACKUP (BOND_ARP_VALIDATE_BACKUP | \ BOND_ARP_FILTER) #define BOND_SLAVE_NOTIFY_NOW true #define BOND_SLAVE_NOTIFY_LATER false static inline int slave_do_arp_validate(struct bonding *bond, struct slave *slave) { return bond->params.arp_validate & (1 << bond_slave_state(slave)); } static inline int slave_do_arp_validate_only(struct bonding *bond) { return bond->params.arp_validate & BOND_ARP_FILTER; } static inline int bond_is_ip_target_ok(__be32 addr) { return !ipv4_is_lbcast(addr) && !ipv4_is_zeronet(addr); } #if IS_ENABLED(CONFIG_IPV6) static inline int bond_is_ip6_target_ok(struct in6_addr *addr) { return !ipv6_addr_any(addr) && !ipv6_addr_loopback(addr) && !ipv6_addr_is_multicast(addr); } #endif /* Get the oldest arp which we've received on this slave for bond's * arp_targets. 
*/ static inline unsigned long slave_oldest_target_arp_rx(struct bonding *bond, struct slave *slave) { int i = 1; unsigned long ret = slave->target_last_arp_rx[0]; for (; (i < BOND_MAX_ARP_TARGETS) && bond->params.arp_targets[i]; i++) if (time_before(slave->target_last_arp_rx[i], ret)) ret = slave->target_last_arp_rx[i]; return ret; } static inline unsigned long slave_last_rx(struct bonding *bond, struct slave *slave) { if (bond->params.arp_all_targets == BOND_ARP_TARGETS_ALL) return slave_oldest_target_arp_rx(bond, slave); return slave->last_rx; } static inline void slave_update_last_tx(struct slave *slave) { WRITE_ONCE(slave->last_tx, jiffies); } static inline unsigned long slave_last_tx(struct slave *slave) { return READ_ONCE(slave->last_tx); } #ifdef CONFIG_NET_POLL_CONTROLLER static inline netdev_tx_t bond_netpoll_send_skb(const struct slave *slave, struct sk_buff *skb) { return netpoll_send_skb(slave->np, skb); } #else static inline netdev_tx_t bond_netpoll_send_skb(const struct slave *slave, struct sk_buff *skb) { BUG(); return NETDEV_TX_OK; } #endif static inline void bond_set_slave_inactive_flags(struct slave *slave, bool notify) { if (!bond_is_lb(slave->bond)) bond_set_slave_state(slave, BOND_STATE_BACKUP, notify); if (!slave->bond->params.all_slaves_active) slave->inactive = 1; if (BOND_MODE(slave->bond) == BOND_MODE_8023AD) slave->rx_disabled = 1; } static inline void bond_set_slave_tx_disabled_flags(struct slave *slave, bool notify) { bond_set_slave_state(slave, BOND_STATE_BACKUP, notify); } static inline void bond_set_slave_active_flags(struct slave *slave, bool notify) { bond_set_slave_state(slave, BOND_STATE_ACTIVE, notify); slave->inactive = 0; if (BOND_MODE(slave->bond) == BOND_MODE_8023AD) slave->rx_disabled = 0; } static inline void bond_set_slave_rx_enabled_flags(struct slave *slave, bool notify) { slave->rx_disabled = 0; } static inline bool bond_is_slave_inactive(struct slave *slave) { return slave->inactive; } static inline bool 
bond_is_slave_rx_disabled(struct slave *slave) { return slave->rx_disabled; } static inline void bond_propose_link_state(struct slave *slave, int state) { slave->link_new_state = state; } static inline void bond_commit_link_state(struct slave *slave, bool notify) { if (slave->link_new_state == BOND_LINK_NOCHANGE) return; slave->link = slave->link_new_state; if (notify) { bond_queue_slave_event(slave); bond_lower_state_changed(slave); slave->should_notify_link = 0; } else { if (slave->should_notify_link) slave->should_notify_link = 0; else slave->should_notify_link = 1; } } static inline void bond_set_slave_link_state(struct slave *slave, int state, bool notify) { bond_propose_link_state(slave, state); bond_commit_link_state(slave, notify); } static inline void bond_slave_link_notify(struct bonding *bond) { struct list_head *iter; struct slave *tmp; bond_for_each_slave(bond, tmp, iter) { if (tmp->should_notify_link) { bond_queue_slave_event(tmp); bond_lower_state_changed(tmp); tmp->should_notify_link = 0; } } } static inline __be32 bond_confirm_addr(struct net_device *dev, __be32 dst, __be32 local) { struct in_device *in_dev; __be32 addr = 0; rcu_read_lock(); in_dev = __in_dev_get_rcu(dev); if (in_dev) addr = inet_confirm_addr(dev_net(dev), in_dev, dst, local, RT_SCOPE_HOST); rcu_read_unlock(); return addr; } struct bond_net { struct net *net; /* Associated network namespace */ struct list_head dev_list; #ifdef CONFIG_PROC_FS struct proc_dir_entry *proc_dir; #endif struct class_attribute class_attr_bonding_masters; }; int bond_rcv_validate(const struct sk_buff *skb, struct bonding *bond, struct slave *slave); netdev_tx_t bond_dev_queue_xmit(struct bonding *bond, struct sk_buff *skb, struct net_device *slave_dev); int bond_create(struct net *net, const char *name); int bond_create_sysfs(struct bond_net *net); void bond_destroy_sysfs(struct bond_net *net); void bond_prepare_sysfs_group(struct bonding *bond); int bond_sysfs_slave_add(struct slave *slave); void 
bond_sysfs_slave_del(struct slave *slave);
void bond_xdp_set_features(struct net_device *bond_dev);
int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev,
		 struct netlink_ext_ack *extack);
int bond_release(struct net_device *bond_dev, struct net_device *slave_dev);
u32 bond_xmit_hash(struct bonding *bond, struct sk_buff *skb);
int bond_set_carrier(struct bonding *bond);
void bond_select_active_slave(struct bonding *bond);
void bond_change_active_slave(struct bonding *bond, struct slave *new_active);
void bond_create_debugfs(void);
void bond_destroy_debugfs(void);
void bond_debug_register(struct bonding *bond);
void bond_debug_unregister(struct bonding *bond);
void bond_debug_reregister(struct bonding *bond);
const char *bond_mode_name(int mode);
bool bond_xdp_check(struct bonding *bond, int mode);
void bond_setup(struct net_device *bond_dev);
unsigned int bond_get_num_tx_queues(void);
int bond_netlink_init(void);
void bond_netlink_fini(void);
struct net_device *bond_option_active_slave_get_rcu(struct bonding *bond);
const char *bond_slave_link_status(s8 link);
struct bond_vlan_tag *bond_verify_device_path(struct net_device *start_dev,
					      struct net_device *end_dev,
					      int level);
int bond_update_slave_arr(struct bonding *bond, struct slave *skipslave);
void bond_slave_arr_work_rearm(struct bonding *bond, unsigned long delay);
void bond_work_init_all(struct bonding *bond);

/* procfs hooks; compiled to empty inline stubs without CONFIG_PROC_FS */
#ifdef CONFIG_PROC_FS
void bond_create_proc_entry(struct bonding *bond);
void bond_remove_proc_entry(struct bonding *bond);
void bond_create_proc_dir(struct bond_net *bn);
void bond_destroy_proc_dir(struct bond_net *bn);
#else
static inline void bond_create_proc_entry(struct bonding *bond)
{
}

static inline void bond_remove_proc_entry(struct bonding *bond)
{
}

static inline void bond_create_proc_dir(struct bond_net *bn)
{
}

static inline void bond_destroy_proc_dir(struct bond_net *bn)
{
}
#endif

/*
 * Return the first slave whose device address equals @mac, or NULL.
 * Iterates with bond_for_each_slave().
 */
static inline struct slave *bond_slave_has_mac(struct bonding *bond,
					       const u8 *mac)
{
struct list_head *iter; struct slave *tmp; bond_for_each_slave(bond, tmp, iter) if (ether_addr_equal_64bits(mac, tmp->dev->dev_addr)) return tmp; return NULL; } /* Caller must hold rcu_read_lock() for read */ static inline bool bond_slave_has_mac_rcu(struct bonding *bond, const u8 *mac) { struct list_head *iter; struct slave *tmp; bond_for_each_slave_rcu(bond, tmp, iter) if (ether_addr_equal_64bits(mac, tmp->dev->dev_addr)) return true; return false; } /* Check if the ip is present in arp ip list, or first free slot if ip == 0 * Returns -1 if not found, index if found */ static inline int bond_get_targets_ip(__be32 *targets, __be32 ip) { int i; for (i = 0; i < BOND_MAX_ARP_TARGETS; i++) if (targets[i] == ip) return i; else if (targets[i] == 0) break; return -1; } #if IS_ENABLED(CONFIG_IPV6) static inline int bond_get_targets_ip6(struct in6_addr *targets, struct in6_addr *ip) { struct in6_addr mcaddr; int i; for (i = 0; i < BOND_MAX_NS_TARGETS; i++) { addrconf_addr_solict_mult(&targets[i], &mcaddr); if ((ipv6_addr_equal(&targets[i], ip)) || (ipv6_addr_equal(&mcaddr, ip))) return i; else if (ipv6_addr_any(&targets[i])) break; } return -1; } #endif /* exported from bond_main.c */ extern unsigned int bond_net_id; /* exported from bond_netlink.c */ extern struct rtnl_link_ops bond_link_ops; /* exported from bond_sysfs_slave.c */ extern const struct sysfs_ops slave_sysfs_ops; /* exported from bond_3ad.c */ extern const u8 lacpdu_mcast_addr[]; static inline netdev_tx_t bond_tx_drop(struct net_device *dev, struct sk_buff *skb) { dev_core_stats_tx_dropped_inc(dev); dev_kfree_skb_any(skb); return NET_XMIT_DROP; } #endif /* _NET_BONDING_H */ |
3 3 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 
525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 
1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef DRIVERS_PCI_H #define DRIVERS_PCI_H #include <linux/pci.h> struct pcie_tlp_log; /* Number of possible devfns: 0.0 to 1f.7 inclusive */ #define MAX_NR_DEVFNS 256 #define PCI_FIND_CAP_TTL 48 #define PCI_VSEC_ID_INTEL_TBT 0x1234 /* Thunderbolt */ #define PCIE_LINK_RETRAIN_TIMEOUT_MS 1000 /* * Power stable to PERST# inactive. * * See the "Power Sequencing and Reset Signal Timings" table of the PCI Express * Card Electromechanical Specification, Revision 5.1, Section 2.9.2, Symbol * "T_PVPERL". */ #define PCIE_T_PVPERL_MS 100 /* * REFCLK stable before PERST# inactive. * * See the "Power Sequencing and Reset Signal Timings" table of the PCI Express * Card Electromechanical Specification, Revision 5.1, Section 2.9.2, Symbol * "T_PERST-CLK". */ #define PCIE_T_PERST_CLK_US 100 /* * End of conventional reset (PERST# de-asserted) to first configuration * request (device able to respond with a "Request Retry Status" completion), * from PCIe r6.0, sec 6.6.1. */ #define PCIE_T_RRS_READY_MS 100 /* * PCIe r6.0, sec 5.3.3.2.1 <PME Synchronization> * Recommends 1ms to 10ms timeout to check L2 ready. */ #define PCIE_PME_TO_L2_TIMEOUT_US 10000 /* * PCIe r6.0, sec 6.6.1 <Conventional Reset> * * - "With a Downstream Port that does not support Link speeds greater * than 5.0 GT/s, software must wait a minimum of 100 ms following exit * from a Conventional Reset before sending a Configuration Request to * the device immediately below that Port." 
*
 * - "With a Downstream Port that supports Link speeds greater than
 *    5.0 GT/s, software must wait a minimum of 100 ms after Link training
 *    completes before sending a Configuration Request to the device
 *    immediately below that Port."
 */
#define PCIE_RESET_CONFIG_DEVICE_WAIT_MS	100

/* Message Routing (r[2:0]); PCIe r6.0, sec 2.2.8 */
#define PCIE_MSG_TYPE_R_RC	0
#define PCIE_MSG_TYPE_R_ADDR	1
#define PCIE_MSG_TYPE_R_ID	2
#define PCIE_MSG_TYPE_R_BC	3
#define PCIE_MSG_TYPE_R_LOCAL	4
#define PCIE_MSG_TYPE_R_GATHER	5

/* Power Management Messages; PCIe r6.0, sec 2.2.8.2 */
#define PCIE_MSG_CODE_PME_TURN_OFF	0x19

/* INTx Mechanism Messages; PCIe r6.0, sec 2.2.8.1 */
#define PCIE_MSG_CODE_ASSERT_INTA	0x20
#define PCIE_MSG_CODE_ASSERT_INTB	0x21
#define PCIE_MSG_CODE_ASSERT_INTC	0x22
#define PCIE_MSG_CODE_ASSERT_INTD	0x23
#define PCIE_MSG_CODE_DEASSERT_INTA	0x24
#define PCIE_MSG_CODE_DEASSERT_INTB	0x25
#define PCIE_MSG_CODE_DEASSERT_INTC	0x26
#define PCIE_MSG_CODE_DEASSERT_INTD	0x27

extern const unsigned char pcie_link_speed[];
extern bool pci_early_dump;

bool pcie_cap_has_lnkctl(const struct pci_dev *dev);
bool pcie_cap_has_lnkctl2(const struct pci_dev *dev);
bool pcie_cap_has_rtctl(const struct pci_dev *dev);

/* Functions internal to the PCI core code */

#ifdef CONFIG_DMI
extern const struct attribute_group pci_dev_smbios_attr_group;
#endif

/* Selects which user-space mmap interface a request came through. */
enum pci_mmap_api {
	PCI_MMAP_SYSFS,	/* mmap on /sys/bus/pci/devices/<BDF>/resource<N> */
	PCI_MMAP_PROCFS	/* mmap on /proc/bus/pci/<BDF> */
};
int pci_mmap_fits(struct pci_dev *pdev, int resno, struct vm_area_struct *vmai,
		  enum pci_mmap_api mmap_api);

bool pci_reset_supported(struct pci_dev *dev);
void pci_init_reset_methods(struct pci_dev *dev);
int pci_bridge_secondary_bus_reset(struct pci_dev *dev);
int pci_bus_error_reset(struct pci_dev *dev);
int __pci_reset_bus(struct pci_bus *bus);

/*
 * Saved capability contents: a header followed by a flexible array of
 * 32-bit words.
 * NOTE(review): presumably @size is the byte length of @data and
 * @cap_extended selects extended vs. standard capability numbering - confirm
 * against the save/restore code.
 */
struct pci_cap_saved_data {
	u16		cap_nr;
	bool		cap_extended;
	unsigned int	size;
	u32		data[];
};

/* hlist node wrapping one struct pci_cap_saved_data. */
struct pci_cap_saved_state {
	struct hlist_node		next;
struct pci_cap_saved_data	cap;
};

/* Capability save-buffer management */
void pci_allocate_cap_save_buffers(struct pci_dev *dev);
void pci_free_cap_save_buffers(struct pci_dev *dev);
int pci_add_cap_save_buffer(struct pci_dev *dev, char cap, unsigned int size);
int pci_add_ext_cap_save_buffer(struct pci_dev *dev,
				u16 cap, unsigned int size);
struct pci_cap_saved_state *pci_find_saved_cap(struct pci_dev *dev, char cap);
struct pci_cap_saved_state *pci_find_saved_ext_cap(struct pci_dev *dev,
						   u16 cap);

#define PCI_PM_D2_DELAY         200	/* usec; see PCIe r4.0, sec 5.9.1 */
#define PCI_PM_D3HOT_WAIT       10	/* msec */
#define PCI_PM_D3COLD_WAIT      100	/* msec */

void pci_update_current_state(struct pci_dev *dev, pci_power_t state);
void pci_refresh_power_state(struct pci_dev *dev);
int pci_power_up(struct pci_dev *dev);
void pci_disable_enabled_device(struct pci_dev *dev);
int pci_finish_runtime_suspend(struct pci_dev *dev);
void pcie_clear_device_status(struct pci_dev *dev);
void pcie_clear_root_pme_status(struct pci_dev *dev);
bool pci_check_pme_status(struct pci_dev *dev);
void pci_pme_wakeup_bus(struct pci_bus *bus);
void pci_pme_restore(struct pci_dev *dev);
bool pci_dev_need_resume(struct pci_dev *dev);
void pci_dev_adjust_pme(struct pci_dev *dev);
void pci_dev_complete_resume(struct pci_dev *pci_dev);
void pci_config_pm_runtime_get(struct pci_dev *dev);
void pci_config_pm_runtime_put(struct pci_dev *dev);
void pci_pm_init(struct pci_dev *dev);
void pci_ea_init(struct pci_dev *dev);
void pci_msi_init(struct pci_dev *dev);
void pci_msix_init(struct pci_dev *dev);
bool pci_bridge_d3_possible(struct pci_dev *dev);
void pci_bridge_d3_update(struct pci_dev *dev);
int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type);

/*
 * True when the low 16 bits of @l (the vendor-ID half of the dword) equal
 * PCI_VENDOR_ID_PCI_SIG.
 */
static inline bool pci_bus_rrs_vendor_id(u32 l)
{
	return (l & 0xffff) == PCI_VENDOR_ID_PCI_SIG;
}

/* Report a wakeup event for @dev via pm_wakeup_event(). */
static inline void pci_wakeup_event(struct pci_dev *dev)
{
	/* Wait 100 ms before the system can be put into a sleep state.
*/ pm_wakeup_event(&dev->dev, 100); } /** * pci_bar_index_is_valid - Check whether a BAR index is within valid range * @bar: BAR index * * Protects against overflowing &struct pci_dev.resource array. * * Return: true for valid index, false otherwise. */ static inline bool pci_bar_index_is_valid(int bar) { if (bar >= 0 && bar < PCI_NUM_RESOURCES) return true; return false; } static inline bool pci_has_subordinate(struct pci_dev *pci_dev) { return !!(pci_dev->subordinate); } static inline bool pci_power_manageable(struct pci_dev *pci_dev) { /* * Currently we allow normal PCI devices and PCI bridges transition * into D3 if their bridge_d3 is set. */ return !pci_has_subordinate(pci_dev) || pci_dev->bridge_d3; } static inline bool pcie_downstream_port(const struct pci_dev *dev) { int type = pci_pcie_type(dev); return type == PCI_EXP_TYPE_ROOT_PORT || type == PCI_EXP_TYPE_DOWNSTREAM || type == PCI_EXP_TYPE_PCIE_BRIDGE; } void pci_vpd_init(struct pci_dev *dev); extern const struct attribute_group pci_dev_vpd_attr_group; /* PCI Virtual Channel */ int pci_save_vc_state(struct pci_dev *dev); void pci_restore_vc_state(struct pci_dev *dev); void pci_allocate_vc_save_buffers(struct pci_dev *dev); /* PCI /proc functions */ #ifdef CONFIG_PROC_FS int pci_proc_attach_device(struct pci_dev *dev); int pci_proc_detach_device(struct pci_dev *dev); int pci_proc_detach_bus(struct pci_bus *bus); #else static inline int pci_proc_attach_device(struct pci_dev *dev) { return 0; } static inline int pci_proc_detach_device(struct pci_dev *dev) { return 0; } static inline int pci_proc_detach_bus(struct pci_bus *bus) { return 0; } #endif /* Functions for PCI Hotplug drivers to use */ int pci_hp_add_bridge(struct pci_dev *dev); #if defined(CONFIG_SYSFS) && defined(HAVE_PCI_LEGACY) void pci_create_legacy_files(struct pci_bus *bus); void pci_remove_legacy_files(struct pci_bus *bus); #else static inline void pci_create_legacy_files(struct pci_bus *bus) { } static inline void 
pci_remove_legacy_files(struct pci_bus *bus) { } #endif /* Lock for read/write access to pci device and bus lists */ extern struct rw_semaphore pci_bus_sem; extern struct mutex pci_slot_mutex; extern raw_spinlock_t pci_lock; extern unsigned int pci_pm_d3hot_delay; #ifdef CONFIG_PCI_MSI void pci_no_msi(void); #else static inline void pci_no_msi(void) { } #endif void pci_realloc_get_opt(char *); static inline int pci_no_d1d2(struct pci_dev *dev) { unsigned int parent_dstates = 0; if (dev->bus->self) parent_dstates = dev->bus->self->no_d1d2; return (dev->no_d1d2 || parent_dstates); } #ifdef CONFIG_SYSFS int pci_create_sysfs_dev_files(struct pci_dev *pdev); void pci_remove_sysfs_dev_files(struct pci_dev *pdev); extern const struct attribute_group *pci_dev_groups[]; extern const struct attribute_group *pci_dev_attr_groups[]; extern const struct attribute_group *pcibus_groups[]; extern const struct attribute_group *pci_bus_groups[]; extern const struct attribute_group pci_doe_sysfs_group; #else static inline int pci_create_sysfs_dev_files(struct pci_dev *pdev) { return 0; } static inline void pci_remove_sysfs_dev_files(struct pci_dev *pdev) { } #define pci_dev_groups NULL #define pci_dev_attr_groups NULL #define pcibus_groups NULL #define pci_bus_groups NULL #endif extern unsigned long pci_hotplug_io_size; extern unsigned long pci_hotplug_mmio_size; extern unsigned long pci_hotplug_mmio_pref_size; extern unsigned long pci_hotplug_bus_size; extern unsigned long pci_cardbus_io_size; extern unsigned long pci_cardbus_mem_size; /** * pci_match_one_device - Tell if a PCI device structure has a matching * PCI device id structure * @id: single PCI device id structure to match * @dev: the PCI device structure to match against * * Returns the matching pci_device_id structure or %NULL if there is no match. 
*/ static inline const struct pci_device_id * pci_match_one_device(const struct pci_device_id *id, const struct pci_dev *dev) { if ((id->vendor == PCI_ANY_ID || id->vendor == dev->vendor) && (id->device == PCI_ANY_ID || id->device == dev->device) && (id->subvendor == PCI_ANY_ID || id->subvendor == dev->subsystem_vendor) && (id->subdevice == PCI_ANY_ID || id->subdevice == dev->subsystem_device) && !((id->class ^ dev->class) & id->class_mask)) return id; return NULL; } /* PCI slot sysfs helper code */ #define to_pci_slot(s) container_of(s, struct pci_slot, kobj) extern struct kset *pci_slots_kset; struct pci_slot_attribute { struct attribute attr; ssize_t (*show)(struct pci_slot *, char *); ssize_t (*store)(struct pci_slot *, const char *, size_t); }; #define to_pci_slot_attr(s) container_of(s, struct pci_slot_attribute, attr) enum pci_bar_type { pci_bar_unknown, /* Standard PCI BAR probe */ pci_bar_io, /* An I/O port BAR */ pci_bar_mem32, /* A 32-bit memory BAR */ pci_bar_mem64, /* A 64-bit memory BAR */ }; struct device *pci_get_host_bridge_device(struct pci_dev *dev); void pci_put_host_bridge_device(struct device *dev); unsigned int pci_rescan_bus_bridge_resize(struct pci_dev *bridge); int pci_reassign_bridge_resources(struct pci_dev *bridge, unsigned long type); int __must_check pci_reassign_resource(struct pci_dev *dev, int i, resource_size_t add_size, resource_size_t align); int pci_configure_extended_tags(struct pci_dev *dev, void *ign); bool pci_bus_read_dev_vendor_id(struct pci_bus *bus, int devfn, u32 *pl, int rrs_timeout); bool pci_bus_generic_read_dev_vendor_id(struct pci_bus *bus, int devfn, u32 *pl, int rrs_timeout); int pci_idt_bus_quirk(struct pci_bus *bus, int devfn, u32 *pl, int rrs_timeout); int pci_setup_device(struct pci_dev *dev); void __pci_size_stdbars(struct pci_dev *dev, int count, unsigned int pos, u32 *sizes); int __pci_read_base(struct pci_dev *dev, enum pci_bar_type type, struct resource *res, unsigned int reg, u32 *sizes); void 
pci_configure_ari(struct pci_dev *dev); void __pci_bus_size_bridges(struct pci_bus *bus, struct list_head *realloc_head); void __pci_bus_assign_resources(const struct pci_bus *bus, struct list_head *realloc_head, struct list_head *fail_head); bool pci_bus_clip_resource(struct pci_dev *dev, int idx); void pci_walk_bus_locked(struct pci_bus *top, int (*cb)(struct pci_dev *, void *), void *userdata); const char *pci_resource_name(struct pci_dev *dev, unsigned int i); bool pci_resource_is_optional(const struct pci_dev *dev, int resno); /** * pci_resource_num - Reverse lookup resource number from device resources * @dev: PCI device * @res: Resource to lookup index for (MUST be a @dev's resource) * * Perform reverse lookup to determine the resource number for @res within * @dev resource array. NOTE: The caller is responsible for ensuring @res is * among @dev's resources! * * Returns: resource number. */ static inline int pci_resource_num(const struct pci_dev *dev, const struct resource *res) { int resno = res - &dev->resource[0]; /* Passing a resource that is not among dev's resources? */ WARN_ON_ONCE(resno >= PCI_NUM_RESOURCES); return resno; } void pci_reassigndev_resource_alignment(struct pci_dev *dev); void pci_disable_bridge_window(struct pci_dev *dev); struct pci_bus *pci_bus_get(struct pci_bus *bus); void pci_bus_put(struct pci_bus *bus); #define PCIE_LNKCAP_SLS2SPEED(lnkcap) \ ({ \ ((lnkcap) == PCI_EXP_LNKCAP_SLS_64_0GB ? PCIE_SPEED_64_0GT : \ (lnkcap) == PCI_EXP_LNKCAP_SLS_32_0GB ? PCIE_SPEED_32_0GT : \ (lnkcap) == PCI_EXP_LNKCAP_SLS_16_0GB ? PCIE_SPEED_16_0GT : \ (lnkcap) == PCI_EXP_LNKCAP_SLS_8_0GB ? PCIE_SPEED_8_0GT : \ (lnkcap) == PCI_EXP_LNKCAP_SLS_5_0GB ? PCIE_SPEED_5_0GT : \ (lnkcap) == PCI_EXP_LNKCAP_SLS_2_5GB ? PCIE_SPEED_2_5GT : \ PCI_SPEED_UNKNOWN); \ }) /* PCIe link information from Link Capabilities 2 */ #define PCIE_LNKCAP2_SLS2SPEED(lnkcap2) \ ((lnkcap2) & PCI_EXP_LNKCAP2_SLS_64_0GB ? PCIE_SPEED_64_0GT : \ (lnkcap2) & PCI_EXP_LNKCAP2_SLS_32_0GB ? 
PCIE_SPEED_32_0GT : \ (lnkcap2) & PCI_EXP_LNKCAP2_SLS_16_0GB ? PCIE_SPEED_16_0GT : \ (lnkcap2) & PCI_EXP_LNKCAP2_SLS_8_0GB ? PCIE_SPEED_8_0GT : \ (lnkcap2) & PCI_EXP_LNKCAP2_SLS_5_0GB ? PCIE_SPEED_5_0GT : \ (lnkcap2) & PCI_EXP_LNKCAP2_SLS_2_5GB ? PCIE_SPEED_2_5GT : \ PCI_SPEED_UNKNOWN) #define PCIE_LNKCTL2_TLS2SPEED(lnkctl2) \ ((lnkctl2) == PCI_EXP_LNKCTL2_TLS_64_0GT ? PCIE_SPEED_64_0GT : \ (lnkctl2) == PCI_EXP_LNKCTL2_TLS_32_0GT ? PCIE_SPEED_32_0GT : \ (lnkctl2) == PCI_EXP_LNKCTL2_TLS_16_0GT ? PCIE_SPEED_16_0GT : \ (lnkctl2) == PCI_EXP_LNKCTL2_TLS_8_0GT ? PCIE_SPEED_8_0GT : \ (lnkctl2) == PCI_EXP_LNKCTL2_TLS_5_0GT ? PCIE_SPEED_5_0GT : \ (lnkctl2) == PCI_EXP_LNKCTL2_TLS_2_5GT ? PCIE_SPEED_2_5GT : \ PCI_SPEED_UNKNOWN) /* PCIe speed to Mb/s reduced by encoding overhead */ #define PCIE_SPEED2MBS_ENC(speed) \ ((speed) == PCIE_SPEED_64_0GT ? 64000*1/1 : \ (speed) == PCIE_SPEED_32_0GT ? 32000*128/130 : \ (speed) == PCIE_SPEED_16_0GT ? 16000*128/130 : \ (speed) == PCIE_SPEED_8_0GT ? 8000*128/130 : \ (speed) == PCIE_SPEED_5_0GT ? 5000*8/10 : \ (speed) == PCIE_SPEED_2_5GT ? 2500*8/10 : \ 0) static inline int pcie_dev_speed_mbps(enum pci_bus_speed speed) { switch (speed) { case PCIE_SPEED_2_5GT: return 2500; case PCIE_SPEED_5_0GT: return 5000; case PCIE_SPEED_8_0GT: return 8000; case PCIE_SPEED_16_0GT: return 16000; case PCIE_SPEED_32_0GT: return 32000; case PCIE_SPEED_64_0GT: return 64000; default: break; } return -EINVAL; } u8 pcie_get_supported_speeds(struct pci_dev *dev); const char *pci_speed_string(enum pci_bus_speed speed); void __pcie_print_link_status(struct pci_dev *dev, bool verbose); void pcie_report_downtraining(struct pci_dev *dev); static inline void __pcie_update_link_speed(struct pci_bus *bus, u16 linksta, u16 linksta2) { bus->cur_bus_speed = pcie_link_speed[linksta & PCI_EXP_LNKSTA_CLS]; bus->flit_mode = (linksta2 & PCI_EXP_LNKSTA2_FLIT) ? 
1 : 0; } void pcie_update_link_speed(struct pci_bus *bus); /* Single Root I/O Virtualization */ struct pci_sriov { int pos; /* Capability position */ int nres; /* Number of resources */ u32 cap; /* SR-IOV Capabilities */ u16 ctrl; /* SR-IOV Control */ u16 total_VFs; /* Total VFs associated with the PF */ u16 initial_VFs; /* Initial VFs associated with the PF */ u16 num_VFs; /* Number of VFs available */ u16 offset; /* First VF Routing ID offset */ u16 stride; /* Following VF stride */ u16 vf_device; /* VF device ID */ u32 pgsz; /* Page size for BAR alignment */ u8 link; /* Function Dependency Link */ u8 max_VF_buses; /* Max buses consumed by VFs */ u16 driver_max_VFs; /* Max num VFs driver supports */ struct pci_dev *dev; /* Lowest numbered PF */ struct pci_dev *self; /* This PF */ u32 class; /* VF device */ u8 hdr_type; /* VF header type */ u16 subsystem_vendor; /* VF subsystem vendor */ u16 subsystem_device; /* VF subsystem device */ resource_size_t barsz[PCI_SRIOV_NUM_BARS]; /* VF BAR size */ bool drivers_autoprobe; /* Auto probing of VFs by driver */ }; #ifdef CONFIG_PCI_DOE void pci_doe_init(struct pci_dev *pdev); void pci_doe_destroy(struct pci_dev *pdev); void pci_doe_disconnected(struct pci_dev *pdev); #else static inline void pci_doe_init(struct pci_dev *pdev) { } static inline void pci_doe_destroy(struct pci_dev *pdev) { } static inline void pci_doe_disconnected(struct pci_dev *pdev) { } #endif #ifdef CONFIG_PCI_NPEM void pci_npem_create(struct pci_dev *dev); void pci_npem_remove(struct pci_dev *dev); #else static inline void pci_npem_create(struct pci_dev *dev) { } static inline void pci_npem_remove(struct pci_dev *dev) { } #endif #if defined(CONFIG_PCI_DOE) && defined(CONFIG_SYSFS) void pci_doe_sysfs_init(struct pci_dev *pci_dev); void pci_doe_sysfs_teardown(struct pci_dev *pdev); #else static inline void pci_doe_sysfs_init(struct pci_dev *pdev) { } static inline void pci_doe_sysfs_teardown(struct pci_dev *pdev) { } #endif /** * pci_dev_set_io_state - 
Set the new error state if possible. * * @dev: PCI device to set new error_state * @new: the state we want dev to be in * * If the device is experiencing perm_failure, it has to remain in that state. * Any other transition is allowed. * * Returns true if state has been changed to the requested state. */ static inline bool pci_dev_set_io_state(struct pci_dev *dev, pci_channel_state_t new) { pci_channel_state_t old; switch (new) { case pci_channel_io_perm_failure: xchg(&dev->error_state, pci_channel_io_perm_failure); return true; case pci_channel_io_frozen: old = cmpxchg(&dev->error_state, pci_channel_io_normal, pci_channel_io_frozen); return old != pci_channel_io_perm_failure; case pci_channel_io_normal: old = cmpxchg(&dev->error_state, pci_channel_io_frozen, pci_channel_io_normal); return old != pci_channel_io_perm_failure; default: return false; } } static inline int pci_dev_set_disconnected(struct pci_dev *dev, void *unused) { pci_dev_set_io_state(dev, pci_channel_io_perm_failure); pci_doe_disconnected(dev); return 0; } /* pci_dev priv_flags */ #define PCI_DEV_ADDED 0 #define PCI_DPC_RECOVERED 1 #define PCI_DPC_RECOVERING 2 #define PCI_DEV_REMOVED 3 static inline void pci_dev_assign_added(struct pci_dev *dev) { smp_mb__before_atomic(); set_bit(PCI_DEV_ADDED, &dev->priv_flags); smp_mb__after_atomic(); } static inline bool pci_dev_test_and_clear_added(struct pci_dev *dev) { return test_and_clear_bit(PCI_DEV_ADDED, &dev->priv_flags); } static inline bool pci_dev_is_added(const struct pci_dev *dev) { return test_bit(PCI_DEV_ADDED, &dev->priv_flags); } static inline bool pci_dev_test_and_set_removed(struct pci_dev *dev) { return test_and_set_bit(PCI_DEV_REMOVED, &dev->priv_flags); } #ifdef CONFIG_PCIEAER #include <linux/aer.h> #define AER_MAX_MULTI_ERR_DEVICES 5 /* Not likely to have more */ struct aer_err_info { struct pci_dev *dev[AER_MAX_MULTI_ERR_DEVICES]; int error_dev_num; unsigned int id:16; unsigned int severity:2; /* 0:NONFATAL | 1:FATAL | 2:COR */ unsigned 
int __pad1:5; unsigned int multi_error_valid:1; unsigned int first_error:5; unsigned int __pad2:2; unsigned int tlp_header_valid:1; unsigned int status; /* COR/UNCOR Error Status */ unsigned int mask; /* COR/UNCOR Error Mask */ struct pcie_tlp_log tlp; /* TLP Header */ }; int aer_get_device_error_info(struct pci_dev *dev, struct aer_err_info *info); void aer_print_error(struct pci_dev *dev, struct aer_err_info *info); int pcie_read_tlp_log(struct pci_dev *dev, int where, int where2, unsigned int tlp_len, bool flit, struct pcie_tlp_log *log); unsigned int aer_tlp_log_len(struct pci_dev *dev, u32 aercc); void pcie_print_tlp_log(const struct pci_dev *dev, const struct pcie_tlp_log *log, const char *pfx); #endif /* CONFIG_PCIEAER */ #ifdef CONFIG_PCIEPORTBUS /* Cached RCEC Endpoint Association */ struct rcec_ea { u8 nextbusn; u8 lastbusn; u32 bitmap; }; #endif #ifdef CONFIG_PCIE_DPC void pci_save_dpc_state(struct pci_dev *dev); void pci_restore_dpc_state(struct pci_dev *dev); void pci_dpc_init(struct pci_dev *pdev); void dpc_process_error(struct pci_dev *pdev); pci_ers_result_t dpc_reset_link(struct pci_dev *pdev); bool pci_dpc_recovered(struct pci_dev *pdev); unsigned int dpc_tlp_log_len(struct pci_dev *dev); #else static inline void pci_save_dpc_state(struct pci_dev *dev) { } static inline void pci_restore_dpc_state(struct pci_dev *dev) { } static inline void pci_dpc_init(struct pci_dev *pdev) { } static inline bool pci_dpc_recovered(struct pci_dev *pdev) { return false; } #endif #ifdef CONFIG_PCIEPORTBUS void pci_rcec_init(struct pci_dev *dev); void pci_rcec_exit(struct pci_dev *dev); void pcie_link_rcec(struct pci_dev *rcec); void pcie_walk_rcec(struct pci_dev *rcec, int (*cb)(struct pci_dev *, void *), void *userdata); #else static inline void pci_rcec_init(struct pci_dev *dev) { } static inline void pci_rcec_exit(struct pci_dev *dev) { } static inline void pcie_link_rcec(struct pci_dev *rcec) { } static inline void pcie_walk_rcec(struct pci_dev *rcec, int 
(*cb)(struct pci_dev *, void *), void *userdata) { } #endif #ifdef CONFIG_PCI_ATS /* Address Translation Service */ void pci_ats_init(struct pci_dev *dev); void pci_restore_ats_state(struct pci_dev *dev); #else static inline void pci_ats_init(struct pci_dev *d) { } static inline void pci_restore_ats_state(struct pci_dev *dev) { } #endif /* CONFIG_PCI_ATS */ #ifdef CONFIG_PCI_PRI void pci_pri_init(struct pci_dev *dev); void pci_restore_pri_state(struct pci_dev *pdev); #else static inline void pci_pri_init(struct pci_dev *dev) { } static inline void pci_restore_pri_state(struct pci_dev *pdev) { } #endif #ifdef CONFIG_PCI_PASID void pci_pasid_init(struct pci_dev *dev); void pci_restore_pasid_state(struct pci_dev *pdev); #else static inline void pci_pasid_init(struct pci_dev *dev) { } static inline void pci_restore_pasid_state(struct pci_dev *pdev) { } #endif #ifdef CONFIG_PCI_IOV int pci_iov_init(struct pci_dev *dev); void pci_iov_release(struct pci_dev *dev); void pci_iov_remove(struct pci_dev *dev); void pci_iov_update_resource(struct pci_dev *dev, int resno); resource_size_t pci_sriov_resource_alignment(struct pci_dev *dev, int resno); void pci_restore_iov_state(struct pci_dev *dev); int pci_iov_bus_range(struct pci_bus *bus); static inline bool pci_resource_is_iov(int resno) { return resno >= PCI_IOV_RESOURCES && resno <= PCI_IOV_RESOURCE_END; } extern const struct attribute_group sriov_pf_dev_attr_group; extern const struct attribute_group sriov_vf_dev_attr_group; #else static inline int pci_iov_init(struct pci_dev *dev) { return -ENODEV; } static inline void pci_iov_release(struct pci_dev *dev) { } static inline void pci_iov_remove(struct pci_dev *dev) { } static inline void pci_iov_update_resource(struct pci_dev *dev, int resno) { } static inline resource_size_t pci_sriov_resource_alignment(struct pci_dev *dev, int resno) { return 0; } static inline void pci_restore_iov_state(struct pci_dev *dev) { } static inline int pci_iov_bus_range(struct pci_bus *bus) { 
return 0; } static inline bool pci_resource_is_iov(int resno) { return false; } #endif /* CONFIG_PCI_IOV */ #ifdef CONFIG_PCIE_TPH void pci_restore_tph_state(struct pci_dev *dev); void pci_save_tph_state(struct pci_dev *dev); void pci_no_tph(void); void pci_tph_init(struct pci_dev *dev); #else static inline void pci_restore_tph_state(struct pci_dev *dev) { } static inline void pci_save_tph_state(struct pci_dev *dev) { } static inline void pci_no_tph(void) { } static inline void pci_tph_init(struct pci_dev *dev) { } #endif #ifdef CONFIG_PCIE_PTM void pci_ptm_init(struct pci_dev *dev); void pci_save_ptm_state(struct pci_dev *dev); void pci_restore_ptm_state(struct pci_dev *dev); void pci_suspend_ptm(struct pci_dev *dev); void pci_resume_ptm(struct pci_dev *dev); #else static inline void pci_ptm_init(struct pci_dev *dev) { } static inline void pci_save_ptm_state(struct pci_dev *dev) { } static inline void pci_restore_ptm_state(struct pci_dev *dev) { } static inline void pci_suspend_ptm(struct pci_dev *dev) { } static inline void pci_resume_ptm(struct pci_dev *dev) { } #endif unsigned long pci_cardbus_resource_alignment(struct resource *); static inline resource_size_t pci_resource_alignment(struct pci_dev *dev, struct resource *res) { int resno = pci_resource_num(dev, res); if (pci_resource_is_iov(resno)) return pci_sriov_resource_alignment(dev, resno); if (dev->class >> 8 == PCI_CLASS_BRIDGE_CARDBUS) return pci_cardbus_resource_alignment(res); return resource_alignment(res); } void pci_acs_init(struct pci_dev *dev); #ifdef CONFIG_PCI_QUIRKS int pci_dev_specific_acs_enabled(struct pci_dev *dev, u16 acs_flags); int pci_dev_specific_enable_acs(struct pci_dev *dev); int pci_dev_specific_disable_acs_redir(struct pci_dev *dev); int pcie_failed_link_retrain(struct pci_dev *dev); #else static inline int pci_dev_specific_acs_enabled(struct pci_dev *dev, u16 acs_flags) { return -ENOTTY; } static inline int pci_dev_specific_enable_acs(struct pci_dev *dev) { return -ENOTTY; } 
static inline int pci_dev_specific_disable_acs_redir(struct pci_dev *dev) { return -ENOTTY; } static inline int pcie_failed_link_retrain(struct pci_dev *dev) { return -ENOTTY; } #endif /* PCI error reporting and recovery */ pci_ers_result_t pcie_do_recovery(struct pci_dev *dev, pci_channel_state_t state, pci_ers_result_t (*reset_subordinates)(struct pci_dev *pdev)); bool pcie_wait_for_link(struct pci_dev *pdev, bool active); int pcie_retrain_link(struct pci_dev *pdev, bool use_lt); /* ASPM-related functionality we need even without CONFIG_PCIEASPM */ void pci_save_ltr_state(struct pci_dev *dev); void pci_restore_ltr_state(struct pci_dev *dev); void pci_configure_aspm_l1ss(struct pci_dev *dev); void pci_save_aspm_l1ss_state(struct pci_dev *dev); void pci_restore_aspm_l1ss_state(struct pci_dev *dev); #ifdef CONFIG_PCIEASPM void pcie_aspm_init_link_state(struct pci_dev *pdev); void pcie_aspm_exit_link_state(struct pci_dev *pdev); void pcie_aspm_pm_state_change(struct pci_dev *pdev, bool locked); void pcie_aspm_powersave_config_link(struct pci_dev *pdev); void pci_configure_ltr(struct pci_dev *pdev); void pci_bridge_reconfigure_ltr(struct pci_dev *pdev); #else static inline void pcie_aspm_init_link_state(struct pci_dev *pdev) { } static inline void pcie_aspm_exit_link_state(struct pci_dev *pdev) { } static inline void pcie_aspm_pm_state_change(struct pci_dev *pdev, bool locked) { } static inline void pcie_aspm_powersave_config_link(struct pci_dev *pdev) { } static inline void pci_configure_ltr(struct pci_dev *pdev) { } static inline void pci_bridge_reconfigure_ltr(struct pci_dev *pdev) { } #endif #ifdef CONFIG_PCIE_ECRC void pcie_set_ecrc_checking(struct pci_dev *dev); void pcie_ecrc_get_policy(char *str); #else static inline void pcie_set_ecrc_checking(struct pci_dev *dev) { } static inline void pcie_ecrc_get_policy(char *str) { } #endif #ifdef CONFIG_PCIEPORTBUS void pcie_reset_lbms_count(struct pci_dev *port); int pcie_lbms_count(struct pci_dev *port, unsigned long 
*val); #else static inline void pcie_reset_lbms_count(struct pci_dev *port) {} static inline int pcie_lbms_count(struct pci_dev *port, unsigned long *val) { return -EOPNOTSUPP; } #endif struct pci_dev_reset_methods { u16 vendor; u16 device; int (*reset)(struct pci_dev *dev, bool probe); }; struct pci_reset_fn_method { int (*reset_fn)(struct pci_dev *pdev, bool probe); char *name; }; extern const struct pci_reset_fn_method pci_reset_fn_methods[]; #ifdef CONFIG_PCI_QUIRKS int pci_dev_specific_reset(struct pci_dev *dev, bool probe); #else static inline int pci_dev_specific_reset(struct pci_dev *dev, bool probe) { return -ENOTTY; } #endif #if defined(CONFIG_PCI_QUIRKS) && defined(CONFIG_ARM64) int acpi_get_rc_resources(struct device *dev, const char *hid, u16 segment, struct resource *res); #else static inline int acpi_get_rc_resources(struct device *dev, const char *hid, u16 segment, struct resource *res) { return -ENODEV; } #endif void pci_rebar_init(struct pci_dev *pdev); int pci_rebar_get_current_size(struct pci_dev *pdev, int bar); int pci_rebar_set_size(struct pci_dev *pdev, int bar, int size); static inline u64 pci_rebar_size_to_bytes(int size) { return 1ULL << (size + 20); } struct device_node; #ifdef CONFIG_OF int of_get_pci_domain_nr(struct device_node *node); int of_pci_get_max_link_speed(struct device_node *node); u32 of_pci_get_slot_power_limit(struct device_node *node, u8 *slot_power_limit_value, u8 *slot_power_limit_scale); bool of_pci_preserve_config(struct device_node *node); int pci_set_of_node(struct pci_dev *dev); void pci_release_of_node(struct pci_dev *dev); void pci_set_bus_of_node(struct pci_bus *bus); void pci_release_bus_of_node(struct pci_bus *bus); int devm_of_pci_bridge_init(struct device *dev, struct pci_host_bridge *bridge); bool of_pci_supply_present(struct device_node *np); #else static inline int of_get_pci_domain_nr(struct device_node *node) { return -1; } static inline int of_pci_get_max_link_speed(struct device_node *node) { return 
-EINVAL; } static inline u32 of_pci_get_slot_power_limit(struct device_node *node, u8 *slot_power_limit_value, u8 *slot_power_limit_scale) { if (slot_power_limit_value) *slot_power_limit_value = 0; if (slot_power_limit_scale) *slot_power_limit_scale = 0; return 0; } static inline bool of_pci_preserve_config(struct device_node *node) { return false; } static inline int pci_set_of_node(struct pci_dev *dev) { return 0; } static inline void pci_release_of_node(struct pci_dev *dev) { } static inline void pci_set_bus_of_node(struct pci_bus *bus) { } static inline void pci_release_bus_of_node(struct pci_bus *bus) { } static inline int devm_of_pci_bridge_init(struct device *dev, struct pci_host_bridge *bridge) { return 0; } static inline bool of_pci_supply_present(struct device_node *np) { return false; } #endif /* CONFIG_OF */ struct of_changeset; #ifdef CONFIG_PCI_DYNAMIC_OF_NODES void of_pci_make_dev_node(struct pci_dev *pdev); void of_pci_remove_node(struct pci_dev *pdev); int of_pci_add_properties(struct pci_dev *pdev, struct of_changeset *ocs, struct device_node *np); void of_pci_make_host_bridge_node(struct pci_host_bridge *bridge); void of_pci_remove_host_bridge_node(struct pci_host_bridge *bridge); int of_pci_add_host_bridge_properties(struct pci_host_bridge *bridge, struct of_changeset *ocs, struct device_node *np); #else static inline void of_pci_make_dev_node(struct pci_dev *pdev) { } static inline void of_pci_remove_node(struct pci_dev *pdev) { } static inline void of_pci_make_host_bridge_node(struct pci_host_bridge *bridge) { } static inline void of_pci_remove_host_bridge_node(struct pci_host_bridge *bridge) { } #endif #ifdef CONFIG_PCIEAER void pci_no_aer(void); void pci_aer_init(struct pci_dev *dev); void pci_aer_exit(struct pci_dev *dev); extern const struct attribute_group aer_stats_attr_group; void pci_aer_clear_fatal_status(struct pci_dev *dev); int pci_aer_clear_status(struct pci_dev *dev); int pci_aer_raw_clear_status(struct pci_dev *dev); void 
pci_save_aer_state(struct pci_dev *dev); void pci_restore_aer_state(struct pci_dev *dev); #else static inline void pci_no_aer(void) { } static inline void pci_aer_init(struct pci_dev *d) { } static inline void pci_aer_exit(struct pci_dev *d) { } static inline void pci_aer_clear_fatal_status(struct pci_dev *dev) { } static inline int pci_aer_clear_status(struct pci_dev *dev) { return -EINVAL; } static inline int pci_aer_raw_clear_status(struct pci_dev *dev) { return -EINVAL; } static inline void pci_save_aer_state(struct pci_dev *dev) { } static inline void pci_restore_aer_state(struct pci_dev *dev) { } #endif #ifdef CONFIG_ACPI bool pci_acpi_preserve_config(struct pci_host_bridge *bridge); int pci_acpi_program_hp_params(struct pci_dev *dev); extern const struct attribute_group pci_dev_acpi_attr_group; void pci_set_acpi_fwnode(struct pci_dev *dev); int pci_dev_acpi_reset(struct pci_dev *dev, bool probe); bool acpi_pci_power_manageable(struct pci_dev *dev); bool acpi_pci_bridge_d3(struct pci_dev *dev); int acpi_pci_set_power_state(struct pci_dev *dev, pci_power_t state); pci_power_t acpi_pci_get_power_state(struct pci_dev *dev); void acpi_pci_refresh_power_state(struct pci_dev *dev); int acpi_pci_wakeup(struct pci_dev *dev, bool enable); bool acpi_pci_need_resume(struct pci_dev *dev); pci_power_t acpi_pci_choose_state(struct pci_dev *pdev); #else static inline bool pci_acpi_preserve_config(struct pci_host_bridge *bridge) { return false; } static inline int pci_dev_acpi_reset(struct pci_dev *dev, bool probe) { return -ENOTTY; } static inline void pci_set_acpi_fwnode(struct pci_dev *dev) { } static inline int pci_acpi_program_hp_params(struct pci_dev *dev) { return -ENODEV; } static inline bool acpi_pci_power_manageable(struct pci_dev *dev) { return false; } static inline bool acpi_pci_bridge_d3(struct pci_dev *dev) { return false; } static inline int acpi_pci_set_power_state(struct pci_dev *dev, pci_power_t state) { return -ENODEV; } static inline pci_power_t 
acpi_pci_get_power_state(struct pci_dev *dev) { return PCI_UNKNOWN; } static inline void acpi_pci_refresh_power_state(struct pci_dev *dev) { } static inline int acpi_pci_wakeup(struct pci_dev *dev, bool enable) { return -ENODEV; } static inline bool acpi_pci_need_resume(struct pci_dev *dev) { return false; } static inline pci_power_t acpi_pci_choose_state(struct pci_dev *pdev) { return PCI_POWER_ERROR; } #endif #ifdef CONFIG_PCIEASPM extern const struct attribute_group aspm_ctrl_attr_group; #endif #ifdef CONFIG_X86_INTEL_MID bool pci_use_mid_pm(void); int mid_pci_set_power_state(struct pci_dev *pdev, pci_power_t state); pci_power_t mid_pci_get_power_state(struct pci_dev *pdev); #else static inline bool pci_use_mid_pm(void) { return false; } static inline int mid_pci_set_power_state(struct pci_dev *pdev, pci_power_t state) { return -ENODEV; } static inline pci_power_t mid_pci_get_power_state(struct pci_dev *pdev) { return PCI_UNKNOWN; } #endif int pcim_intx(struct pci_dev *dev, int enable); int pcim_request_region_exclusive(struct pci_dev *pdev, int bar, const char *name); void pcim_release_region(struct pci_dev *pdev, int bar); /* * Config Address for PCI Configuration Mechanism #1 * * See PCI Local Bus Specification, Revision 3.0, * Section 3.2.2.3.2, Figure 3-2, p. 50. 
*/ #define PCI_CONF1_BUS_SHIFT 16 /* Bus number */ #define PCI_CONF1_DEV_SHIFT 11 /* Device number */ #define PCI_CONF1_FUNC_SHIFT 8 /* Function number */ #define PCI_CONF1_BUS_MASK 0xff #define PCI_CONF1_DEV_MASK 0x1f #define PCI_CONF1_FUNC_MASK 0x7 #define PCI_CONF1_REG_MASK 0xfc /* Limit aligned offset to a maximum of 256B */ #define PCI_CONF1_ENABLE BIT(31) #define PCI_CONF1_BUS(x) (((x) & PCI_CONF1_BUS_MASK) << PCI_CONF1_BUS_SHIFT) #define PCI_CONF1_DEV(x) (((x) & PCI_CONF1_DEV_MASK) << PCI_CONF1_DEV_SHIFT) #define PCI_CONF1_FUNC(x) (((x) & PCI_CONF1_FUNC_MASK) << PCI_CONF1_FUNC_SHIFT) #define PCI_CONF1_REG(x) ((x) & PCI_CONF1_REG_MASK) #define PCI_CONF1_ADDRESS(bus, dev, func, reg) \ (PCI_CONF1_ENABLE | \ PCI_CONF1_BUS(bus) | \ PCI_CONF1_DEV(dev) | \ PCI_CONF1_FUNC(func) | \ PCI_CONF1_REG(reg)) /* * Extension of PCI Config Address for accessing extended PCIe registers * * No standardized specification, but used on lot of non-ECAM-compliant ARM SoCs * or on AMD Barcelona and new CPUs. Reserved bits [27:24] of PCI Config Address * are used for specifying additional 4 high bits of PCI Express register. */ #define PCI_CONF1_EXT_REG_SHIFT 16 #define PCI_CONF1_EXT_REG_MASK 0xf00 #define PCI_CONF1_EXT_REG(x) (((x) & PCI_CONF1_EXT_REG_MASK) << PCI_CONF1_EXT_REG_SHIFT) #define PCI_CONF1_EXT_ADDRESS(bus, dev, func, reg) \ (PCI_CONF1_ADDRESS(bus, dev, func, reg) | \ PCI_CONF1_EXT_REG(reg)) #endif /* DRIVERS_PCI_H */ |
114 1 26 18 8 2 2 2 2 2 2 2 2 1 1 36 36 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 
517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 
1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 
1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 
1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 
2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 
2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 
3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 
3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 
3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 
4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 
4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 
5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 
5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 
5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 
6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 
6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 
7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 
7413 7414 7415 7416 7417 7418 7419 7420 7421 7422 7423 7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 7444 7445 7446 7447 7448 7449 7450 7451 7452 7453 7454 7455 7456 7457 7458 7459 7460 7461 7462 7463 7464 7465 7466 7467 7468 7469 7470 7471 7472 7473 7474 7475 7476 7477 7478 7479 7480 7481 7482 7483 7484 7485 7486 7487 7488 7489 7490 7491 7492 7493 7494 7495 7496 7497 7498 7499 7500 7501 7502 7503 7504 7505 7506 7507 7508 7509 7510 7511 7512 7513 7514 7515 7516 7517 7518 7519 7520 7521 7522 7523 7524 7525 7526 7527 7528 7529 7530 7531 7532 7533 7534 7535 7536 7537 7538 7539 7540 7541 7542 7543 7544 7545 7546 7547 7548 7549 7550 7551 7552 7553 7554 7555 7556 7557 7558 7559 7560 7561 7562 7563 7564 7565 7566 7567 7568 7569 7570 7571 7572 7573 7574 7575 7576 7577 7578 7579 7580 7581 7582 7583 7584 7585 7586 7587 7588 7589 7590 7591 7592 7593 7594 7595 7596 7597 7598 7599 7600 7601 7602 7603 7604 7605 7606 7607 7608 7609 7610 7611 7612 7613 7614 7615 7616 7617 7618 7619 7620 7621 7622 7623 7624 7625 7626 7627 7628 7629 7630 7631 7632 7633 7634 7635 7636 7637 7638 7639 7640 7641 7642 7643 7644 7645 7646 7647 7648 7649 7650 7651 7652 7653 7654 7655 7656 7657 7658 7659 7660 7661 7662 7663 7664 7665 7666 7667 7668 7669 7670 7671 7672 7673 7674 7675 7676 7677 7678 7679 7680 7681 7682 7683 7684 7685 7686 7687 7688 7689 7690 7691 7692 7693 7694 7695 7696 7697 7698 7699 7700 7701 7702 7703 7704 7705 7706 7707 7708 7709 7710 7711 7712 7713 7714 7715 7716 7717 7718 7719 7720 7721 7722 7723 7724 7725 7726 7727 7728 7729 7730 7731 7732 7733 7734 7735 7736 7737 7738 7739 7740 7741 7742 7743 7744 7745 7746 7747 7748 7749 7750 7751 7752 7753 7754 7755 7756 7757 7758 7759 7760 7761 7762 7763 7764 7765 7766 7767 7768 7769 7770 7771 7772 7773 7774 7775 7776 7777 7778 7779 7780 7781 7782 7783 7784 7785 7786 7787 7788 7789 7790 7791 7792 7793 7794 7795 7796 7797 7798 7799 7800 7801 7802 7803 7804 7805 7806 7807 7808 7809 7810 7811 7812 
7813 7814 7815 7816 7817 7818 7819 7820 7821 7822 7823 7824 7825 7826 7827 7828 7829 7830 7831 7832 7833 7834 7835 7836 7837 7838 7839 7840 7841 7842 7843 7844 7845 7846 7847 7848 7849 7850 7851 7852 7853 7854 7855 7856 7857 7858 7859 7860 7861 7862 7863 7864 7865 7866 7867 7868 7869 7870 7871 7872 7873 7874 7875 7876 7877 7878 7879 7880 7881 7882 7883 7884 7885 7886 7887 7888 7889 7890 7891 7892 7893 7894 7895 7896 7897 7898 7899 7900 7901 7902 7903 7904 7905 7906 7907 7908 7909 7910 7911 7912 7913 7914 7915 7916 7917 7918 7919 7920 7921 7922 7923 7924 7925 7926 7927 7928 7929 7930 7931 7932 7933 7934 7935 7936 7937 7938 7939 7940 7941 7942 7943 7944 7945 7946 7947 7948 7949 7950 7951 7952 7953 7954 7955 7956 7957 7958 7959 7960 7961 7962 7963 7964 7965 7966 7967 7968 7969 7970 7971 7972 7973 7974 7975 7976 7977 7978 7979 7980 7981 7982 7983 7984 7985 7986 7987 7988 7989 7990 7991 7992 7993 7994 7995 7996 7997 7998 7999 8000 8001 8002 8003 8004 8005 8006 8007 8008 8009 8010 8011 8012 8013 8014 8015 8016 8017 8018 8019 8020 8021 8022 8023 8024 8025 8026 8027 8028 8029 8030 8031 8032 8033 8034 8035 8036 8037 8038 8039 8040 8041 8042 8043 8044 8045 8046 8047 8048 8049 8050 8051 8052 8053 8054 8055 8056 8057 8058 8059 8060 8061 8062 8063 8064 8065 8066 8067 8068 8069 8070 8071 8072 8073 8074 8075 8076 8077 8078 8079 8080 8081 8082 8083 8084 8085 8086 8087 8088 8089 8090 8091 8092 8093 8094 8095 8096 8097 8098 8099 8100 8101 8102 8103 8104 8105 8106 8107 8108 8109 8110 8111 8112 8113 8114 8115 8116 8117 8118 8119 8120 8121 8122 8123 8124 8125 8126 8127 8128 8129 8130 8131 8132 8133 8134 8135 8136 8137 8138 8139 8140 8141 8142 8143 8144 8145 8146 8147 8148 8149 8150 8151 8152 8153 8154 8155 8156 8157 8158 8159 8160 8161 8162 8163 8164 8165 8166 8167 8168 8169 8170 8171 8172 8173 8174 8175 8176 8177 8178 8179 8180 8181 8182 8183 8184 8185 8186 8187 8188 8189 8190 8191 8192 8193 8194 8195 8196 8197 8198 8199 8200 8201 8202 8203 8204 8205 8206 8207 8208 8209 8210 8211 8212 
8213 8214 8215 8216 8217 8218 8219 8220 8221 8222 8223 8224 8225 8226 8227 8228 8229 8230 8231 8232 8233 8234 8235 8236 8237 8238 8239 8240 8241 8242 8243 8244 8245 8246 8247 8248 8249 8250 8251 8252 8253 8254 8255 8256 8257 8258 8259 8260 8261 8262 8263 8264 8265 8266 8267 8268 8269 8270 8271 8272 8273 8274 8275 8276 8277 8278 8279 8280 8281 8282 8283 8284 8285 8286 8287 8288 8289 8290 8291 8292 8293 8294 8295 8296 8297 8298 8299 8300 8301 8302 8303 8304 8305 8306 8307 8308 8309 8310 8311 8312 8313 8314 8315 8316 8317 8318 8319 8320 8321 8322 8323 8324 8325 8326 8327 8328 8329 8330 8331 8332 8333 8334 8335 8336 8337 8338 8339 8340 8341 8342 8343 8344 8345 8346 8347 8348 8349 8350 8351 8352 8353 8354 8355 8356 8357 8358 8359 8360 8361 8362 8363 8364 8365 8366 8367 8368 8369 8370 8371 8372 8373 8374 8375 8376 8377 8378 8379 8380 8381 8382 8383 8384 8385 8386 8387 8388 8389 8390 8391 8392 8393 8394 8395 8396 8397 8398 8399 8400 8401 8402 8403 8404 8405 8406 8407 8408 8409 8410 8411 8412 8413 8414 8415 8416 8417 8418 8419 8420 8421 8422 8423 8424 8425 8426 8427 8428 8429 8430 8431 8432 8433 8434 8435 8436 8437 8438 8439 8440 8441 8442 8443 8444 8445 8446 8447 8448 8449 8450 8451 8452 8453 8454 8455 8456 8457 8458 8459 8460 8461 8462 8463 8464 8465 8466 8467 8468 8469 8470 8471 8472 8473 8474 8475 8476 8477 8478 8479 8480 8481 8482 8483 8484 8485 8486 8487 8488 8489 8490 8491 8492 8493 8494 8495 8496 8497 8498 8499 8500 8501 8502 8503 8504 8505 8506 8507 8508 8509 8510 8511 8512 8513 8514 8515 8516 8517 8518 8519 8520 8521 8522 8523 8524 8525 8526 8527 8528 8529 8530 8531 8532 8533 8534 8535 8536 8537 8538 8539 8540 8541 8542 8543 8544 8545 8546 8547 8548 8549 8550 8551 8552 8553 8554 8555 8556 8557 8558 8559 8560 8561 8562 8563 8564 8565 8566 8567 8568 8569 8570 8571 8572 8573 8574 8575 8576 8577 8578 8579 8580 8581 8582 8583 8584 8585 8586 8587 8588 8589 8590 8591 8592 8593 8594 8595 8596 8597 8598 8599 8600 8601 8602 8603 8604 8605 8606 8607 8608 8609 8610 8611 8612 
8613 8614 8615 8616 8617 8618 8619 8620 8621 8622 8623 8624 8625 8626 8627 8628 8629 8630 8631 8632 8633 8634 8635 8636 8637 8638 8639 8640 8641 8642 8643 8644 8645 8646 8647 8648 8649 8650 8651 8652 8653 8654 8655 8656 8657 8658 8659 8660 8661 8662 8663 8664 8665 8666 8667 8668 8669 8670 8671 8672 8673 8674 8675 8676 8677 8678 8679 8680 8681 8682 8683 8684 8685 8686 8687 8688 8689 8690 8691 8692 8693 8694 8695 8696 8697 8698 8699 8700 8701 8702 8703 8704 8705 8706 8707 8708 8709 8710 8711 8712 8713 8714 8715 8716 8717 8718 8719 8720 8721 8722 8723 8724 8725 8726 8727 8728 8729 8730 8731 8732 8733 8734 8735 8736 8737 8738 8739 8740 8741 8742 8743 8744 8745 8746 8747 8748 8749 8750 8751 8752 8753 8754 8755 8756 8757 8758 8759 8760 8761 8762 8763 8764 8765 8766 8767 8768 8769 8770 8771 8772 8773 8774 8775 8776 8777 8778 8779 8780 8781 8782 8783 8784 8785 8786 8787 8788 8789 8790 8791 8792 8793 8794 8795 8796 8797 8798 8799 8800 8801 8802 8803 8804 8805 8806 8807 8808 8809 8810 8811 8812 8813 8814 8815 8816 8817 8818 8819 8820 8821 8822 8823 8824 8825 8826 8827 8828 8829 8830 8831 8832 8833 8834 8835 8836 8837 8838 8839 8840 8841 8842 8843 8844 8845 8846 8847 8848 8849 8850 8851 8852 8853 8854 8855 8856 8857 8858 8859 8860 8861 8862 8863 8864 8865 8866 8867 8868 8869 8870 8871 8872 8873 8874 8875 8876 8877 8878 8879 8880 8881 8882 8883 8884 8885 8886 8887 8888 8889 8890 8891 8892 8893 8894 8895 8896 8897 8898 8899 8900 8901 8902 8903 8904 8905 8906 8907 8908 8909 8910 8911 8912 8913 8914 8915 8916 8917 8918 8919 8920 8921 8922 8923 8924 8925 8926 8927 8928 8929 8930 8931 8932 8933 8934 8935 8936 8937 8938 8939 8940 8941 8942 8943 8944 8945 8946 8947 8948 8949 8950 8951 8952 8953 8954 8955 8956 8957 8958 8959 8960 8961 8962 8963 8964 8965 8966 8967 8968 8969 8970 8971 8972 8973 8974 8975 8976 8977 8978 8979 8980 8981 8982 8983 8984 8985 8986 8987 8988 8989 8990 8991 8992 8993 8994 8995 8996 8997 8998 8999 9000 9001 9002 9003 9004 9005 9006 9007 9008 9009 9010 9011 9012 
9013 9014 9015 9016 9017 9018 9019 9020 9021 9022 9023 9024 9025 9026 9027 9028 9029 9030 9031 9032 9033 9034 9035 9036 9037 9038 9039 9040 9041 9042 9043 9044 9045 9046 9047 9048 9049 9050 9051 9052 9053 9054 9055 9056 9057 9058 9059 9060 9061 9062 9063 9064 9065 9066 9067 9068 9069 9070 9071 9072 9073 9074 9075 9076 9077 9078 9079 9080 9081 9082 9083 9084 9085 9086 9087 9088 9089 9090 9091 9092 9093 9094 9095 9096 9097 9098 9099 9100 9101 9102 9103 9104 9105 9106 9107 9108 9109 9110 9111 9112 9113 9114 9115 9116 9117 9118 9119 9120 9121 9122 9123 9124 9125 9126 9127 9128 9129 9130 9131 9132 9133 9134 9135 9136 9137 9138 9139 9140 9141 9142 9143 9144 9145 9146 9147 9148 9149 9150 9151 9152 9153 9154 9155 9156 9157 9158 9159 9160 9161 9162 9163 9164 9165 9166 9167 9168 9169 9170 9171 9172 9173 9174 9175 9176 9177 9178 9179 9180 9181 9182 9183 9184 9185 9186 9187 9188 9189 9190 9191 9192 9193 9194 9195 9196 9197 9198 9199 9200 9201 9202 9203 9204 9205 9206 9207 9208 9209 9210 9211 9212 9213 9214 9215 9216 9217 9218 9219 9220 9221 9222 9223 9224 9225 9226 9227 9228 9229 9230 9231 9232 9233 9234 9235 9236 9237 9238 9239 9240 9241 9242 9243 9244 9245 9246 9247 9248 9249 9250 9251 9252 9253 9254 9255 9256 9257 9258 9259 9260 9261 9262 9263 9264 9265 9266 9267 9268 9269 9270 9271 9272 9273 9274 9275 9276 9277 9278 9279 9280 9281 9282 9283 9284 9285 9286 9287 9288 9289 9290 9291 9292 9293 9294 9295 9296 9297 9298 9299 9300 9301 9302 9303 9304 9305 9306 9307 9308 9309 9310 9311 9312 9313 9314 9315 9316 9317 9318 9319 9320 9321 9322 9323 9324 9325 9326 9327 9328 9329 9330 9331 9332 9333 9334 9335 9336 9337 9338 9339 9340 9341 9342 9343 9344 9345 9346 9347 9348 9349 9350 9351 9352 9353 9354 9355 9356 9357 9358 9359 9360 9361 9362 9363 9364 9365 9366 9367 9368 9369 9370 9371 9372 9373 9374 9375 9376 9377 9378 9379 9380 9381 9382 9383 9384 9385 9386 9387 9388 9389 9390 9391 9392 9393 9394 9395 9396 9397 9398 9399 9400 9401 9402 9403 9404 9405 9406 9407 9408 9409 9410 9411 9412 
9413 9414 9415 9416 9417 9418 9419 9420 9421 9422 9423 9424 9425 9426 9427 9428 9429 9430 9431 9432 9433 9434 9435 9436 9437 9438 9439 9440 9441 9442 9443 9444 9445 9446 9447 9448 9449 9450 9451 9452 9453 9454 9455 9456 9457 9458 9459 9460 9461 9462 9463 9464 9465 9466 9467 9468 9469 9470 9471 9472 9473 9474 9475 9476 9477 9478 9479 9480 9481 9482 9483 9484 9485 9486 9487 9488 9489 9490 9491 9492 9493 9494 9495 9496 9497 9498 9499 9500 9501 9502 9503 9504 9505 9506 9507 9508 9509 9510 9511 9512 9513 9514 9515 9516 9517 9518 9519 9520 9521 9522 9523 9524 9525 9526 9527 9528 9529 9530 9531 9532 9533 9534 9535 9536 9537 9538 9539 9540 9541 9542 9543 9544 9545 9546 9547 9548 9549 9550 9551 9552 9553 9554 9555 9556 9557 9558 9559 9560 9561 9562 9563 9564 9565 9566 9567 9568 9569 9570 9571 9572 9573 9574 9575 9576 9577 9578 9579 9580 9581 9582 9583 9584 9585 9586 9587 9588 9589 9590 9591 9592 9593 9594 9595 9596 9597 9598 9599 9600 9601 9602 9603 9604 9605 9606 9607 9608 9609 9610 9611 9612 9613 9614 9615 9616 9617 9618 9619 9620 9621 9622 9623 9624 9625 9626 9627 9628 9629 9630 9631 9632 9633 9634 9635 9636 9637 9638 9639 9640 9641 9642 9643 9644 9645 9646 9647 9648 9649 9650 9651 9652 9653 9654 9655 9656 9657 9658 9659 9660 9661 9662 9663 9664 9665 9666 9667 9668 9669 9670 9671 9672 9673 9674 9675 9676 9677 9678 9679 9680 9681 9682 9683 9684 9685 9686 9687 9688 9689 9690 9691 9692 9693 9694 9695 9696 9697 9698 9699 9700 9701 9702 9703 9704 9705 9706 9707 9708 9709 9710 9711 9712 9713 9714 9715 9716 9717 9718 9719 9720 9721 9722 9723 9724 9725 9726 9727 9728 9729 9730 9731 9732 9733 9734 9735 9736 9737 9738 9739 9740 9741 9742 9743 9744 9745 9746 9747 9748 9749 9750 9751 9752 9753 9754 9755 9756 9757 9758 9759 9760 9761 9762 9763 9764 9765 9766 9767 9768 9769 9770 9771 9772 9773 9774 9775 9776 9777 9778 9779 9780 9781 9782 9783 9784 9785 9786 9787 9788 9789 9790 9791 9792 9793 9794 9795 9796 9797 9798 9799 9800 9801 9802 9803 9804 9805 9806 9807 9808 9809 9810 9811 9812 
9813 9814 9815 9816 9817 9818 9819 9820 9821 9822 9823 9824 9825 9826 9827 9828 9829 9830 9831 9832 9833 9834 9835 9836 9837 9838 9839 9840 9841 9842 9843 9844 9845 9846 9847 9848 9849 9850 9851 9852 9853 9854 9855 9856 9857 9858 9859 9860 9861 9862 9863 9864 9865 9866 9867 9868 9869 9870 9871 9872 9873 9874 9875 9876 9877 9878 9879 9880 9881 9882 9883 9884 9885 9886 9887 9888 9889 9890 9891 9892 9893 9894 9895 9896 9897 9898 9899 9900 9901 9902 9903 9904 9905 9906 9907 9908 9909 9910 9911 9912 9913 9914 9915 9916 9917 9918 9919 9920 9921 9922 9923 9924 9925 9926 9927 9928 9929 9930 9931 9932 9933 9934 9935 9936 9937 9938 9939 9940 9941 9942 9943 9944 9945 9946 9947 9948 9949 9950 9951 9952 9953 9954 9955 9956 9957 9958 9959 9960 9961 9962 9963 9964 9965 9966 9967 9968 9969 9970 9971 9972 9973 9974 9975 9976 9977 9978 9979 9980 9981 9982 9983 9984 9985 9986 9987 9988 9989 9990 9991 9992 9993 9994 9995 9996 9997 9998 9999 10000 10001 10002 10003 10004 10005 10006 10007 10008 10009 10010 10011 10012 10013 10014 10015 10016 10017 10018 10019 10020 10021 10022 10023 10024 10025 10026 10027 10028 10029 10030 10031 10032 10033 10034 10035 10036 10037 10038 10039 10040 10041 10042 10043 10044 10045 10046 10047 10048 10049 10050 10051 10052 10053 10054 10055 10056 10057 10058 10059 10060 10061 10062 10063 10064 10065 10066 10067 10068 10069 10070 10071 10072 10073 10074 10075 10076 10077 10078 10079 10080 10081 10082 10083 10084 10085 10086 10087 10088 10089 10090 10091 10092 10093 10094 10095 10096 10097 10098 10099 10100 10101 10102 10103 10104 10105 10106 10107 10108 10109 10110 10111 10112 10113 10114 10115 10116 10117 10118 10119 10120 10121 10122 10123 10124 10125 10126 10127 10128 10129 10130 10131 10132 10133 10134 10135 10136 10137 10138 10139 10140 10141 10142 10143 10144 10145 10146 10147 10148 10149 10150 10151 10152 10153 10154 10155 10156 10157 10158 10159 10160 10161 10162 10163 10164 10165 10166 10167 10168 10169 10170 10171 10172 10173 10174 10175 10176 
10177 10178 10179 10180 10181 10182 10183 10184 10185 10186 10187 10188 10189 10190 10191 10192 10193 10194 10195 10196 10197 10198 10199 10200 10201 10202 10203 10204 10205 10206 10207 10208 10209 10210 10211 10212 10213 10214 10215 10216 10217 10218 10219 10220 10221 10222 10223 10224 10225 10226 10227 10228 10229 10230 10231 10232 10233 10234 10235 10236 10237 10238 10239 10240 10241 10242 10243 10244 10245 10246 10247 10248 10249 10250 10251 10252 10253 10254 10255 10256 10257 10258 10259 10260 10261 10262 10263 10264 10265 10266 10267 10268 10269 10270 10271 10272 10273 10274 10275 10276 10277 10278 10279 10280 10281 10282 10283 10284 10285 10286 10287 10288 10289 10290 10291 10292 10293 10294 10295 10296 10297 10298 10299 10300 10301 10302 10303 10304 10305 10306 10307 10308 10309 10310 10311 10312 10313 10314 10315 10316 10317 10318 10319 10320 10321 10322 10323 10324 10325 10326 10327 10328 10329 10330 10331 10332 10333 10334 10335 10336 10337 10338 10339 10340 10341 10342 10343 10344 10345 10346 10347 10348 10349 10350 10351 10352 10353 10354 10355 10356 10357 10358 10359 10360 10361 10362 10363 10364 10365 10366 10367 10368 10369 10370 10371 10372 10373 10374 10375 10376 10377 10378 10379 10380 10381 10382 10383 10384 10385 10386 10387 10388 10389 10390 10391 10392 10393 10394 10395 10396 10397 10398 10399 10400 10401 10402 10403 10404 10405 10406 10407 10408 10409 10410 10411 10412 10413 10414 10415 10416 10417 10418 10419 10420 10421 10422 10423 10424 10425 10426 10427 10428 10429 10430 10431 10432 10433 10434 10435 10436 10437 10438 10439 10440 10441 10442 10443 10444 10445 10446 10447 10448 10449 10450 10451 10452 10453 10454 10455 10456 10457 10458 10459 10460 10461 10462 10463 10464 10465 10466 10467 10468 10469 10470 10471 10472 10473 10474 10475 10476 10477 10478 | /* BlueZ - Bluetooth protocol stack for Linux Copyright (C) 2010 Nokia Corporation Copyright (C) 2011-2012 Intel Corporation This program is free software; you can redistribute it 
/* Opcodes that read_commands() reports as supported commands to sockets
 * that have the HCI_SOCK_TRUSTED flag set. The entries are copied into
 * the reply in the order listed here.
 */
static const u16 mgmt_commands[] = {
	MGMT_OP_READ_INDEX_LIST,
	MGMT_OP_READ_INFO,
	MGMT_OP_SET_POWERED,
	MGMT_OP_SET_DISCOVERABLE,
	MGMT_OP_SET_CONNECTABLE,
	MGMT_OP_SET_FAST_CONNECTABLE,
	MGMT_OP_SET_BONDABLE,
	MGMT_OP_SET_LINK_SECURITY,
	MGMT_OP_SET_SSP,
	MGMT_OP_SET_HS,
	MGMT_OP_SET_LE,
	MGMT_OP_SET_DEV_CLASS,
	MGMT_OP_SET_LOCAL_NAME,
	MGMT_OP_ADD_UUID,
	MGMT_OP_REMOVE_UUID,
	MGMT_OP_LOAD_LINK_KEYS,
	MGMT_OP_LOAD_LONG_TERM_KEYS,
	MGMT_OP_DISCONNECT,
	MGMT_OP_GET_CONNECTIONS,
	MGMT_OP_PIN_CODE_REPLY,
	MGMT_OP_PIN_CODE_NEG_REPLY,
	MGMT_OP_SET_IO_CAPABILITY,
	MGMT_OP_PAIR_DEVICE,
	MGMT_OP_CANCEL_PAIR_DEVICE,
	MGMT_OP_UNPAIR_DEVICE,
	MGMT_OP_USER_CONFIRM_REPLY,
	MGMT_OP_USER_CONFIRM_NEG_REPLY,
	MGMT_OP_USER_PASSKEY_REPLY,
	MGMT_OP_USER_PASSKEY_NEG_REPLY,
	MGMT_OP_READ_LOCAL_OOB_DATA,
	MGMT_OP_ADD_REMOTE_OOB_DATA,
	MGMT_OP_REMOVE_REMOTE_OOB_DATA,
	MGMT_OP_START_DISCOVERY,
	MGMT_OP_STOP_DISCOVERY,
	MGMT_OP_CONFIRM_NAME,
	MGMT_OP_BLOCK_DEVICE,
	MGMT_OP_UNBLOCK_DEVICE,
	MGMT_OP_SET_DEVICE_ID,
	MGMT_OP_SET_ADVERTISING,
	MGMT_OP_SET_BREDR,
	MGMT_OP_SET_STATIC_ADDRESS,
	MGMT_OP_SET_SCAN_PARAMS,
	MGMT_OP_SET_SECURE_CONN,
	MGMT_OP_SET_DEBUG_KEYS,
	MGMT_OP_SET_PRIVACY,
	MGMT_OP_LOAD_IRKS,
	MGMT_OP_GET_CONN_INFO,
	MGMT_OP_GET_CLOCK_INFO,
	MGMT_OP_ADD_DEVICE,
	MGMT_OP_REMOVE_DEVICE,
	MGMT_OP_LOAD_CONN_PARAM,
	MGMT_OP_READ_UNCONF_INDEX_LIST,
	MGMT_OP_READ_CONFIG_INFO,
	MGMT_OP_SET_EXTERNAL_CONFIG,
	MGMT_OP_SET_PUBLIC_ADDRESS,
	MGMT_OP_START_SERVICE_DISCOVERY,
	MGMT_OP_READ_LOCAL_OOB_EXT_DATA,
	MGMT_OP_READ_EXT_INDEX_LIST,
	MGMT_OP_READ_ADV_FEATURES,
	MGMT_OP_ADD_ADVERTISING,
	MGMT_OP_REMOVE_ADVERTISING,
	MGMT_OP_GET_ADV_SIZE_INFO,
	MGMT_OP_START_LIMITED_DISCOVERY,
	MGMT_OP_READ_EXT_INFO,
	MGMT_OP_SET_APPEARANCE,
	MGMT_OP_GET_PHY_CONFIGURATION,
	MGMT_OP_SET_PHY_CONFIGURATION,
	MGMT_OP_SET_BLOCKED_KEYS,
	MGMT_OP_SET_WIDEBAND_SPEECH,
	MGMT_OP_READ_CONTROLLER_CAP,
	MGMT_OP_READ_EXP_FEATURES_INFO,
	MGMT_OP_SET_EXP_FEATURE,
	MGMT_OP_READ_DEF_SYSTEM_CONFIG,
	MGMT_OP_SET_DEF_SYSTEM_CONFIG,
	MGMT_OP_READ_DEF_RUNTIME_CONFIG,
	MGMT_OP_SET_DEF_RUNTIME_CONFIG,
	MGMT_OP_GET_DEVICE_FLAGS,
	MGMT_OP_SET_DEVICE_FLAGS,
	MGMT_OP_READ_ADV_MONITOR_FEATURES,
	MGMT_OP_ADD_ADV_PATTERNS_MONITOR,
	MGMT_OP_REMOVE_ADV_MONITOR,
	MGMT_OP_ADD_EXT_ADV_PARAMS,
	MGMT_OP_ADD_EXT_ADV_DATA,
	MGMT_OP_ADD_ADV_PATTERNS_MONITOR_RSSI,
	MGMT_OP_SET_MESH_RECEIVER,
	MGMT_OP_MESH_READ_FEATURES,
	MGMT_OP_MESH_SEND,
	MGMT_OP_MESH_SEND_CANCEL,
	MGMT_OP_HCI_CMD_SYNC,
};
/* Opcodes that read_commands() reports as supported commands to sockets
 * that do NOT have the HCI_SOCK_TRUSTED flag set. Every entry here is a
 * MGMT_OP_READ_* opcode.
 */
static const u16 mgmt_untrusted_commands[] = {
	MGMT_OP_READ_INDEX_LIST,
	MGMT_OP_READ_INFO,
	MGMT_OP_READ_UNCONF_INDEX_LIST,
	MGMT_OP_READ_CONFIG_INFO,
	MGMT_OP_READ_EXT_INDEX_LIST,
	MGMT_OP_READ_EXT_INFO,
	MGMT_OP_READ_CONTROLLER_CAP,
	MGMT_OP_READ_EXP_FEATURES_INFO,
	MGMT_OP_READ_DEF_SYSTEM_CONFIG,
	MGMT_OP_READ_DEF_RUNTIME_CONFIG,
};
/* Authentication Failed */ MGMT_STATUS_AUTH_FAILED, /* PIN or Key Missing */ MGMT_STATUS_NO_RESOURCES, /* Memory Full */ MGMT_STATUS_TIMEOUT, /* Connection Timeout */ MGMT_STATUS_NO_RESOURCES, /* Max Number of Connections */ MGMT_STATUS_NO_RESOURCES, /* Max Number of SCO Connections */ MGMT_STATUS_ALREADY_CONNECTED, /* ACL Connection Exists */ MGMT_STATUS_BUSY, /* Command Disallowed */ MGMT_STATUS_NO_RESOURCES, /* Rejected Limited Resources */ MGMT_STATUS_REJECTED, /* Rejected Security */ MGMT_STATUS_REJECTED, /* Rejected Personal */ MGMT_STATUS_TIMEOUT, /* Host Timeout */ MGMT_STATUS_NOT_SUPPORTED, /* Unsupported Feature */ MGMT_STATUS_INVALID_PARAMS, /* Invalid Parameters */ MGMT_STATUS_DISCONNECTED, /* OE User Ended Connection */ MGMT_STATUS_NO_RESOURCES, /* OE Low Resources */ MGMT_STATUS_DISCONNECTED, /* OE Power Off */ MGMT_STATUS_DISCONNECTED, /* Connection Terminated */ MGMT_STATUS_BUSY, /* Repeated Attempts */ MGMT_STATUS_REJECTED, /* Pairing Not Allowed */ MGMT_STATUS_FAILED, /* Unknown LMP PDU */ MGMT_STATUS_NOT_SUPPORTED, /* Unsupported Remote Feature */ MGMT_STATUS_REJECTED, /* SCO Offset Rejected */ MGMT_STATUS_REJECTED, /* SCO Interval Rejected */ MGMT_STATUS_REJECTED, /* Air Mode Rejected */ MGMT_STATUS_INVALID_PARAMS, /* Invalid LMP Parameters */ MGMT_STATUS_FAILED, /* Unspecified Error */ MGMT_STATUS_NOT_SUPPORTED, /* Unsupported LMP Parameter Value */ MGMT_STATUS_FAILED, /* Role Change Not Allowed */ MGMT_STATUS_TIMEOUT, /* LMP Response Timeout */ MGMT_STATUS_FAILED, /* LMP Error Transaction Collision */ MGMT_STATUS_FAILED, /* LMP PDU Not Allowed */ MGMT_STATUS_REJECTED, /* Encryption Mode Not Accepted */ MGMT_STATUS_FAILED, /* Unit Link Key Used */ MGMT_STATUS_NOT_SUPPORTED, /* QoS Not Supported */ MGMT_STATUS_TIMEOUT, /* Instant Passed */ MGMT_STATUS_NOT_SUPPORTED, /* Pairing Not Supported */ MGMT_STATUS_FAILED, /* Transaction Collision */ MGMT_STATUS_FAILED, /* Reserved for future use */ MGMT_STATUS_INVALID_PARAMS, /* Unacceptable Parameter 
*/ MGMT_STATUS_REJECTED, /* QoS Rejected */ MGMT_STATUS_NOT_SUPPORTED, /* Classification Not Supported */ MGMT_STATUS_REJECTED, /* Insufficient Security */ MGMT_STATUS_INVALID_PARAMS, /* Parameter Out Of Range */ MGMT_STATUS_FAILED, /* Reserved for future use */ MGMT_STATUS_BUSY, /* Role Switch Pending */ MGMT_STATUS_FAILED, /* Reserved for future use */ MGMT_STATUS_FAILED, /* Slot Violation */ MGMT_STATUS_FAILED, /* Role Switch Failed */ MGMT_STATUS_INVALID_PARAMS, /* EIR Too Large */ MGMT_STATUS_NOT_SUPPORTED, /* Simple Pairing Not Supported */ MGMT_STATUS_BUSY, /* Host Busy Pairing */ MGMT_STATUS_REJECTED, /* Rejected, No Suitable Channel */ MGMT_STATUS_BUSY, /* Controller Busy */ MGMT_STATUS_INVALID_PARAMS, /* Unsuitable Connection Interval */ MGMT_STATUS_TIMEOUT, /* Directed Advertising Timeout */ MGMT_STATUS_AUTH_FAILED, /* Terminated Due to MIC Failure */ MGMT_STATUS_CONNECT_FAILED, /* Connection Establishment Failed */ MGMT_STATUS_CONNECT_FAILED, /* MAC Connection Failed */ }; static u8 mgmt_errno_status(int err) { switch (err) { case 0: return MGMT_STATUS_SUCCESS; case -EPERM: return MGMT_STATUS_REJECTED; case -EINVAL: return MGMT_STATUS_INVALID_PARAMS; case -EOPNOTSUPP: return MGMT_STATUS_NOT_SUPPORTED; case -EBUSY: return MGMT_STATUS_BUSY; case -ETIMEDOUT: return MGMT_STATUS_AUTH_FAILED; case -ENOMEM: return MGMT_STATUS_NO_RESOURCES; case -EISCONN: return MGMT_STATUS_ALREADY_CONNECTED; case -ENOTCONN: return MGMT_STATUS_DISCONNECTED; } return MGMT_STATUS_FAILED; } static u8 mgmt_status(int err) { if (err < 0) return mgmt_errno_status(err); if (err < ARRAY_SIZE(mgmt_status_table)) return mgmt_status_table[err]; return MGMT_STATUS_FAILED; } static int mgmt_index_event(u16 event, struct hci_dev *hdev, void *data, u16 len, int flag) { return mgmt_send_event(event, hdev, HCI_CHANNEL_CONTROL, data, len, flag, NULL); } static int mgmt_limited_event(u16 event, struct hci_dev *hdev, void *data, u16 len, int flag, struct sock *skip_sk) { return 
/* Handler for MGMT_OP_READ_COMMANDS.
 *
 * Replies with the list of supported command opcodes followed by the
 * list of supported event codes, each stored as an unaligned
 * little-endian 16-bit value. Sockets flagged HCI_SOCK_TRUSTED get
 * mgmt_commands/mgmt_events, all others get the untrusted variants.
 *
 * @hdev is only used for debug output; @data and @data_len are unused.
 *
 * Return: -ENOMEM if the reply cannot be allocated, otherwise the
 * result of mgmt_cmd_complete().
 */
static int read_commands(struct sock *sk, struct hci_dev *hdev, void *data,
			 u16 data_len)
{
	struct mgmt_rp_read_commands *rp;
	const u16 *commands, *events;
	u16 num_commands, num_events;
	__le16 *opcode;
	size_t rp_size;
	int i, err;

	bt_dev_dbg(hdev, "sock %p", sk);

	/* Sample the trust flag exactly once so that the element counts
	 * and the tables copied below always belong together.
	 */
	if (hci_sock_test_flag(sk, HCI_SOCK_TRUSTED)) {
		commands = mgmt_commands;
		num_commands = ARRAY_SIZE(mgmt_commands);
		events = mgmt_events;
		num_events = ARRAY_SIZE(mgmt_events);
	} else {
		commands = mgmt_untrusted_commands;
		num_commands = ARRAY_SIZE(mgmt_untrusted_commands);
		events = mgmt_untrusted_events;
		num_events = ARRAY_SIZE(mgmt_untrusted_events);
	}

	/* Header plus one opcode slot per command and per event */
	rp_size = struct_size(rp, opcodes, num_commands + num_events);

	rp = kmalloc(rp_size, GFP_KERNEL);
	if (!rp)
		return -ENOMEM;

	rp->num_commands = cpu_to_le16(num_commands);
	rp->num_events = cpu_to_le16(num_events);

	/* Commands first, events directly after them */
	opcode = rp->opcodes;

	for (i = 0; i < num_commands; i++, opcode++)
		put_unaligned_le16(commands[i], opcode);

	for (i = 0; i < num_events; i++, opcode++)
		put_unaligned_le16(events[i], opcode);

	err = mgmt_cmd_complete(sk, MGMT_INDEX_NONE, MGMT_OP_READ_COMMANDS, 0,
				rp, rp_size);
	kfree(rp);

	return err;
}
*/ if (test_bit(HCI_QUIRK_RAW_DEVICE, &d->quirks)) continue; if (!hci_dev_test_flag(d, HCI_UNCONFIGURED)) { rp->index[count++] = cpu_to_le16(d->id); bt_dev_dbg(hdev, "Added hci%u", d->id); } } rp->num_controllers = cpu_to_le16(count); rp_len = sizeof(*rp) + (2 * count); read_unlock(&hci_dev_list_lock); err = mgmt_cmd_complete(sk, MGMT_INDEX_NONE, MGMT_OP_READ_INDEX_LIST, 0, rp, rp_len); kfree(rp); return err; } static int read_unconf_index_list(struct sock *sk, struct hci_dev *hdev, void *data, u16 data_len) { struct mgmt_rp_read_unconf_index_list *rp; struct hci_dev *d; size_t rp_len; u16 count; int err; bt_dev_dbg(hdev, "sock %p", sk); read_lock(&hci_dev_list_lock); count = 0; list_for_each_entry(d, &hci_dev_list, list) { if (hci_dev_test_flag(d, HCI_UNCONFIGURED)) count++; } rp_len = sizeof(*rp) + (2 * count); rp = kmalloc(rp_len, GFP_ATOMIC); if (!rp) { read_unlock(&hci_dev_list_lock); return -ENOMEM; } count = 0; list_for_each_entry(d, &hci_dev_list, list) { if (hci_dev_test_flag(d, HCI_SETUP) || hci_dev_test_flag(d, HCI_CONFIG) || hci_dev_test_flag(d, HCI_USER_CHANNEL)) continue; /* Devices marked as raw-only are neither configured * nor unconfigured controllers. 
*/ if (test_bit(HCI_QUIRK_RAW_DEVICE, &d->quirks)) continue; if (hci_dev_test_flag(d, HCI_UNCONFIGURED)) { rp->index[count++] = cpu_to_le16(d->id); bt_dev_dbg(hdev, "Added hci%u", d->id); } } rp->num_controllers = cpu_to_le16(count); rp_len = sizeof(*rp) + (2 * count); read_unlock(&hci_dev_list_lock); err = mgmt_cmd_complete(sk, MGMT_INDEX_NONE, MGMT_OP_READ_UNCONF_INDEX_LIST, 0, rp, rp_len); kfree(rp); return err; } static int read_ext_index_list(struct sock *sk, struct hci_dev *hdev, void *data, u16 data_len) { struct mgmt_rp_read_ext_index_list *rp; struct hci_dev *d; u16 count; int err; bt_dev_dbg(hdev, "sock %p", sk); read_lock(&hci_dev_list_lock); count = 0; list_for_each_entry(d, &hci_dev_list, list) count++; rp = kmalloc(struct_size(rp, entry, count), GFP_ATOMIC); if (!rp) { read_unlock(&hci_dev_list_lock); return -ENOMEM; } count = 0; list_for_each_entry(d, &hci_dev_list, list) { if (hci_dev_test_flag(d, HCI_SETUP) || hci_dev_test_flag(d, HCI_CONFIG) || hci_dev_test_flag(d, HCI_USER_CHANNEL)) continue; /* Devices marked as raw-only are neither configured * nor unconfigured controllers. */ if (test_bit(HCI_QUIRK_RAW_DEVICE, &d->quirks)) continue; if (hci_dev_test_flag(d, HCI_UNCONFIGURED)) rp->entry[count].type = 0x01; else rp->entry[count].type = 0x00; rp->entry[count].bus = d->bus; rp->entry[count++].index = cpu_to_le16(d->id); bt_dev_dbg(hdev, "Added hci%u", d->id); } rp->num_controllers = cpu_to_le16(count); read_unlock(&hci_dev_list_lock); /* If this command is called at least once, then all the * default index and unconfigured index events are disabled * and from now on only extended index events are used. 
*/ hci_sock_set_flag(sk, HCI_MGMT_EXT_INDEX_EVENTS); hci_sock_clear_flag(sk, HCI_MGMT_INDEX_EVENTS); hci_sock_clear_flag(sk, HCI_MGMT_UNCONF_INDEX_EVENTS); err = mgmt_cmd_complete(sk, MGMT_INDEX_NONE, MGMT_OP_READ_EXT_INDEX_LIST, 0, rp, struct_size(rp, entry, count)); kfree(rp); return err; } static bool is_configured(struct hci_dev *hdev) { if (test_bit(HCI_QUIRK_EXTERNAL_CONFIG, &hdev->quirks) && !hci_dev_test_flag(hdev, HCI_EXT_CONFIGURED)) return false; if ((test_bit(HCI_QUIRK_INVALID_BDADDR, &hdev->quirks) || test_bit(HCI_QUIRK_USE_BDADDR_PROPERTY, &hdev->quirks)) && !bacmp(&hdev->public_addr, BDADDR_ANY)) return false; return true; } static __le32 get_missing_options(struct hci_dev *hdev) { u32 options = 0; if (test_bit(HCI_QUIRK_EXTERNAL_CONFIG, &hdev->quirks) && !hci_dev_test_flag(hdev, HCI_EXT_CONFIGURED)) options |= MGMT_OPTION_EXTERNAL_CONFIG; if ((test_bit(HCI_QUIRK_INVALID_BDADDR, &hdev->quirks) || test_bit(HCI_QUIRK_USE_BDADDR_PROPERTY, &hdev->quirks)) && !bacmp(&hdev->public_addr, BDADDR_ANY)) options |= MGMT_OPTION_PUBLIC_ADDRESS; return cpu_to_le32(options); } static int new_options(struct hci_dev *hdev, struct sock *skip) { __le32 options = get_missing_options(hdev); return mgmt_limited_event(MGMT_EV_NEW_CONFIG_OPTIONS, hdev, &options, sizeof(options), HCI_MGMT_OPTION_EVENTS, skip); } static int send_options_rsp(struct sock *sk, u16 opcode, struct hci_dev *hdev) { __le32 options = get_missing_options(hdev); return mgmt_cmd_complete(sk, hdev->id, opcode, 0, &options, sizeof(options)); } static int read_config_info(struct sock *sk, struct hci_dev *hdev, void *data, u16 data_len) { struct mgmt_rp_read_config_info rp; u32 options = 0; bt_dev_dbg(hdev, "sock %p", sk); hci_dev_lock(hdev); memset(&rp, 0, sizeof(rp)); rp.manufacturer = cpu_to_le16(hdev->manufacturer); if (test_bit(HCI_QUIRK_EXTERNAL_CONFIG, &hdev->quirks)) options |= MGMT_OPTION_EXTERNAL_CONFIG; if (hdev->set_bdaddr) options |= MGMT_OPTION_PUBLIC_ADDRESS; rp.supported_options = 
cpu_to_le32(options); rp.missing_options = get_missing_options(hdev); hci_dev_unlock(hdev); return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_READ_CONFIG_INFO, 0, &rp, sizeof(rp)); } static u32 get_supported_phys(struct hci_dev *hdev) { u32 supported_phys = 0; if (lmp_bredr_capable(hdev)) { supported_phys |= MGMT_PHY_BR_1M_1SLOT; if (hdev->features[0][0] & LMP_3SLOT) supported_phys |= MGMT_PHY_BR_1M_3SLOT; if (hdev->features[0][0] & LMP_5SLOT) supported_phys |= MGMT_PHY_BR_1M_5SLOT; if (lmp_edr_2m_capable(hdev)) { supported_phys |= MGMT_PHY_EDR_2M_1SLOT; if (lmp_edr_3slot_capable(hdev)) supported_phys |= MGMT_PHY_EDR_2M_3SLOT; if (lmp_edr_5slot_capable(hdev)) supported_phys |= MGMT_PHY_EDR_2M_5SLOT; if (lmp_edr_3m_capable(hdev)) { supported_phys |= MGMT_PHY_EDR_3M_1SLOT; if (lmp_edr_3slot_capable(hdev)) supported_phys |= MGMT_PHY_EDR_3M_3SLOT; if (lmp_edr_5slot_capable(hdev)) supported_phys |= MGMT_PHY_EDR_3M_5SLOT; } } } if (lmp_le_capable(hdev)) { supported_phys |= MGMT_PHY_LE_1M_TX; supported_phys |= MGMT_PHY_LE_1M_RX; if (hdev->le_features[1] & HCI_LE_PHY_2M) { supported_phys |= MGMT_PHY_LE_2M_TX; supported_phys |= MGMT_PHY_LE_2M_RX; } if (hdev->le_features[1] & HCI_LE_PHY_CODED) { supported_phys |= MGMT_PHY_LE_CODED_TX; supported_phys |= MGMT_PHY_LE_CODED_RX; } } return supported_phys; } static u32 get_selected_phys(struct hci_dev *hdev) { u32 selected_phys = 0; if (lmp_bredr_capable(hdev)) { selected_phys |= MGMT_PHY_BR_1M_1SLOT; if (hdev->pkt_type & (HCI_DM3 | HCI_DH3)) selected_phys |= MGMT_PHY_BR_1M_3SLOT; if (hdev->pkt_type & (HCI_DM5 | HCI_DH5)) selected_phys |= MGMT_PHY_BR_1M_5SLOT; if (lmp_edr_2m_capable(hdev)) { if (!(hdev->pkt_type & HCI_2DH1)) selected_phys |= MGMT_PHY_EDR_2M_1SLOT; if (lmp_edr_3slot_capable(hdev) && !(hdev->pkt_type & HCI_2DH3)) selected_phys |= MGMT_PHY_EDR_2M_3SLOT; if (lmp_edr_5slot_capable(hdev) && !(hdev->pkt_type & HCI_2DH5)) selected_phys |= MGMT_PHY_EDR_2M_5SLOT; if (lmp_edr_3m_capable(hdev)) { if (!(hdev->pkt_type & 
HCI_3DH1)) selected_phys |= MGMT_PHY_EDR_3M_1SLOT; if (lmp_edr_3slot_capable(hdev) && !(hdev->pkt_type & HCI_3DH3)) selected_phys |= MGMT_PHY_EDR_3M_3SLOT; if (lmp_edr_5slot_capable(hdev) && !(hdev->pkt_type & HCI_3DH5)) selected_phys |= MGMT_PHY_EDR_3M_5SLOT; } } } if (lmp_le_capable(hdev)) { if (hdev->le_tx_def_phys & HCI_LE_SET_PHY_1M) selected_phys |= MGMT_PHY_LE_1M_TX; if (hdev->le_rx_def_phys & HCI_LE_SET_PHY_1M) selected_phys |= MGMT_PHY_LE_1M_RX; if (hdev->le_tx_def_phys & HCI_LE_SET_PHY_2M) selected_phys |= MGMT_PHY_LE_2M_TX; if (hdev->le_rx_def_phys & HCI_LE_SET_PHY_2M) selected_phys |= MGMT_PHY_LE_2M_RX; if (hdev->le_tx_def_phys & HCI_LE_SET_PHY_CODED) selected_phys |= MGMT_PHY_LE_CODED_TX; if (hdev->le_rx_def_phys & HCI_LE_SET_PHY_CODED) selected_phys |= MGMT_PHY_LE_CODED_RX; } return selected_phys; } static u32 get_configurable_phys(struct hci_dev *hdev) { return (get_supported_phys(hdev) & ~MGMT_PHY_BR_1M_1SLOT & ~MGMT_PHY_LE_1M_TX & ~MGMT_PHY_LE_1M_RX); } static u32 get_supported_settings(struct hci_dev *hdev) { u32 settings = 0; settings |= MGMT_SETTING_POWERED; settings |= MGMT_SETTING_BONDABLE; settings |= MGMT_SETTING_DEBUG_KEYS; settings |= MGMT_SETTING_CONNECTABLE; settings |= MGMT_SETTING_DISCOVERABLE; if (lmp_bredr_capable(hdev)) { if (hdev->hci_ver >= BLUETOOTH_VER_1_2) settings |= MGMT_SETTING_FAST_CONNECTABLE; settings |= MGMT_SETTING_BREDR; settings |= MGMT_SETTING_LINK_SECURITY; if (lmp_ssp_capable(hdev)) { settings |= MGMT_SETTING_SSP; } if (lmp_sc_capable(hdev)) settings |= MGMT_SETTING_SECURE_CONN; if (test_bit(HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED, &hdev->quirks)) settings |= MGMT_SETTING_WIDEBAND_SPEECH; } if (lmp_le_capable(hdev)) { settings |= MGMT_SETTING_LE; settings |= MGMT_SETTING_SECURE_CONN; settings |= MGMT_SETTING_PRIVACY; settings |= MGMT_SETTING_STATIC_ADDRESS; settings |= MGMT_SETTING_ADVERTISING; } if (test_bit(HCI_QUIRK_EXTERNAL_CONFIG, &hdev->quirks) || hdev->set_bdaddr) settings |= MGMT_SETTING_CONFIGURATION; if 
(cis_central_capable(hdev)) settings |= MGMT_SETTING_CIS_CENTRAL; if (cis_peripheral_capable(hdev)) settings |= MGMT_SETTING_CIS_PERIPHERAL; if (ll_privacy_capable(hdev)) settings |= MGMT_SETTING_LL_PRIVACY; settings |= MGMT_SETTING_PHY_CONFIGURATION; return settings; } static u32 get_current_settings(struct hci_dev *hdev) { u32 settings = 0; if (hdev_is_powered(hdev)) settings |= MGMT_SETTING_POWERED; if (hci_dev_test_flag(hdev, HCI_CONNECTABLE)) settings |= MGMT_SETTING_CONNECTABLE; if (hci_dev_test_flag(hdev, HCI_FAST_CONNECTABLE)) settings |= MGMT_SETTING_FAST_CONNECTABLE; if (hci_dev_test_flag(hdev, HCI_DISCOVERABLE)) settings |= MGMT_SETTING_DISCOVERABLE; if (hci_dev_test_flag(hdev, HCI_BONDABLE)) settings |= MGMT_SETTING_BONDABLE; if (hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) settings |= MGMT_SETTING_BREDR; if (hci_dev_test_flag(hdev, HCI_LE_ENABLED)) settings |= MGMT_SETTING_LE; if (hci_dev_test_flag(hdev, HCI_LINK_SECURITY)) settings |= MGMT_SETTING_LINK_SECURITY; if (hci_dev_test_flag(hdev, HCI_SSP_ENABLED)) settings |= MGMT_SETTING_SSP; if (hci_dev_test_flag(hdev, HCI_ADVERTISING)) settings |= MGMT_SETTING_ADVERTISING; if (hci_dev_test_flag(hdev, HCI_SC_ENABLED)) settings |= MGMT_SETTING_SECURE_CONN; if (hci_dev_test_flag(hdev, HCI_KEEP_DEBUG_KEYS)) settings |= MGMT_SETTING_DEBUG_KEYS; if (hci_dev_test_flag(hdev, HCI_PRIVACY)) settings |= MGMT_SETTING_PRIVACY; /* The current setting for static address has two purposes. The * first is to indicate if the static address will be used and * the second is to indicate if it is actually set. * * This means if the static address is not configured, this flag * will never be set. If the address is configured, then if the * address is actually used decides if the flag is set or not. * * For single mode LE only controllers and dual-mode controllers * with BR/EDR disabled, the existence of the static address will * be evaluated. 
*/
	if (hci_dev_test_flag(hdev, HCI_FORCE_STATIC_ADDR) ||
	    !hci_dev_test_flag(hdev, HCI_BREDR_ENABLED) ||
	    !bacmp(&hdev->bdaddr, BDADDR_ANY)) {
		if (bacmp(&hdev->static_addr, BDADDR_ANY))
			settings |= MGMT_SETTING_STATIC_ADDRESS;
	}

	if (hci_dev_test_flag(hdev, HCI_WIDEBAND_SPEECH_ENABLED))
		settings |= MGMT_SETTING_WIDEBAND_SPEECH;

	/* The remaining bits are derived from controller capability checks
	 * rather than from hdev flags.
	 */
	if (cis_central_capable(hdev))
		settings |= MGMT_SETTING_CIS_CENTRAL;

	if (cis_peripheral_capable(hdev))
		settings |= MGMT_SETTING_CIS_PERIPHERAL;

	if (bis_capable(hdev))
		settings |= MGMT_SETTING_ISO_BROADCASTER;

	if (sync_recv_capable(hdev))
		settings |= MGMT_SETTING_ISO_SYNC_RECEIVER;

	if (ll_privacy_capable(hdev))
		settings |= MGMT_SETTING_LL_PRIVACY;

	return settings;
}

/* Look up a pending mgmt command with the given opcode for hdev on the
 * control channel.
 */
static struct mgmt_pending_cmd *pending_find(u16 opcode, struct hci_dev *hdev)
{
	return mgmt_pending_find(HCI_CHANNEL_CONTROL, opcode, hdev);
}

/* Return the LE advertising discoverability flag (LE_AD_GENERAL,
 * LE_AD_LIMITED or 0), preferring the value requested by a pending
 * Set Discoverable command over the current hdev flags.
 */
u8 mgmt_get_adv_discov_flags(struct hci_dev *hdev)
{
	struct mgmt_pending_cmd *cmd;

	/* If there's a pending mgmt command the flags will not yet have
	 * their final values, so check for this first.
	 */
	cmd = pending_find(MGMT_OP_SET_DISCOVERABLE, hdev);
	if (cmd) {
		struct mgmt_mode *cp = cmd->param;
		if (cp->val == 0x01)
			return LE_AD_GENERAL;
		else if (cp->val == 0x02)
			return LE_AD_LIMITED;
	} else {
		if (hci_dev_test_flag(hdev, HCI_LIMITED_DISCOVERABLE))
			return LE_AD_LIMITED;
		else if (hci_dev_test_flag(hdev, HCI_DISCOVERABLE))
			return LE_AD_GENERAL;
	}

	return 0;
}

/* Return whether the device is (or is about to become) connectable: the
 * value of a pending Set Connectable command wins over the HCI_CONNECTABLE
 * flag.
 */
bool mgmt_get_connectable(struct hci_dev *hdev)
{
	struct mgmt_pending_cmd *cmd;

	/* If there's a pending mgmt command the flag will not yet have
	 * its final value, so check for this first.
*/ cmd = pending_find(MGMT_OP_SET_CONNECTABLE, hdev); if (cmd) { struct mgmt_mode *cp = cmd->param; return cp->val; } return hci_dev_test_flag(hdev, HCI_CONNECTABLE); } static int service_cache_sync(struct hci_dev *hdev, void *data) { hci_update_eir_sync(hdev); hci_update_class_sync(hdev); return 0; } static void service_cache_off(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, service_cache.work); if (!hci_dev_test_and_clear_flag(hdev, HCI_SERVICE_CACHE)) return; hci_cmd_sync_queue(hdev, service_cache_sync, NULL, NULL); } static int rpa_expired_sync(struct hci_dev *hdev, void *data) { /* The generation of a new RPA and programming it into the * controller happens in the hci_req_enable_advertising() * function. */ if (ext_adv_capable(hdev)) return hci_start_ext_adv_sync(hdev, hdev->cur_adv_instance); else return hci_enable_advertising_sync(hdev); } static void rpa_expired(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, rpa_expired.work); bt_dev_dbg(hdev, ""); hci_dev_set_flag(hdev, HCI_RPA_EXPIRED); if (!hci_dev_test_flag(hdev, HCI_ADVERTISING)) return; hci_cmd_sync_queue(hdev, rpa_expired_sync, NULL, NULL); } static int set_discoverable_sync(struct hci_dev *hdev, void *data); static void discov_off(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, discov_off.work); bt_dev_dbg(hdev, ""); hci_dev_lock(hdev); /* When discoverable timeout triggers, then just make sure * the limited discoverable flag is cleared. Even in the case * of a timeout triggered from general discoverable, it is * safe to unconditionally clear the flag. 
*/
	hci_dev_clear_flag(hdev, HCI_LIMITED_DISCOVERABLE);
	hci_dev_clear_flag(hdev, HCI_DISCOVERABLE);
	hdev->discov_timeout = 0;

	hci_cmd_sync_queue(hdev, set_discoverable_sync, NULL, NULL);

	mgmt_new_settings(hdev);

	hci_dev_unlock(hdev);
}

static int send_settings_rsp(struct sock *sk, u16 opcode, struct hci_dev *hdev);

/* Finish a mesh transmission: unless @silent, emit
 * MGMT_EV_MESH_PACKET_CMPLT carrying the handle, then remove the entry.
 */
static void mesh_send_complete(struct hci_dev *hdev,
			       struct mgmt_mesh_tx *mesh_tx, bool silent)
{
	/* Copy the handle first; mesh_tx is removed at the end */
	u8 handle = mesh_tx->handle;

	if (!silent)
		mgmt_event(MGMT_EV_MESH_PACKET_CMPLT, hdev, &handle,
			   sizeof(handle), NULL);

	mgmt_mesh_remove(mesh_tx);
}

/* hci_cmd_sync callback: clear HCI_MESH_SENDING, disable advertising and
 * complete the next queued mesh transmission, if any. Always returns 0.
 */
static int mesh_send_done_sync(struct hci_dev *hdev, void *data)
{
	struct mgmt_mesh_tx *mesh_tx;

	hci_dev_clear_flag(hdev, HCI_MESH_SENDING);
	hci_disable_advertising_sync(hdev);
	mesh_tx = mgmt_mesh_next(hdev, NULL);

	if (mesh_tx)
		mesh_send_complete(hdev, mesh_tx, false);

	return 0;
}

static int mesh_send_sync(struct hci_dev *hdev, void *data);
static void mesh_send_start_complete(struct hci_dev *hdev, void *data, int err);

/* Completion callback of mesh_send_done_sync(): queue the next pending
 * mesh transmission. If queuing fails the entry is completed right away,
 * otherwise HCI_MESH_SENDING is set. The incoming @err is overwritten.
 */
static void mesh_next(struct hci_dev *hdev, void *data, int err)
{
	struct mgmt_mesh_tx *mesh_tx = mgmt_mesh_next(hdev, NULL);

	if (!mesh_tx)
		return;

	err = hci_cmd_sync_queue(hdev, mesh_send_sync, mesh_tx,
				 mesh_send_start_complete);

	if (err < 0)
		mesh_send_complete(hdev, mesh_tx, false);
	else
		hci_dev_set_flag(hdev, HCI_MESH_SENDING);
}

/* Delayed work: when a mesh send is in progress, queue its completion
 * handling followed by mesh_next().
 */
static void mesh_send_done(struct work_struct *work)
{
	struct hci_dev *hdev = container_of(work, struct hci_dev,
					    mesh_send_done.work);

	if (!hci_dev_test_flag(hdev, HCI_MESH_SENDING))
		return;

	hci_cmd_sync_queue(hdev, mesh_send_done_sync, NULL, mesh_next);
}

/* One-time mgmt setup of hdev: does nothing when HCI_MGMT is already set,
 * otherwise initializes the delayed works used by this file.
 */
static void mgmt_init_hdev(struct sock *sk, struct hci_dev *hdev)
{
	if (hci_dev_test_flag(hdev, HCI_MGMT))
		return;

	BT_INFO("MGMT ver %d.%d", MGMT_VERSION, MGMT_REVISION);

	INIT_DELAYED_WORK(&hdev->discov_off, discov_off);
	INIT_DELAYED_WORK(&hdev->service_cache, service_cache_off);
	INIT_DELAYED_WORK(&hdev->rpa_expired, rpa_expired);
	INIT_DELAYED_WORK(&hdev->mesh_send_done, mesh_send_done);

	/* Non-mgmt controlled devices
get this bit set
	 * implicitly so that pairing works for them, however
	 * for mgmt we require user-space to explicitly enable
	 * it
	 */
	hci_dev_clear_flag(hdev, HCI_BONDABLE);

	hci_dev_set_flag(hdev, HCI_MGMT);
}

/* MGMT_OP_READ_INFO handler: snapshot address, version, manufacturer,
 * settings, class and names under hdev lock and send them as the command
 * complete payload.
 */
static int read_controller_info(struct sock *sk, struct hci_dev *hdev,
				void *data, u16 data_len)
{
	struct mgmt_rp_read_info rp;

	bt_dev_dbg(hdev, "sock %p", sk);

	hci_dev_lock(hdev);

	memset(&rp, 0, sizeof(rp));

	bacpy(&rp.bdaddr, &hdev->bdaddr);

	rp.version = hdev->hci_ver;
	rp.manufacturer = cpu_to_le16(hdev->manufacturer);

	rp.supported_settings = cpu_to_le32(get_supported_settings(hdev));
	rp.current_settings = cpu_to_le32(get_current_settings(hdev));

	memcpy(rp.dev_class, hdev->dev_class, 3);

	memcpy(rp.name, hdev->dev_name, sizeof(hdev->dev_name));
	memcpy(rp.short_name, hdev->short_name, sizeof(hdev->short_name));

	hci_dev_unlock(hdev);

	return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_READ_INFO, 0, &rp,
				 sizeof(rp));
}

/* Append class of device (BR/EDR enabled), appearance (LE enabled) and the
 * complete and short names to @eir. Returns the number of bytes written.
 * NOTE(review): the capacity of @eir is not checked here; callers are
 * assumed to provide a large enough buffer.
 */
static u16 append_eir_data_to_buf(struct hci_dev *hdev, u8 *eir)
{
	u16 eir_len = 0;
	size_t name_len;

	if (hci_dev_test_flag(hdev, HCI_BREDR_ENABLED))
		eir_len = eir_append_data(eir, eir_len, EIR_CLASS_OF_DEV,
					  hdev->dev_class, 3);

	if (hci_dev_test_flag(hdev, HCI_LE_ENABLED))
		eir_len = eir_append_le16(eir, eir_len, EIR_APPEARANCE,
					  hdev->appearance);

	/* strnlen() bounds the names in case they fill their arrays */
	name_len = strnlen(hdev->dev_name, sizeof(hdev->dev_name));
	eir_len = eir_append_data(eir, eir_len, EIR_NAME_COMPLETE,
				  hdev->dev_name, name_len);

	name_len = strnlen(hdev->short_name, sizeof(hdev->short_name));
	eir_len = eir_append_data(eir, eir_len, EIR_NAME_SHORT,
				  hdev->short_name, name_len);

	return eir_len;
}

static int read_ext_controller_info(struct sock *sk, struct hci_dev *hdev,
				    void *data, u16 data_len)
{
	/* Reply header and EIR data are built in this on-stack buffer */
	char buf[512];
	struct mgmt_rp_read_ext_info *rp = (void *)buf;
	u16 eir_len;

	bt_dev_dbg(hdev, "sock %p", sk);

	memset(&buf, 0, sizeof(buf));

	hci_dev_lock(hdev);

	bacpy(&rp->bdaddr, &hdev->bdaddr);

	rp->version = hdev->hci_ver;
	rp->manufacturer = cpu_to_le16(hdev->manufacturer);

	rp->supported_settings =
cpu_to_le32(get_supported_settings(hdev));
	rp->current_settings = cpu_to_le32(get_current_settings(hdev));

	eir_len = append_eir_data_to_buf(hdev, rp->eir);
	rp->eir_len = cpu_to_le16(eir_len);

	hci_dev_unlock(hdev);

	/* If this command is called at least once, then the events
	 * for class of device and local name changes are disabled
	 * and only the new extended controller information event
	 * is used.
	 */
	hci_sock_set_flag(sk, HCI_MGMT_EXT_INFO_EVENTS);
	hci_sock_clear_flag(sk, HCI_MGMT_DEV_CLASS_EVENTS);
	hci_sock_clear_flag(sk, HCI_MGMT_LOCAL_NAME_EVENTS);

	/* Only the header plus the EIR bytes actually written are sent */
	return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_READ_EXT_INFO, 0, rp,
				 sizeof(*rp) + eir_len);
}

/* Send MGMT_EV_EXT_INFO_CHANGED with freshly built EIR data to sockets
 * that have HCI_MGMT_EXT_INFO_EVENTS set, except @skip.
 */
static int ext_info_changed(struct hci_dev *hdev, struct sock *skip)
{
	char buf[512];
	struct mgmt_ev_ext_info_changed *ev = (void *)buf;
	u16 eir_len;

	memset(buf, 0, sizeof(buf));

	eir_len = append_eir_data_to_buf(hdev, ev->eir);
	ev->eir_len = cpu_to_le16(eir_len);

	return mgmt_limited_event(MGMT_EV_EXT_INFO_CHANGED, hdev, ev,
				  sizeof(*ev) + eir_len,
				  HCI_MGMT_EXT_INFO_EVENTS, skip);
}

/* Reply to @opcode on @sk with the current settings as little-endian u32. */
static int send_settings_rsp(struct sock *sk, u16 opcode, struct hci_dev *hdev)
{
	__le32 settings = cpu_to_le32(get_current_settings(hdev));

	return mgmt_cmd_complete(sk, hdev->id, opcode, 0, &settings,
				 sizeof(settings));
}

/* Emit MGMT_EV_ADVERTISING_ADDED for @instance; @sk is passed to
 * mgmt_event() as its last argument.
 */
void mgmt_advertising_added(struct sock *sk, struct hci_dev *hdev, u8 instance)
{
	struct mgmt_ev_advertising_added ev;

	ev.instance = instance;

	mgmt_event(MGMT_EV_ADVERTISING_ADDED, hdev, &ev, sizeof(ev), sk);
}

/* Emit MGMT_EV_ADVERTISING_REMOVED for @instance; @sk is passed to
 * mgmt_event() as its last argument.
 */
void mgmt_advertising_removed(struct sock *sk, struct hci_dev *hdev,
			      u8 instance)
{
	struct mgmt_ev_advertising_removed ev;

	ev.instance = instance;

	mgmt_event(MGMT_EV_ADVERTISING_REMOVED, hdev, &ev, sizeof(ev), sk);
}

/* Reset the advertising instance timeout and cancel its expiry work, if a
 * timeout is currently set.
 */
static void cancel_adv_timeout(struct hci_dev *hdev)
{
	if (hdev->adv_instance_timeout) {
		hdev->adv_instance_timeout = 0;
		cancel_delayed_work(&hdev->adv_instance_expire);
	}
}

/* This function requires the caller holds hdev->lock */
static void restart_le_actions(struct hci_dev *hdev)
{
	struct hci_conn_params *p;
list_for_each_entry(p, &hdev->le_conn_params, list) {
		/* Needed for AUTO_OFF case where might not "really"
		 * have been powered off.
		 */
		hci_pend_le_list_del_init(p);

		/* Re-add the entry to the pending list matching its policy */
		switch (p->auto_connect) {
		case HCI_AUTO_CONN_DIRECT:
		case HCI_AUTO_CONN_ALWAYS:
			hci_pend_le_list_add(p, &hdev->pend_le_conns);
			break;
		case HCI_AUTO_CONN_REPORT:
			hci_pend_le_list_add(p, &hdev->pend_le_reports);
			break;
		default:
			break;
		}
	}
}

/* Send MGMT_EV_NEW_SETTINGS with the current settings to sockets that have
 * HCI_MGMT_SETTING_EVENTS set, except @skip.
 */
static int new_settings(struct hci_dev *hdev, struct sock *skip)
{
	__le32 ev = cpu_to_le32(get_current_settings(hdev));

	return mgmt_limited_event(MGMT_EV_NEW_SETTINGS, hdev, &ev,
				  sizeof(ev), HCI_MGMT_SETTING_EVENTS, skip);
}

/* Completion callback for set_powered_sync(): reply to the pending
 * Set Powered command and remove it. Nothing is done when the request was
 * cancelled or @data is no longer the pending Set Powered command.
 */
static void mgmt_set_powered_complete(struct hci_dev *hdev, void *data, int err)
{
	struct mgmt_pending_cmd *cmd = data;
	struct mgmt_mode *cp;

	/* Make sure cmd still outstanding. */
	if (err == -ECANCELED ||
	    cmd != pending_find(MGMT_OP_SET_POWERED, hdev))
		return;

	/* cmd is only dereferenced after the check above */
	cp = cmd->param;

	bt_dev_dbg(hdev, "err %d", err);

	if (!err) {
		if (cp->val) {
			hci_dev_lock(hdev);
			restart_le_actions(hdev);
			hci_update_passive_scan(hdev);
			hci_dev_unlock(hdev);
		}

		send_settings_rsp(cmd->sk, cmd->opcode, hdev);

		/* Only call new_setting for power on as power off is deferred
		 * to hdev->power_off work which does call hci_dev_do_close.
		 */
		if (cp->val)
			new_settings(hdev, cmd->sk);
	} else {
		mgmt_cmd_status(cmd->sk, hdev->id, MGMT_OP_SET_POWERED,
				mgmt_status(err));
	}

	mgmt_pending_remove(cmd);
}

/* hci_cmd_sync callback for Set Powered; returns -ECANCELED when @data is
 * no longer the pending Set Powered command.
 */
static int set_powered_sync(struct hci_dev *hdev, void *data)
{
	struct mgmt_pending_cmd *cmd = data;
	struct mgmt_mode *cp;

	/* Make sure cmd still outstanding.
*/
	if (cmd != pending_find(MGMT_OP_SET_POWERED, hdev))
		return -ECANCELED;

	cp = cmd->param;

	BT_DBG("%s", hdev->name);

	return hci_set_powered_sync(hdev, cp->val);
}

/* MGMT_OP_SET_POWERED handler. Rejects values other than 0x00/0x01,
 * reports busy while a power down or another Set Powered is in progress,
 * answers immediately when the power state already matches, and otherwise
 * queues set_powered_sync() with a pending command.
 */
static int set_powered(struct sock *sk, struct hci_dev *hdev, void *data,
		       u16 len)
{
	struct mgmt_mode *cp = data;
	struct mgmt_pending_cmd *cmd;
	int err;

	bt_dev_dbg(hdev, "sock %p", sk);

	if (cp->val != 0x00 && cp->val != 0x01)
		return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_POWERED,
				       MGMT_STATUS_INVALID_PARAMS);

	hci_dev_lock(hdev);

	/* A power off request while already powering down is busy */
	if (!cp->val) {
		if (hci_dev_test_flag(hdev, HCI_POWERING_DOWN)) {
			err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_POWERED,
					      MGMT_STATUS_BUSY);
			goto failed;
		}
	}

	if (pending_find(MGMT_OP_SET_POWERED, hdev)) {
		err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_POWERED,
				      MGMT_STATUS_BUSY);
		goto failed;
	}

	/* Nothing to change: just report the current settings */
	if (!!cp->val == hdev_is_powered(hdev)) {
		err = send_settings_rsp(sk, MGMT_OP_SET_POWERED, hdev);
		goto failed;
	}

	cmd = mgmt_pending_add(sk, MGMT_OP_SET_POWERED, hdev, data, len);
	if (!cmd) {
		err = -ENOMEM;
		goto failed;
	}

	/* Cancel potentially blocking sync operation before power off */
	if (cp->val == 0x00) {
		hci_cmd_sync_cancel_sync(hdev, -EHOSTDOWN);
		err = hci_cmd_sync_queue(hdev, set_powered_sync, cmd,
					 mgmt_set_powered_complete);
	} else {
		/* Use hci_cmd_sync_submit since hdev might not be running */
		err = hci_cmd_sync_submit(hdev, set_powered_sync, cmd,
					  mgmt_set_powered_complete);
	}

	if (err < 0)
		mgmt_pending_remove(cmd);

	/* The label is also reached on the success paths */
failed:
	hci_dev_unlock(hdev);
	return err;
}

/* Broadcast the current settings without skipping any socket. */
int mgmt_new_settings(struct hci_dev *hdev)
{
	return new_settings(hdev, NULL);
}

/* Context handed to the pending-command iteration callbacks below. */
struct cmd_lookup {
	struct sock *sk;	/* first responded socket, held by settings_rsp() */
	struct hci_dev *hdev;
	u8 mgmt_status;		/* status used by cmd_complete_rsp() */
};

/* Pending-command callback: reply with the current settings, unlink and
 * free @cmd. The socket of the first command handled is stored in the
 * lookup with a reference held; the caller must drop it with sock_put().
 */
static void settings_rsp(struct mgmt_pending_cmd *cmd, void *data)
{
	struct cmd_lookup *match = data;

	send_settings_rsp(cmd->sk, cmd->opcode, match->hdev);

	list_del(&cmd->list);

	if (match->sk == NULL) {
		match->sk = cmd->sk;
		sock_hold(match->sk);
	}

	mgmt_pending_free(cmd);
}

/* Pending-command callback: @data points to a u8 mgmt status that is sent
 * as command status before @cmd is removed.
 */
static void cmd_status_rsp(struct mgmt_pending_cmd *cmd, void *data)
{
	u8 *status = data;
mgmt_cmd_status(cmd->sk, cmd->index, cmd->opcode, *status); mgmt_pending_remove(cmd); } static void cmd_complete_rsp(struct mgmt_pending_cmd *cmd, void *data) { struct cmd_lookup *match = data; /* dequeue cmd_sync entries using cmd as data as that is about to be * removed/freed. */ hci_cmd_sync_dequeue(match->hdev, NULL, cmd, NULL); if (cmd->cmd_complete) { cmd->cmd_complete(cmd, match->mgmt_status); mgmt_pending_remove(cmd); return; } cmd_status_rsp(cmd, data); } static int generic_cmd_complete(struct mgmt_pending_cmd *cmd, u8 status) { return mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode, status, cmd->param, cmd->param_len); } static int addr_cmd_complete(struct mgmt_pending_cmd *cmd, u8 status) { return mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode, status, cmd->param, sizeof(struct mgmt_addr_info)); } static u8 mgmt_bredr_support(struct hci_dev *hdev) { if (!lmp_bredr_capable(hdev)) return MGMT_STATUS_NOT_SUPPORTED; else if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) return MGMT_STATUS_REJECTED; else return MGMT_STATUS_SUCCESS; } static u8 mgmt_le_support(struct hci_dev *hdev) { if (!lmp_le_capable(hdev)) return MGMT_STATUS_NOT_SUPPORTED; else if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED)) return MGMT_STATUS_REJECTED; else return MGMT_STATUS_SUCCESS; } static void mgmt_set_discoverable_complete(struct hci_dev *hdev, void *data, int err) { struct mgmt_pending_cmd *cmd = data; bt_dev_dbg(hdev, "err %d", err); /* Make sure cmd still outstanding. 
*/
	if (err == -ECANCELED ||
	    cmd != pending_find(MGMT_OP_SET_DISCOVERABLE, hdev))
		return;

	hci_dev_lock(hdev);

	/* On failure report the status and drop the limited flag */
	if (err) {
		u8 mgmt_err = mgmt_status(err);
		mgmt_cmd_status(cmd->sk, cmd->index, cmd->opcode, mgmt_err);
		hci_dev_clear_flag(hdev, HCI_LIMITED_DISCOVERABLE);
		goto done;
	}

	/* Arm the discoverable timeout now that the change succeeded */
	if (hci_dev_test_flag(hdev, HCI_DISCOVERABLE) &&
	    hdev->discov_timeout > 0) {
		int to = secs_to_jiffies(hdev->discov_timeout);
		queue_delayed_work(hdev->req_workqueue, &hdev->discov_off, to);
	}

	send_settings_rsp(cmd->sk, MGMT_OP_SET_DISCOVERABLE, hdev);
	new_settings(hdev, cmd->sk);

done:
	mgmt_pending_remove(cmd);
	hci_dev_unlock(hdev);
}

/* hci_cmd_sync callback: apply the discoverable state to the controller. */
static int set_discoverable_sync(struct hci_dev *hdev, void *data)
{
	BT_DBG("%s", hdev->name);

	return hci_update_discoverable_sync(hdev);
}

/* MGMT_OP_SET_DISCOVERABLE handler. val 0x00 disables, 0x01 enables
 * general and 0x02 enables limited discoverable mode; the timeout is
 * given in seconds (it is converted with secs_to_jiffies() when armed).
 */
static int set_discoverable(struct sock *sk, struct hci_dev *hdev, void *data,
			    u16 len)
{
	struct mgmt_cp_set_discoverable *cp = data;
	struct mgmt_pending_cmd *cmd;
	u16 timeout;
	int err;

	bt_dev_dbg(hdev, "sock %p", sk);

	if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED) &&
	    !hci_dev_test_flag(hdev, HCI_BREDR_ENABLED))
		return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_DISCOVERABLE,
				       MGMT_STATUS_REJECTED);

	if (cp->val != 0x00 && cp->val != 0x01 && cp->val != 0x02)
		return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_DISCOVERABLE,
				       MGMT_STATUS_INVALID_PARAMS);

	timeout = __le16_to_cpu(cp->timeout);

	/* Disabling discoverable requires that no timeout is set,
	 * and enabling limited discoverable requires a timeout.
*/
	if ((cp->val == 0x00 && timeout > 0) ||
	    (cp->val == 0x02 && timeout == 0))
		return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_DISCOVERABLE,
				       MGMT_STATUS_INVALID_PARAMS);

	hci_dev_lock(hdev);

	/* A timeout cannot be armed while the controller is powered off */
	if (!hdev_is_powered(hdev) && timeout > 0) {
		err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_DISCOVERABLE,
				      MGMT_STATUS_NOT_POWERED);
		goto failed;
	}

	/* Only one discoverable/connectable change at a time */
	if (pending_find(MGMT_OP_SET_DISCOVERABLE, hdev) ||
	    pending_find(MGMT_OP_SET_CONNECTABLE, hdev)) {
		err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_DISCOVERABLE,
				      MGMT_STATUS_BUSY);
		goto failed;
	}

	/* The request is rejected unless the device is connectable */
	if (!hci_dev_test_flag(hdev, HCI_CONNECTABLE)) {
		err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_DISCOVERABLE,
				      MGMT_STATUS_REJECTED);
		goto failed;
	}

	if (hdev->advertising_paused) {
		err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_DISCOVERABLE,
				      MGMT_STATUS_BUSY);
		goto failed;
	}

	/* Powered off: only the flag is updated, no HCI traffic */
	if (!hdev_is_powered(hdev)) {
		bool changed = false;

		/* Setting limited discoverable when powered off is
		 * not a valid operation since it requires a timeout
		 * and so no need to check HCI_LIMITED_DISCOVERABLE.
		 */
		if (!!cp->val != hci_dev_test_flag(hdev, HCI_DISCOVERABLE)) {
			hci_dev_change_flag(hdev, HCI_DISCOVERABLE);
			changed = true;
		}

		err = send_settings_rsp(sk, MGMT_OP_SET_DISCOVERABLE, hdev);
		if (err < 0)
			goto failed;

		if (changed)
			err = new_settings(hdev, sk);

		goto failed;
	}

	/* If the current mode is the same, then just update the timeout
	 * value with the new value. And if only the timeout gets updated,
	 * then no need for any HCI transactions.
	 */
	if (!!cp->val == hci_dev_test_flag(hdev, HCI_DISCOVERABLE) &&
	    (cp->val == 0x02) == hci_dev_test_flag(hdev,
						   HCI_LIMITED_DISCOVERABLE)) {
		cancel_delayed_work(&hdev->discov_off);
		hdev->discov_timeout = timeout;

		if (cp->val && hdev->discov_timeout > 0) {
			int to = secs_to_jiffies(hdev->discov_timeout);
			queue_delayed_work(hdev->req_workqueue,
					   &hdev->discov_off, to);
		}

		err = send_settings_rsp(sk, MGMT_OP_SET_DISCOVERABLE, hdev);
		goto failed;
	}

	cmd = mgmt_pending_add(sk, MGMT_OP_SET_DISCOVERABLE, hdev, data, len);
	if (!cmd) {
		err = -ENOMEM;
		goto failed;
	}

	/* Cancel any potential discoverable timeout that might be
	 * still active and store new timeout value. The arming of
	 * the timeout happens in the complete handler.
	 */
	cancel_delayed_work(&hdev->discov_off);
	hdev->discov_timeout = timeout;

	/* Flags are updated before the sync work runs */
	if (cp->val)
		hci_dev_set_flag(hdev, HCI_DISCOVERABLE);
	else
		hci_dev_clear_flag(hdev, HCI_DISCOVERABLE);

	/* Limited discoverable mode */
	if (cp->val == 0x02)
		hci_dev_set_flag(hdev, HCI_LIMITED_DISCOVERABLE);
	else
		hci_dev_clear_flag(hdev, HCI_LIMITED_DISCOVERABLE);

	err = hci_cmd_sync_queue(hdev, set_discoverable_sync, cmd,
				 mgmt_set_discoverable_complete);

	if (err < 0)
		mgmt_pending_remove(cmd);

	/* The label is also reached on the success paths */
failed:
	hci_dev_unlock(hdev);
	return err;
}

/* Completion callback for set_connectable_sync(): reply to the pending
 * Set Connectable command and remove it.
 */
static void mgmt_set_connectable_complete(struct hci_dev *hdev, void *data,
					  int err)
{
	struct mgmt_pending_cmd *cmd = data;

	bt_dev_dbg(hdev, "err %d", err);

	/* Make sure cmd still outstanding.
*/
	if (err == -ECANCELED ||
	    cmd != pending_find(MGMT_OP_SET_CONNECTABLE, hdev))
		return;

	hci_dev_lock(hdev);

	if (err) {
		u8 mgmt_err = mgmt_status(err);
		mgmt_cmd_status(cmd->sk, cmd->index, cmd->opcode, mgmt_err);
		goto done;
	}

	send_settings_rsp(cmd->sk, MGMT_OP_SET_CONNECTABLE, hdev);
	new_settings(hdev, cmd->sk);

done:
	mgmt_pending_remove(cmd);

	hci_dev_unlock(hdev);
}

/* Update the connectable flag (used by set_connectable() while powered
 * off). Turning connectable off also clears HCI_DISCOVERABLE. Replies with
 * the settings and, when the flag changed, refreshes scanning and
 * broadcasts new settings.
 */
static int set_connectable_update_settings(struct hci_dev *hdev,
					   struct sock *sk, u8 val)
{
	bool changed = false;
	int err;

	if (!!val != hci_dev_test_flag(hdev, HCI_CONNECTABLE))
		changed = true;

	if (val) {
		hci_dev_set_flag(hdev, HCI_CONNECTABLE);
	} else {
		hci_dev_clear_flag(hdev, HCI_CONNECTABLE);
		hci_dev_clear_flag(hdev, HCI_DISCOVERABLE);
	}

	err = send_settings_rsp(sk, MGMT_OP_SET_CONNECTABLE, hdev);
	if (err < 0)
		return err;

	if (changed) {
		hci_update_scan(hdev);
		hci_update_passive_scan(hdev);
		return new_settings(hdev, sk);
	}

	return 0;
}

/* hci_cmd_sync callback: apply the connectable state to the controller. */
static int set_connectable_sync(struct hci_dev *hdev, void *data)
{
	BT_DBG("%s", hdev->name);

	return hci_update_connectable_sync(hdev);
}

/* MGMT_OP_SET_CONNECTABLE handler. Requires LE or BR/EDR to be enabled and
 * val to be 0x00 or 0x01. Powered off, only the flags are updated;
 * otherwise the flags are changed and set_connectable_sync() is queued.
 */
static int set_connectable(struct sock *sk, struct hci_dev *hdev, void *data,
			   u16 len)
{
	struct mgmt_mode *cp = data;
	struct mgmt_pending_cmd *cmd;
	int err;

	bt_dev_dbg(hdev, "sock %p", sk);

	if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED) &&
	    !hci_dev_test_flag(hdev, HCI_BREDR_ENABLED))
		return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_CONNECTABLE,
				       MGMT_STATUS_REJECTED);

	if (cp->val != 0x00 && cp->val != 0x01)
		return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_CONNECTABLE,
				       MGMT_STATUS_INVALID_PARAMS);

	hci_dev_lock(hdev);

	if (!hdev_is_powered(hdev)) {
		err = set_connectable_update_settings(hdev, sk, cp->val);
		goto failed;
	}

	/* Only one discoverable/connectable change at a time */
	if (pending_find(MGMT_OP_SET_DISCOVERABLE, hdev) ||
	    pending_find(MGMT_OP_SET_CONNECTABLE, hdev)) {
		err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_CONNECTABLE,
				      MGMT_STATUS_BUSY);
		goto failed;
	}

	cmd = mgmt_pending_add(sk, MGMT_OP_SET_CONNECTABLE, hdev, data, len);
	if (!cmd) {
		err = -ENOMEM;
		goto failed;
	}

	if (cp->val) {
		hci_dev_set_flag(hdev, HCI_CONNECTABLE);
	} else {
		/* Disabling connectable also ends discoverable mode */
		if (hdev->discov_timeout > 0)
			cancel_delayed_work(&hdev->discov_off);

		hci_dev_clear_flag(hdev, HCI_LIMITED_DISCOVERABLE);
		hci_dev_clear_flag(hdev, HCI_DISCOVERABLE);
		hci_dev_clear_flag(hdev, HCI_CONNECTABLE);
	}

	err = hci_cmd_sync_queue(hdev, set_connectable_sync, cmd,
				 mgmt_set_connectable_complete);

	if (err < 0)
		mgmt_pending_remove(cmd);

	/* The label is also reached on the success paths */
failed:
	hci_dev_unlock(hdev);
	return err;
}

/* MGMT_OP_SET_BONDABLE handler: toggle HCI_BONDABLE under hdev lock and
 * reply with the current settings.
 */
static int set_bondable(struct sock *sk, struct hci_dev *hdev, void *data,
			u16 len)
{
	struct mgmt_mode *cp = data;
	bool changed;
	int err;

	bt_dev_dbg(hdev, "sock %p", sk);

	if (cp->val != 0x00 && cp->val != 0x01)
		return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_BONDABLE,
				       MGMT_STATUS_INVALID_PARAMS);

	hci_dev_lock(hdev);

	if (cp->val)
		changed = !hci_dev_test_and_set_flag(hdev, HCI_BONDABLE);
	else
		changed = hci_dev_test_and_clear_flag(hdev, HCI_BONDABLE);

	err = send_settings_rsp(sk, MGMT_OP_SET_BONDABLE, hdev);
	if (err < 0)
		goto unlock;

	if (changed) {
		/* In limited privacy mode the change of bondable mode
		 * may affect the local advertising address.
*/ hci_update_discoverable(hdev); err = new_settings(hdev, sk); } unlock: hci_dev_unlock(hdev); return err; } static int set_link_security(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) { struct mgmt_mode *cp = data; struct mgmt_pending_cmd *cmd; u8 val, status; int err; bt_dev_dbg(hdev, "sock %p", sk); status = mgmt_bredr_support(hdev); if (status) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_LINK_SECURITY, status); if (cp->val != 0x00 && cp->val != 0x01) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_LINK_SECURITY, MGMT_STATUS_INVALID_PARAMS); hci_dev_lock(hdev); if (!hdev_is_powered(hdev)) { bool changed = false; if (!!cp->val != hci_dev_test_flag(hdev, HCI_LINK_SECURITY)) { hci_dev_change_flag(hdev, HCI_LINK_SECURITY); changed = true; } err = send_settings_rsp(sk, MGMT_OP_SET_LINK_SECURITY, hdev); if (err < 0) goto failed; if (changed) err = new_settings(hdev, sk); goto failed; } if (pending_find(MGMT_OP_SET_LINK_SECURITY, hdev)) { err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_LINK_SECURITY, MGMT_STATUS_BUSY); goto failed; } val = !!cp->val; if (test_bit(HCI_AUTH, &hdev->flags) == val) { err = send_settings_rsp(sk, MGMT_OP_SET_LINK_SECURITY, hdev); goto failed; } cmd = mgmt_pending_add(sk, MGMT_OP_SET_LINK_SECURITY, hdev, data, len); if (!cmd) { err = -ENOMEM; goto failed; } err = hci_send_cmd(hdev, HCI_OP_WRITE_AUTH_ENABLE, sizeof(val), &val); if (err < 0) { mgmt_pending_remove(cmd); goto failed; } failed: hci_dev_unlock(hdev); return err; } static void set_ssp_complete(struct hci_dev *hdev, void *data, int err) { struct cmd_lookup match = { NULL, hdev }; struct mgmt_pending_cmd *cmd = data; struct mgmt_mode *cp = cmd->param; u8 enable = cp->val; bool changed; /* Make sure cmd still outstanding. 
*/ if (err == -ECANCELED || cmd != pending_find(MGMT_OP_SET_SSP, hdev)) return; if (err) { u8 mgmt_err = mgmt_status(err); if (enable && hci_dev_test_and_clear_flag(hdev, HCI_SSP_ENABLED)) { new_settings(hdev, NULL); } mgmt_pending_foreach(MGMT_OP_SET_SSP, hdev, cmd_status_rsp, &mgmt_err); return; } if (enable) { changed = !hci_dev_test_and_set_flag(hdev, HCI_SSP_ENABLED); } else { changed = hci_dev_test_and_clear_flag(hdev, HCI_SSP_ENABLED); } mgmt_pending_foreach(MGMT_OP_SET_SSP, hdev, settings_rsp, &match); if (changed) new_settings(hdev, match.sk); if (match.sk) sock_put(match.sk); hci_update_eir_sync(hdev); } static int set_ssp_sync(struct hci_dev *hdev, void *data) { struct mgmt_pending_cmd *cmd = data; struct mgmt_mode *cp = cmd->param; bool changed = false; int err; if (cp->val) changed = !hci_dev_test_and_set_flag(hdev, HCI_SSP_ENABLED); err = hci_write_ssp_mode_sync(hdev, cp->val); if (!err && changed) hci_dev_clear_flag(hdev, HCI_SSP_ENABLED); return err; } static int set_ssp(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) { struct mgmt_mode *cp = data; struct mgmt_pending_cmd *cmd; u8 status; int err; bt_dev_dbg(hdev, "sock %p", sk); status = mgmt_bredr_support(hdev); if (status) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_SSP, status); if (!lmp_ssp_capable(hdev)) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_SSP, MGMT_STATUS_NOT_SUPPORTED); if (cp->val != 0x00 && cp->val != 0x01) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_SSP, MGMT_STATUS_INVALID_PARAMS); hci_dev_lock(hdev); if (!hdev_is_powered(hdev)) { bool changed; if (cp->val) { changed = !hci_dev_test_and_set_flag(hdev, HCI_SSP_ENABLED); } else { changed = hci_dev_test_and_clear_flag(hdev, HCI_SSP_ENABLED); } err = send_settings_rsp(sk, MGMT_OP_SET_SSP, hdev); if (err < 0) goto failed; if (changed) err = new_settings(hdev, sk); goto failed; } if (pending_find(MGMT_OP_SET_SSP, hdev)) { err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_SSP, MGMT_STATUS_BUSY); goto failed; } 
if (!!cp->val == hci_dev_test_flag(hdev, HCI_SSP_ENABLED)) {
		err = send_settings_rsp(sk, MGMT_OP_SET_SSP, hdev);
		goto failed;
	}

	cmd = mgmt_pending_add(sk, MGMT_OP_SET_SSP, hdev, data, len);
	if (!cmd)
		err = -ENOMEM;
	else
		err = hci_cmd_sync_queue(hdev, set_ssp_sync, cmd,
					 set_ssp_complete);

	/* Allocation and queue failures are both reported as FAILED */
	if (err < 0) {
		err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_SSP,
				      MGMT_STATUS_FAILED);

		if (cmd)
			mgmt_pending_remove(cmd);
	}

failed:
	hci_dev_unlock(hdev);
	return err;
}

/* MGMT_OP_SET_HS handler: always answers MGMT_STATUS_NOT_SUPPORTED. */
static int set_hs(struct sock *sk, struct hci_dev *hdev, void *data, u16 len)
{
	bt_dev_dbg(hdev, "sock %p", sk);

	return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_HS,
				       MGMT_STATUS_NOT_SUPPORTED);
}

/* Completion callback for set_le_sync(): answer every pending Set LE
 * command with a status on error, or with the settings followed by a new
 * settings event on success.
 */
static void set_le_complete(struct hci_dev *hdev, void *data, int err)
{
	struct cmd_lookup match = { NULL, hdev };
	u8 status = mgmt_status(err);

	bt_dev_dbg(hdev, "err %d", err);

	if (status) {
		mgmt_pending_foreach(MGMT_OP_SET_LE, hdev, cmd_status_rsp,
							&status);
		return;
	}

	mgmt_pending_foreach(MGMT_OP_SET_LE, hdev, settings_rsp, &match);

	new_settings(hdev, match.sk);

	/* settings_rsp() took a reference on the first socket it saw */
	if (match.sk)
		sock_put(match.sk);
}

/* hci_cmd_sync callback for Set LE: tear down advertising when disabling,
 * set HCI_LE_ENABLED when enabling, then write the LE host supported
 * setting to the controller.
 */
static int set_le_sync(struct hci_dev *hdev, void *data)
{
	struct mgmt_pending_cmd *cmd = data;
	struct mgmt_mode *cp = cmd->param;
	u8 val = !!cp->val;
	int err;

	if (!val) {
		hci_clear_adv_instance_sync(hdev, NULL, 0x00, true);

		if (hci_dev_test_flag(hdev, HCI_LE_ADV))
			hci_disable_advertising_sync(hdev);

		if (ext_adv_capable(hdev))
			hci_remove_ext_adv_instance_sync(hdev, 0, cmd->sk);
	} else {
		hci_dev_set_flag(hdev, HCI_LE_ENABLED);
	}

	err = hci_write_le_host_supported_sync(hdev, val, 0);

	/* Make sure the controller has a good default for
	 * advertising data. Restrict the update to when LE
	 * has actually been enabled. During power on, the
	 * update in powered_update_hci will take care of it.
*/
	if (!err && hci_dev_test_flag(hdev, HCI_LE_ENABLED)) {
		if (ext_adv_capable(hdev)) {
			int status;

			/* Scan response is only updated if instance setup
			 * succeeded.
			 */
			status = hci_setup_ext_adv_instance_sync(hdev, 0x00);
			if (!status)
				hci_update_scan_rsp_data_sync(hdev, 0x00);
		} else {
			hci_update_adv_data_sync(hdev, 0x00);
			hci_update_scan_rsp_data_sync(hdev, 0x00);
		}

		hci_update_passive_scan(hdev);
	}

	return err;
}

/* Completion callback for set_mesh_sync(): on error answer all pending
 * Set Mesh Receiver commands with the status, otherwise remove @data's
 * command and send an empty command complete to its socket.
 * NOTE(review): unlike the other completion handlers in this file, cmd is
 * dereferenced without first checking that it is still pending - confirm
 * that it cannot have been freed at this point.
 */
static void set_mesh_complete(struct hci_dev *hdev, void *data, int err)
{
	struct mgmt_pending_cmd *cmd = data;
	u8 status = mgmt_status(err);
	/* Saved up front because cmd is removed before the reply is sent */
	struct sock *sk = cmd->sk;

	if (status) {
		mgmt_pending_foreach(MGMT_OP_SET_MESH_RECEIVER, hdev,
				     cmd_status_rsp, &status);
		return;
	}

	mgmt_pending_remove(cmd);
	mgmt_cmd_complete(sk, hdev->id, MGMT_OP_SET_MESH_RECEIVER, 0, NULL, 0);
}

/* hci_cmd_sync callback: set or clear HCI_MESH, store the AD type filter
 * list and update passive scanning. Always returns 0.
 */
static int set_mesh_sync(struct hci_dev *hdev, void *data)
{
	struct mgmt_pending_cmd *cmd = data;
	struct mgmt_cp_set_mesh *cp = cmd->param;
	size_t len = cmd->param_len;

	memset(hdev->mesh_ad_types, 0, sizeof(hdev->mesh_ad_types));

	if (cp->enable)
		hci_dev_set_flag(hdev, HCI_MESH);
	else
		hci_dev_clear_flag(hdev, HCI_MESH);

	/* What remains after the fixed header is the AD type list.
	 * NOTE(review): assumes param_len >= sizeof(*cp); a shorter length
	 * would wrap len to a huge value and simply skip the copy below.
	 */
	len -= sizeof(*cp);

	/* If filters don't fit, forward all adv pkts */
	if (len <= sizeof(hdev->mesh_ad_types))
		memcpy(hdev->mesh_ad_types, cp->ad_types, len);

	hci_update_passive_scan_sync(hdev);
	return 0;
}

/* MGMT_OP_SET_MESH_RECEIVER handler: requires LE capability and the
 * HCI_MESH_EXPERIMENTAL flag, validates the enable value and queues
 * set_mesh_sync() with a pending command.
 */
static int set_mesh(struct sock *sk, struct hci_dev *hdev, void *data, u16 len)
{
	struct mgmt_cp_set_mesh *cp = data;
	struct mgmt_pending_cmd *cmd;
	int err = 0;

	bt_dev_dbg(hdev, "sock %p", sk);

	if (!lmp_le_capable(hdev) ||
	    !hci_dev_test_flag(hdev, HCI_MESH_EXPERIMENTAL))
		return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_MESH_RECEIVER,
				       MGMT_STATUS_NOT_SUPPORTED);

	if (cp->enable != 0x00 && cp->enable != 0x01)
		return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_MESH_RECEIVER,
				       MGMT_STATUS_INVALID_PARAMS);

	hci_dev_lock(hdev);

	cmd = mgmt_pending_add(sk, MGMT_OP_SET_MESH_RECEIVER, hdev, data, len);
	if (!cmd)
		err = -ENOMEM;
	else
		err = hci_cmd_sync_queue(hdev, set_mesh_sync, cmd,
					 set_mesh_complete);

	/* Allocation and queue failures are both reported as FAILED */
	if (err < 0) {
		err = mgmt_cmd_status(sk, hdev->id,
MGMT_OP_SET_MESH_RECEIVER,
				      MGMT_STATUS_FAILED);

		if (cmd)
			mgmt_pending_remove(cmd);
	}

	hci_dev_unlock(hdev);
	return err;
}

/* Completion callback for mesh_send_sync(): on error clear
 * HCI_MESH_SENDING and complete the transmission; on success schedule the
 * mesh_send_done work after send->cnt * 25 milliseconds.
 */
static void mesh_send_start_complete(struct hci_dev *hdev, void *data, int err)
{
	struct mgmt_mesh_tx *mesh_tx = data;
	struct mgmt_cp_mesh_send *send = (void *)mesh_tx->param;
	unsigned long mesh_send_interval;
	u8 mgmt_err = mgmt_status(err);

	/* Report any errors here, but don't report completion */

	if (mgmt_err) {
		hci_dev_clear_flag(hdev, HCI_MESH_SENDING);
		/* Send Complete Error Code for handle */
		mesh_send_complete(hdev, mesh_tx, false);
		return;
	}

	mesh_send_interval = msecs_to_jiffies((send->cnt) * 25);
	queue_delayed_work(hdev->req_workqueue, &hdev->mesh_send_done,
			   mesh_send_interval);
}

/* hci_cmd_sync callback: add an advertising instance carrying the mesh
 * packet and schedule it.
 * NOTE(review): the busy path returns the positive MGMT_STATUS_BUSY value
 * rather than a negative errno - confirm the completion handler expects
 * that.
 */
static int mesh_send_sync(struct hci_dev *hdev, void *data)
{
	struct mgmt_mesh_tx *mesh_tx = data;
	struct mgmt_cp_mesh_send *send = (void *)mesh_tx->param;
	struct adv_info *adv, *next_instance;
	/* Instance number one past the controller's advertising sets */
	u8 instance = hdev->le_num_of_adv_sets + 1;
	u16 timeout, duration;
	int err = 0;

	/* Refuse when the instance count has reached the number of sets */
	if (hdev->le_num_of_adv_sets <= hdev->adv_instance_cnt)
		return MGMT_STATUS_BUSY;

	timeout = 1000;
	duration = send->cnt * INTERVAL_TO_MS(hdev->le_adv_max_interval);
	adv = hci_add_adv_instance(hdev, instance, 0,
				   send->adv_data_len, send->adv_data,
				   0, NULL,
				   timeout, duration,
				   HCI_ADV_TX_POWER_NO_PREFERENCE,
				   hdev->le_adv_min_interval,
				   hdev->le_adv_max_interval,
				   mesh_tx->handle);

	if (!IS_ERR(adv))
		mesh_tx->instance = instance;
	else
		err = PTR_ERR(adv);

	if (hdev->cur_adv_instance == instance) {
		/* If the currently advertised instance is being changed then
		 * cancel the current advertising and schedule the next
		 * instance. If there is only one instance then the overridden
		 * advertising data will be visible right away.
*/
		cancel_adv_timeout(hdev);

		next_instance = hci_get_next_instance(hdev, instance);
		if (next_instance)
			instance = next_instance->instance;
		else
			instance = 0;
	} else if (hdev->adv_instance_timeout) {
		/* Immediately advertise the new instance if no other, or
		 * let it go naturally from queue if ADV is already happening
		 */
		instance = 0;
	}

	/* instance == 0 means there is nothing to schedule right now */
	if (instance)
		return hci_schedule_adv_instance_sync(hdev, instance, true);

	return err;
}

/* mgmt_mesh_foreach() callback: appends the handle of @mesh_tx to the
 * reply in @data unless used_handles has already reached max_handles.
 */
static void send_count(struct mgmt_mesh_tx *mesh_tx, void *data)
{
	struct mgmt_rp_mesh_read_features *rp = data;

	if (rp->used_handles >= rp->max_handles)
		return;

	rp->handles[rp->used_handles++] = mesh_tx->handle;
}

/* MGMT_OP_MESH_READ_FEATURES handler.
 *
 * Replies with the controller index, max_handles (MESH_HANDLES_MAX when
 * HCI_LE_ENABLED is set, 0 otherwise) and the handles collected by
 * send_count(). The reply length is trimmed to the handles actually used.
 */
static int mesh_features(struct sock *sk, struct hci_dev *hdev,
			 void *data, u16 len)
{
	struct mgmt_rp_mesh_read_features rp;

	if (!lmp_le_capable(hdev) ||
	    !hci_dev_test_flag(hdev, HCI_MESH_EXPERIMENTAL))
		return mgmt_cmd_status(sk, hdev->id, MGMT_OP_MESH_READ_FEATURES,
				       MGMT_STATUS_NOT_SUPPORTED);

	memset(&rp, 0, sizeof(rp));
	rp.index = cpu_to_le16(hdev->id);
	if (hci_dev_test_flag(hdev, HCI_LE_ENABLED))
		rp.max_handles = MESH_HANDLES_MAX;

	hci_dev_lock(hdev);

	if (rp.max_handles)
		mgmt_mesh_foreach(hdev, send_count, &rp, sk);

	mgmt_cmd_complete(sk, hdev->id, MGMT_OP_MESH_READ_FEATURES, 0, &rp,
			  rp.used_handles + sizeof(rp) - MESH_HANDLES_MAX);

	hci_dev_unlock(hdev);
	return 0;
}

/* Sync-queue worker for MGMT_OP_MESH_SEND_CANCEL.
 *
 * A zero handle finishes every transmission mgmt_mesh_next() returns for the
 * requesting socket; a non-zero handle finishes only the matching
 * transmission, and only if it belongs to the same socket. The command is
 * then completed and the pending command freed.
 */
static int send_cancel(struct hci_dev *hdev, void *data)
{
	struct mgmt_pending_cmd *cmd = data;
	struct mgmt_cp_mesh_send_cancel *cancel = (void *)cmd->param;
	struct mgmt_mesh_tx *mesh_tx;

	if (!cancel->handle) {
		do {
			mesh_tx = mgmt_mesh_next(hdev, cmd->sk);

			if (mesh_tx)
				mesh_send_complete(hdev, mesh_tx, false);
		} while (mesh_tx);
	} else {
		mesh_tx = mgmt_mesh_find(hdev, cancel->handle);

		if (mesh_tx && mesh_tx->sk == cmd->sk)
			mesh_send_complete(hdev, mesh_tx, false);
	}

	mgmt_cmd_complete(cmd->sk, hdev->id, MGMT_OP_MESH_SEND_CANCEL,
			  0, NULL, 0);
	mgmt_pending_free(cmd);

	return 0;
}

/* MGMT_OP_MESH_SEND_CANCEL handler: validates support and LE state, then
 * queues send_cancel().
 */
static int mesh_send_cancel(struct sock *sk, struct hci_dev *hdev,
			    void *data, u16 len)
{
	struct
mgmt_pending_cmd *cmd; int err; if (!lmp_le_capable(hdev) || !hci_dev_test_flag(hdev, HCI_MESH_EXPERIMENTAL)) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_MESH_SEND_CANCEL, MGMT_STATUS_NOT_SUPPORTED); if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED)) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_MESH_SEND_CANCEL, MGMT_STATUS_REJECTED); hci_dev_lock(hdev); cmd = mgmt_pending_new(sk, MGMT_OP_MESH_SEND_CANCEL, hdev, data, len); if (!cmd) err = -ENOMEM; else err = hci_cmd_sync_queue(hdev, send_cancel, cmd, NULL); if (err < 0) { err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_MESH_SEND_CANCEL, MGMT_STATUS_FAILED); if (cmd) mgmt_pending_free(cmd); } hci_dev_unlock(hdev); return err; } static int mesh_send(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) { struct mgmt_mesh_tx *mesh_tx; struct mgmt_cp_mesh_send *send = data; struct mgmt_rp_mesh_read_features rp; bool sending; int err = 0; if (!lmp_le_capable(hdev) || !hci_dev_test_flag(hdev, HCI_MESH_EXPERIMENTAL)) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_MESH_SEND, MGMT_STATUS_NOT_SUPPORTED); if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED) || len <= MGMT_MESH_SEND_SIZE || len > (MGMT_MESH_SEND_SIZE + 31)) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_MESH_SEND, MGMT_STATUS_REJECTED); hci_dev_lock(hdev); memset(&rp, 0, sizeof(rp)); rp.max_handles = MESH_HANDLES_MAX; mgmt_mesh_foreach(hdev, send_count, &rp, sk); if (rp.max_handles <= rp.used_handles) { err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_MESH_SEND, MGMT_STATUS_BUSY); goto done; } sending = hci_dev_test_flag(hdev, HCI_MESH_SENDING); mesh_tx = mgmt_mesh_add(sk, hdev, send, len); if (!mesh_tx) err = -ENOMEM; else if (!sending) err = hci_cmd_sync_queue(hdev, mesh_send_sync, mesh_tx, mesh_send_start_complete); if (err < 0) { bt_dev_err(hdev, "Send Mesh Failed %d", err); err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_MESH_SEND, MGMT_STATUS_FAILED); if (mesh_tx) { if (sending) mgmt_mesh_remove(mesh_tx); } } else { hci_dev_set_flag(hdev, HCI_MESH_SENDING); 
mgmt_cmd_complete(sk, hdev->id, MGMT_OP_MESH_SEND, 0,
				  &mesh_tx->handle, 1);
	}

done:
	hci_dev_unlock(hdev);
	return err;
}

/* MGMT_OP_SET_LE handler.
 *
 * Validates the request, handles the cases that need no controller
 * interaction (BR/EDR disabled, adapter not powered, or the requested value
 * already matching lmp_host_le_capable()) by replying directly, and otherwise
 * queues set_le_sync() with set_le_complete() as completion.
 */
static int set_le(struct sock *sk, struct hci_dev *hdev, void *data, u16 len)
{
	struct mgmt_mode *cp = data;
	struct mgmt_pending_cmd *cmd;
	int err;
	u8 val, enabled;

	bt_dev_dbg(hdev, "sock %p", sk);

	if (!lmp_le_capable(hdev))
		return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_LE,
				       MGMT_STATUS_NOT_SUPPORTED);

	if (cp->val != 0x00 && cp->val != 0x01)
		return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_LE,
				       MGMT_STATUS_INVALID_PARAMS);

	/* Bluetooth single mode LE only controllers or dual-mode
	 * controllers configured as LE only devices, do not allow
	 * switching LE off. These have either LE enabled explicitly
	 * or BR/EDR has been previously switched off.
	 *
	 * When trying to enable an already enabled LE, then gracefully
	 * send a positive response. Trying to disable it however will
	 * result into rejection.
	 */
	if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) {
		if (cp->val == 0x01)
			return send_settings_rsp(sk, MGMT_OP_SET_LE, hdev);

		return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_LE,
				       MGMT_STATUS_REJECTED);
	}

	hci_dev_lock(hdev);

	val = !!cp->val;
	enabled = lmp_host_le_capable(hdev);

	/* No HCI traffic needed: only update the flags and notify */
	if (!hdev_is_powered(hdev) || val == enabled) {
		bool changed = false;

		if (val != hci_dev_test_flag(hdev, HCI_LE_ENABLED)) {
			hci_dev_change_flag(hdev, HCI_LE_ENABLED);
			changed = true;
		}

		/* Disabling LE also clears the advertising setting */
		if (!val && hci_dev_test_flag(hdev, HCI_ADVERTISING)) {
			hci_dev_clear_flag(hdev, HCI_ADVERTISING);
			changed = true;
		}

		err = send_settings_rsp(sk, MGMT_OP_SET_LE, hdev);
		if (err < 0)
			goto unlock;

		if (changed)
			err = new_settings(hdev, sk);

		goto unlock;
	}

	/* Only one Set LE / Set Advertising command may be pending */
	if (pending_find(MGMT_OP_SET_LE, hdev) ||
	    pending_find(MGMT_OP_SET_ADVERTISING, hdev)) {
		err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_LE,
				      MGMT_STATUS_BUSY);
		goto unlock;
	}

	cmd = mgmt_pending_add(sk, MGMT_OP_SET_LE, hdev, data, len);
	if (!cmd)
		err = -ENOMEM;
	else
		err = hci_cmd_sync_queue(hdev, set_le_sync, cmd,
					 set_le_complete);

	if (err < 0) {
		err =
mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_LE, MGMT_STATUS_FAILED); if (cmd) mgmt_pending_remove(cmd); } unlock: hci_dev_unlock(hdev); return err; } static int send_hci_cmd_sync(struct hci_dev *hdev, void *data) { struct mgmt_pending_cmd *cmd = data; struct mgmt_cp_hci_cmd_sync *cp = cmd->param; struct sk_buff *skb; skb = __hci_cmd_sync_ev(hdev, le16_to_cpu(cp->opcode), le16_to_cpu(cp->params_len), cp->params, cp->event, cp->timeout ? secs_to_jiffies(cp->timeout) : HCI_CMD_TIMEOUT); if (IS_ERR(skb)) { mgmt_cmd_status(cmd->sk, hdev->id, MGMT_OP_HCI_CMD_SYNC, mgmt_status(PTR_ERR(skb))); goto done; } mgmt_cmd_complete(cmd->sk, hdev->id, MGMT_OP_HCI_CMD_SYNC, 0, skb->data, skb->len); kfree_skb(skb); done: mgmt_pending_free(cmd); return 0; } static int mgmt_hci_cmd_sync(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) { struct mgmt_cp_hci_cmd_sync *cp = data; struct mgmt_pending_cmd *cmd; int err; if (len < sizeof(*cp)) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_HCI_CMD_SYNC, MGMT_STATUS_INVALID_PARAMS); hci_dev_lock(hdev); cmd = mgmt_pending_new(sk, MGMT_OP_HCI_CMD_SYNC, hdev, data, len); if (!cmd) err = -ENOMEM; else err = hci_cmd_sync_queue(hdev, send_hci_cmd_sync, cmd, NULL); if (err < 0) { err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_HCI_CMD_SYNC, MGMT_STATUS_FAILED); if (cmd) mgmt_pending_free(cmd); } hci_dev_unlock(hdev); return err; } /* This is a helper function to test for pending mgmt commands that can * cause CoD or EIR HCI commands. We can only allow one such pending * mgmt command at a time since otherwise we cannot easily track what * the current values are, will be, and based on that calculate if a new * HCI command needs to be sent and if yes with what value. 
*/
static bool pending_eir_or_class(struct hci_dev *hdev)
{
	struct mgmt_pending_cmd *cmd;

	list_for_each_entry(cmd, &hdev->mgmt_pending, list) {
		switch (cmd->opcode) {
		case MGMT_OP_ADD_UUID:
		case MGMT_OP_REMOVE_UUID:
		case MGMT_OP_SET_DEV_CLASS:
		case MGMT_OP_SET_POWERED:
			return true;
		}
	}

	return false;
}

/* Reference UUID; get_uuid_size() compares the first 12 bytes of a UUID
 * against it.
 */
static const u8 bluetooth_base_uuid[] = {
			0xfb, 0x34, 0x9b, 0x5f, 0x80, 0x00, 0x00, 0x80,
			0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
};

/* Returns the size in bits (16, 32 or 128) of a 16-byte UUID: 128 when the
 * first 12 bytes differ from bluetooth_base_uuid, otherwise 32 or 16
 * depending on whether the little-endian value in bytes 12..15 exceeds
 * 0xffff.
 */
static u8 get_uuid_size(const u8 *uuid)
{
	u32 val;

	if (memcmp(uuid, bluetooth_base_uuid, 12))
		return 128;

	val = get_unaligned_le32(&uuid[12]);
	if (val > 0xffff)
		return 32;

	return 16;
}

/* Shared completion callback for the UUID and device class commands:
 * replies with the 3-byte hdev->dev_class and frees the pending command.
 */
static void mgmt_class_complete(struct hci_dev *hdev, void *data, int err)
{
	struct mgmt_pending_cmd *cmd = data;

	bt_dev_dbg(hdev, "err %d", err);

	mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode,
			  mgmt_status(err), hdev->dev_class, 3);

	mgmt_pending_free(cmd);
}

/* Sync worker for Add UUID: updates the class first and, if that
 * succeeded, the EIR data.
 */
static int add_uuid_sync(struct hci_dev *hdev, void *data)
{
	int err;

	err = hci_update_class_sync(hdev);
	if (err)
		return err;

	return hci_update_eir_sync(hdev);
}

/* MGMT_OP_ADD_UUID handler.
 *
 * Replies BUSY while pending_eir_or_class() reports a pending command.
 * Otherwise the UUID is appended to hdev->uuids and add_uuid_sync() is
 * submitted with mgmt_class_complete() as completion.
 */
static int add_uuid(struct sock *sk, struct hci_dev *hdev, void *data, u16 len)
{
	struct mgmt_cp_add_uuid *cp = data;
	struct mgmt_pending_cmd *cmd;
	struct bt_uuid *uuid;
	int err;

	bt_dev_dbg(hdev, "sock %p", sk);

	hci_dev_lock(hdev);

	if (pending_eir_or_class(hdev)) {
		err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_UUID,
				      MGMT_STATUS_BUSY);
		goto failed;
	}

	uuid = kmalloc(sizeof(*uuid), GFP_KERNEL);
	if (!uuid) {
		err = -ENOMEM;
		goto failed;
	}

	memcpy(uuid->uuid, cp->uuid, 16);
	uuid->svc_hint = cp->svc_hint;
	uuid->size = get_uuid_size(cp->uuid);

	list_add_tail(&uuid->list, &hdev->uuids);

	cmd = mgmt_pending_new(sk, MGMT_OP_ADD_UUID, hdev, data, len);
	if (!cmd) {
		err = -ENOMEM;
		goto failed;
	}

	/* MGMT_OP_ADD_UUID don't require adapter the UP/Running so use
	 * hci_cmd_sync_submit instead of hci_cmd_sync_queue.
*/
	err = hci_cmd_sync_submit(hdev, add_uuid_sync, cmd,
				  mgmt_class_complete);
	if (err < 0) {
		mgmt_pending_free(cmd);
		goto failed;
	}

failed:
	hci_dev_unlock(hdev);
	return err;
}

/* Arms the service cache: when the adapter is powered and
 * HCI_SERVICE_CACHE was not yet set, sets the flag, schedules the
 * service_cache work after CACHE_TIMEOUT and returns true. Returns false
 * in every other case.
 */
static bool enable_service_cache(struct hci_dev *hdev)
{
	if (!hdev_is_powered(hdev))
		return false;

	if (!hci_dev_test_and_set_flag(hdev, HCI_SERVICE_CACHE)) {
		queue_delayed_work(hdev->workqueue, &hdev->service_cache,
				   CACHE_TIMEOUT);
		return true;
	}

	return false;
}

/* Sync worker for Remove UUID: updates the class first and, if that
 * succeeded, the EIR data.
 */
static int remove_uuid_sync(struct hci_dev *hdev, void *data)
{
	int err;

	err = hci_update_class_sync(hdev);
	if (err)
		return err;

	return hci_update_eir_sync(hdev);
}

/* MGMT_OP_REMOVE_UUID handler.
 *
 * An all-zero UUID clears the whole list; any other value removes every
 * matching entry and yields INVALID_PARAMS when none matched. Unless the
 * service cache was just armed (in which case the reply is sent directly),
 * remove_uuid_sync() is submitted to refresh class and EIR.
 */
static int remove_uuid(struct sock *sk, struct hci_dev *hdev, void *data,
		       u16 len)
{
	struct mgmt_cp_remove_uuid *cp = data;
	struct mgmt_pending_cmd *cmd;
	struct bt_uuid *match, *tmp;
	static const u8 bt_uuid_any[] = {
		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
	};
	int err, found;

	bt_dev_dbg(hdev, "sock %p", sk);

	hci_dev_lock(hdev);

	if (pending_eir_or_class(hdev)) {
		err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_REMOVE_UUID,
				      MGMT_STATUS_BUSY);
		goto unlock;
	}

	/* All-zero UUID acts as a wildcard: drop every stored UUID */
	if (memcmp(cp->uuid, bt_uuid_any, 16) == 0) {
		hci_uuids_clear(hdev);

		if (enable_service_cache(hdev)) {
			err = mgmt_cmd_complete(sk, hdev->id,
						MGMT_OP_REMOVE_UUID,
						0, hdev->dev_class, 3);
			goto unlock;
		}

		goto update_class;
	}

	found = 0;

	/* _safe iteration because matching entries are freed in the loop */
	list_for_each_entry_safe(match, tmp, &hdev->uuids, list) {
		if (memcmp(match->uuid, cp->uuid, 16) != 0)
			continue;

		list_del(&match->list);
		kfree(match);
		found++;
	}

	if (found == 0) {
		err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_REMOVE_UUID,
				      MGMT_STATUS_INVALID_PARAMS);
		goto unlock;
	}

update_class:
	cmd = mgmt_pending_new(sk, MGMT_OP_REMOVE_UUID, hdev, data, len);
	if (!cmd) {
		err = -ENOMEM;
		goto unlock;
	}

	/* MGMT_OP_REMOVE_UUID don't require adapter the UP/Running so use
	 * hci_cmd_sync_submit instead of hci_cmd_sync_queue.
*/
	err = hci_cmd_sync_submit(hdev, remove_uuid_sync, cmd,
				  mgmt_class_complete);
	if (err < 0)
		mgmt_pending_free(cmd);

unlock:
	hci_dev_unlock(hdev);
	return err;
}

/* Sync worker for Set Device Class.
 *
 * If the service cache flag was set it is cleared, the service_cache work
 * is cancelled and the EIR data is updated. Unless that update failed, the
 * class of device is updated afterwards.
 */
static int set_class_sync(struct hci_dev *hdev, void *data)
{
	int err = 0;

	if (hci_dev_test_and_clear_flag(hdev, HCI_SERVICE_CACHE)) {
		cancel_delayed_work_sync(&hdev->service_cache);
		err = hci_update_eir_sync(hdev);
	}

	if (err)
		return err;

	return hci_update_class_sync(hdev);
}

/* MGMT_OP_SET_DEV_CLASS handler.
 *
 * Requires BR/EDR capability, no pending EIR/class command, and a
 * minor/major value with the low two bits of minor and the top three bits
 * of major clear. The values are stored in hdev; when the adapter is not
 * powered the reply is sent immediately, otherwise set_class_sync() is
 * submitted.
 */
static int set_dev_class(struct sock *sk, struct hci_dev *hdev, void *data,
			 u16 len)
{
	struct mgmt_cp_set_dev_class *cp = data;
	struct mgmt_pending_cmd *cmd;
	int err;

	bt_dev_dbg(hdev, "sock %p", sk);

	if (!lmp_bredr_capable(hdev))
		return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_DEV_CLASS,
				       MGMT_STATUS_NOT_SUPPORTED);

	hci_dev_lock(hdev);

	if (pending_eir_or_class(hdev)) {
		err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_DEV_CLASS,
				      MGMT_STATUS_BUSY);
		goto unlock;
	}

	if ((cp->minor & 0x03) != 0 || (cp->major & 0xe0) != 0) {
		err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_DEV_CLASS,
				      MGMT_STATUS_INVALID_PARAMS);
		goto unlock;
	}

	hdev->major_class = cp->major;
	hdev->minor_class = cp->minor;

	if (!hdev_is_powered(hdev)) {
		err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_SET_DEV_CLASS, 0,
					hdev->dev_class, 3);
		goto unlock;
	}

	cmd = mgmt_pending_new(sk, MGMT_OP_SET_DEV_CLASS, hdev, data, len);
	if (!cmd) {
		err = -ENOMEM;
		goto unlock;
	}

	/* MGMT_OP_SET_DEV_CLASS don't require adapter the UP/Running so use
	 * hci_cmd_sync_submit instead of hci_cmd_sync_queue.
*/
	err = hci_cmd_sync_submit(hdev, set_class_sync, cmd,
				  mgmt_class_complete);
	if (err < 0)
		mgmt_pending_free(cmd);

unlock:
	hci_dev_unlock(hdev);
	return err;
}

/* MGMT_OP_LOAD_LINK_KEYS handler.
 *
 * Validates key_count and the total length against the request, replaces
 * all stored link keys with the supplied ones (skipping blocked keys,
 * non-BR/EDR addresses, key types above 0x08 and debug combination keys)
 * and updates HCI_KEEP_DEBUG_KEYS from cp->debug_keys.
 */
static int load_link_keys(struct sock *sk, struct hci_dev *hdev, void *data,
			  u16 len)
{
	struct mgmt_cp_load_link_keys *cp = data;
	/* Largest key count whose total request size still fits in a u16 */
	const u16 max_key_count = ((U16_MAX - sizeof(*cp)) /
				   sizeof(struct mgmt_link_key_info));
	u16 key_count, expected_len;
	bool changed;
	int i;

	bt_dev_dbg(hdev, "sock %p", sk);

	if (!lmp_bredr_capable(hdev))
		return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_LINK_KEYS,
				       MGMT_STATUS_NOT_SUPPORTED);

	key_count = __le16_to_cpu(cp->key_count);
	if (key_count > max_key_count) {
		bt_dev_err(hdev, "load_link_keys: too big key_count value %u",
			   key_count);
		return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_LINK_KEYS,
				       MGMT_STATUS_INVALID_PARAMS);
	}

	/* The request must be exactly header plus key_count entries */
	expected_len = struct_size(cp, keys, key_count);
	if (expected_len != len) {
		bt_dev_err(hdev, "load_link_keys: expected %u bytes, got %u bytes",
			   expected_len, len);
		return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_LINK_KEYS,
				       MGMT_STATUS_INVALID_PARAMS);
	}

	if (cp->debug_keys != 0x00 && cp->debug_keys != 0x01)
		return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_LINK_KEYS,
				       MGMT_STATUS_INVALID_PARAMS);

	bt_dev_dbg(hdev, "debug_keys %u key_count %u", cp->debug_keys,
		   key_count);

	hci_dev_lock(hdev);

	hci_link_keys_clear(hdev);

	if (cp->debug_keys)
		changed = !hci_dev_test_and_set_flag(hdev, HCI_KEEP_DEBUG_KEYS);
	else
		changed = hci_dev_test_and_clear_flag(hdev,
						      HCI_KEEP_DEBUG_KEYS);

	if (changed)
		new_settings(hdev, NULL);

	for (i = 0; i < key_count; i++) {
		struct mgmt_link_key_info *key = &cp->keys[i];

		if (hci_is_blocked_key(hdev,
				       HCI_BLOCKED_KEY_TYPE_LINKKEY,
				       key->val)) {
			bt_dev_warn(hdev, "Skipping blocked link key for %pMR",
				    &key->addr.bdaddr);
			continue;
		}

		if (key->addr.type != BDADDR_BREDR) {
			bt_dev_warn(hdev,
				    "Invalid link address type %u for %pMR",
				    key->addr.type, &key->addr.bdaddr);
			continue;
		}

		if (key->type > 0x08) {
			bt_dev_warn(hdev, "Invalid link key type %u for
%pMR", key->type, &key->addr.bdaddr); continue; } /* Always ignore debug keys and require a new pairing if * the user wants to use them. */ if (key->type == HCI_LK_DEBUG_COMBINATION) continue; hci_add_link_key(hdev, NULL, &key->addr.bdaddr, key->val, key->type, key->pin_len, NULL); } mgmt_cmd_complete(sk, hdev->id, MGMT_OP_LOAD_LINK_KEYS, 0, NULL, 0); hci_dev_unlock(hdev); return 0; } static int device_unpaired(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 addr_type, struct sock *skip_sk) { struct mgmt_ev_device_unpaired ev; bacpy(&ev.addr.bdaddr, bdaddr); ev.addr.type = addr_type; return mgmt_event(MGMT_EV_DEVICE_UNPAIRED, hdev, &ev, sizeof(ev), skip_sk); } static void unpair_device_complete(struct hci_dev *hdev, void *data, int err) { struct mgmt_pending_cmd *cmd = data; struct mgmt_cp_unpair_device *cp = cmd->param; if (!err) device_unpaired(hdev, &cp->addr.bdaddr, cp->addr.type, cmd->sk); cmd->cmd_complete(cmd, err); mgmt_pending_free(cmd); } static int unpair_device_sync(struct hci_dev *hdev, void *data) { struct mgmt_pending_cmd *cmd = data; struct mgmt_cp_unpair_device *cp = cmd->param; struct hci_conn *conn; if (cp->addr.type == BDADDR_BREDR) conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &cp->addr.bdaddr); else conn = hci_conn_hash_lookup_le(hdev, &cp->addr.bdaddr, le_addr_type(cp->addr.type)); if (!conn) return 0; /* Disregard any possible error since the likes of hci_abort_conn_sync * will clean up the connection no matter the error. 
*/
	hci_abort_conn(conn, HCI_ERROR_REMOTE_USER_TERM);

	return 0;
}

/* MGMT_OP_UNPAIR_DEVICE handler.
 *
 * Removes the stored keys for the given address (link key for BR/EDR, SMP
 * pairing data for LE). If a connection has to be terminated as well,
 * unpair_device_sync() is queued; otherwise the command is completed and
 * the Device Unpaired event is sent right away.
 */
static int unpair_device(struct sock *sk, struct hci_dev *hdev, void *data,
			 u16 len)
{
	struct mgmt_cp_unpair_device *cp = data;
	struct mgmt_rp_unpair_device rp;
	struct hci_conn_params *params;
	struct mgmt_pending_cmd *cmd;
	struct hci_conn *conn;
	u8 addr_type;
	int err;

	/* Every reply echoes the address from the request */
	memset(&rp, 0, sizeof(rp));
	bacpy(&rp.addr.bdaddr, &cp->addr.bdaddr);
	rp.addr.type = cp->addr.type;

	if (!bdaddr_type_is_valid(cp->addr.type))
		return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_UNPAIR_DEVICE,
					 MGMT_STATUS_INVALID_PARAMS,
					 &rp, sizeof(rp));

	if (cp->disconnect != 0x00 && cp->disconnect != 0x01)
		return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_UNPAIR_DEVICE,
					 MGMT_STATUS_INVALID_PARAMS,
					 &rp, sizeof(rp));

	hci_dev_lock(hdev);

	if (!hdev_is_powered(hdev)) {
		err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_UNPAIR_DEVICE,
					MGMT_STATUS_NOT_POWERED, &rp,
					sizeof(rp));
		goto unlock;
	}

	if (cp->addr.type == BDADDR_BREDR) {
		/* If disconnection is requested, then look up the
		 * connection. If the remote device is connected, it
		 * will be later used to terminate the link.
		 *
		 * Setting it to NULL explicitly will cause no
		 * termination of the link.
		 */
		if (cp->disconnect)
			conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK,
						       &cp->addr.bdaddr);
		else
			conn = NULL;

		err = hci_remove_link_key(hdev, &cp->addr.bdaddr);
		if (err < 0) {
			err = mgmt_cmd_complete(sk, hdev->id,
						MGMT_OP_UNPAIR_DEVICE,
						MGMT_STATUS_NOT_PAIRED, &rp,
						sizeof(rp));
			goto unlock;
		}

		goto done;
	}

	/* LE address type */
	addr_type = le_addr_type(cp->addr.type);

	/* Abort any ongoing SMP pairing. Removes ltk and irk if they exist.
*/
	err = smp_cancel_and_remove_pairing(hdev, &cp->addr.bdaddr, addr_type);
	if (err < 0) {
		err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_UNPAIR_DEVICE,
					MGMT_STATUS_NOT_PAIRED, &rp,
					sizeof(rp));
		goto unlock;
	}

	/* Without a connection the parameters can be dropped right away */
	conn = hci_conn_hash_lookup_le(hdev, &cp->addr.bdaddr, addr_type);
	if (!conn) {
		hci_conn_params_del(hdev, &cp->addr.bdaddr, addr_type);
		goto done;
	}

	/* Defer clearing up the connection parameters until closing to
	 * give a chance of keeping them if a repairing happens.
	 */
	set_bit(HCI_CONN_PARAM_REMOVAL_PEND, &conn->flags);

	/* Disable auto-connection parameters if present */
	params = hci_conn_params_lookup(hdev, &cp->addr.bdaddr, addr_type);
	if (params) {
		if (params->explicit_connect)
			params->auto_connect = HCI_AUTO_CONN_EXPLICIT;
		else
			params->auto_connect = HCI_AUTO_CONN_DISABLED;
	}

	/* If disconnection is not requested, then clear the connection
	 * variable so that the link is not terminated.
	 */
	if (!cp->disconnect)
		conn = NULL;

done:
	/* If the connection variable is set, then termination of the
	 * link is requested.
*/
	if (!conn) {
		err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_UNPAIR_DEVICE, 0,
					&rp, sizeof(rp));
		device_unpaired(hdev, &cp->addr.bdaddr, cp->addr.type, sk);
		goto unlock;
	}

	cmd = mgmt_pending_new(sk, MGMT_OP_UNPAIR_DEVICE, hdev, cp,
			       sizeof(*cp));
	if (!cmd) {
		err = -ENOMEM;
		goto unlock;
	}

	cmd->cmd_complete = addr_cmd_complete;

	err = hci_cmd_sync_queue(hdev, unpair_device_sync, cmd,
				 unpair_device_complete);
	if (err < 0)
		mgmt_pending_free(cmd);

unlock:
	hci_dev_unlock(hdev);
	return err;
}

/* Completion callback for disconnect_sync(): completes the command with
 * the mgmt status derived from @err and frees it.
 */
static void disconnect_complete(struct hci_dev *hdev, void *data, int err)
{
	struct mgmt_pending_cmd *cmd = data;

	cmd->cmd_complete(cmd, mgmt_status(err));
	mgmt_pending_free(cmd);
}

/* Sync worker for Disconnect: looks up the connection to the address in
 * the request (ACL for BR/EDR, LE otherwise) and aborts it. Returns
 * -ENOTCONN when no such connection exists, 0 otherwise.
 */
static int disconnect_sync(struct hci_dev *hdev, void *data)
{
	struct mgmt_pending_cmd *cmd = data;
	struct mgmt_cp_disconnect *cp = cmd->param;
	struct hci_conn *conn;

	if (cp->addr.type == BDADDR_BREDR)
		conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK,
					       &cp->addr.bdaddr);
	else
		conn = hci_conn_hash_lookup_le(hdev, &cp->addr.bdaddr,
					       le_addr_type(cp->addr.type));

	if (!conn)
		return -ENOTCONN;

	/* Disregard any possible error since the likes of hci_abort_conn_sync
	 * will clean up the connection no matter the error.
*/
	hci_abort_conn(conn, HCI_ERROR_REMOTE_USER_TERM);

	return 0;
}

/* MGMT_OP_DISCONNECT handler: validates the address type, requires HCI_UP
 * and queues disconnect_sync() with disconnect_complete() as completion.
 */
static int disconnect(struct sock *sk, struct hci_dev *hdev, void *data,
		      u16 len)
{
	struct mgmt_cp_disconnect *cp = data;
	struct mgmt_rp_disconnect rp;
	struct mgmt_pending_cmd *cmd;
	int err;

	bt_dev_dbg(hdev, "sock %p", sk);

	/* Every reply echoes the address from the request */
	memset(&rp, 0, sizeof(rp));
	bacpy(&rp.addr.bdaddr, &cp->addr.bdaddr);
	rp.addr.type = cp->addr.type;

	if (!bdaddr_type_is_valid(cp->addr.type))
		return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_DISCONNECT,
					 MGMT_STATUS_INVALID_PARAMS,
					 &rp, sizeof(rp));

	hci_dev_lock(hdev);

	if (!test_bit(HCI_UP, &hdev->flags)) {
		err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_DISCONNECT,
					MGMT_STATUS_NOT_POWERED, &rp,
					sizeof(rp));
		goto failed;
	}

	cmd = mgmt_pending_new(sk, MGMT_OP_DISCONNECT, hdev, data, len);
	if (!cmd) {
		err = -ENOMEM;
		goto failed;
	}

	cmd->cmd_complete = generic_cmd_complete;

	err = hci_cmd_sync_queue(hdev, disconnect_sync, cmd,
				 disconnect_complete);
	if (err < 0)
		mgmt_pending_free(cmd);

failed:
	hci_dev_unlock(hdev);
	return err;
}

/* Maps an HCI link type and address type to a mgmt BDADDR_* address type:
 * ISO and LE links yield BDADDR_LE_PUBLIC or BDADDR_LE_RANDOM, every other
 * link type yields BDADDR_BREDR.
 */
static u8 link_to_bdaddr(u8 link_type, u8 addr_type)
{
	switch (link_type) {
	case ISO_LINK:
	case LE_LINK:
		switch (addr_type) {
		case ADDR_LE_DEV_PUBLIC:
			return BDADDR_LE_PUBLIC;

		default:
			/* Fallback to LE Random address type */
			return BDADDR_LE_RANDOM;
		}

	default:
		/* Fallback to BR/EDR type */
		return BDADDR_BREDR;
	}
}

/* MGMT_OP_GET_CONNECTIONS handler.
 *
 * Replies with the addresses of all connections that have
 * HCI_CONN_MGMT_CONNECTED set, leaving out SCO and eSCO links.
 */
static int get_connections(struct sock *sk, struct hci_dev *hdev, void *data,
			   u16 data_len)
{
	struct mgmt_rp_get_connections *rp;
	struct hci_conn *c;
	int err;
	u16 i;

	bt_dev_dbg(hdev, "sock %p", sk);

	hci_dev_lock(hdev);

	if (!hdev_is_powered(hdev)) {
		err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_GET_CONNECTIONS,
				      MGMT_STATUS_NOT_POWERED);
		goto unlock;
	}

	/* First pass: count the entries to size the reply buffer */
	i = 0;
	list_for_each_entry(c, &hdev->conn_hash.list, list) {
		if (test_bit(HCI_CONN_MGMT_CONNECTED, &c->flags))
			i++;
	}

	rp = kmalloc(struct_size(rp, addr, i), GFP_KERNEL);
	if (!rp) {
		err = -ENOMEM;
		goto unlock;
	}

	/* Second pass: fill in the addresses */
	i = 0;
	list_for_each_entry(c, &hdev->conn_hash.list, list) {
		if (!test_bit(HCI_CONN_MGMT_CONNECTED,
&c->flags))
			continue;
		bacpy(&rp->addr[i].bdaddr, &c->dst);
		rp->addr[i].type = link_to_bdaddr(c->type, c->dst_type);
		/* SCO/eSCO entries are not counted, so the slot just written
		 * is reused by the next connection.
		 */
		if (c->type == SCO_LINK || c->type == ESCO_LINK)
			continue;
		i++;
	}

	rp->conn_count = cpu_to_le16(i);

	/* Recalculate length in case of filtered SCO connections, etc */
	err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_GET_CONNECTIONS, 0, rp,
				struct_size(rp, addr, i));

	kfree(rp);

unlock:
	hci_dev_unlock(hdev);
	return err;
}

/* Adds a pending MGMT_OP_PIN_CODE_NEG_REPLY command and sends the matching
 * HCI command for cp->addr.bdaddr. The pending command is removed again
 * when sending fails.
 */
static int send_pin_code_neg_reply(struct sock *sk, struct hci_dev *hdev,
				   struct mgmt_cp_pin_code_neg_reply *cp)
{
	struct mgmt_pending_cmd *cmd;
	int err;

	cmd = mgmt_pending_add(sk, MGMT_OP_PIN_CODE_NEG_REPLY, hdev, cp,
			       sizeof(*cp));
	if (!cmd)
		return -ENOMEM;

	cmd->cmd_complete = addr_cmd_complete;

	err = hci_send_cmd(hdev, HCI_OP_PIN_CODE_NEG_REPLY,
			   sizeof(cp->addr.bdaddr), &cp->addr.bdaddr);
	if (err < 0)
		mgmt_pending_remove(cmd);

	return err;
}

/* MGMT_OP_PIN_CODE_REPLY handler.
 *
 * Requires a powered adapter and an existing ACL connection to the address.
 * When the connection's pending security level is BT_SECURITY_HIGH and the
 * PIN is not 16 bytes long, a negative reply is sent to the controller and
 * the command fails with INVALID_PARAMS. Otherwise the PIN is forwarded via
 * HCI_OP_PIN_CODE_REPLY.
 */
static int pin_code_reply(struct sock *sk, struct hci_dev *hdev, void *data,
			  u16 len)
{
	struct hci_conn *conn;
	struct mgmt_cp_pin_code_reply *cp = data;
	struct hci_cp_pin_code_reply reply;
	struct mgmt_pending_cmd *cmd;
	int err;

	bt_dev_dbg(hdev, "sock %p", sk);

	hci_dev_lock(hdev);

	if (!hdev_is_powered(hdev)) {
		err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_PIN_CODE_REPLY,
				      MGMT_STATUS_NOT_POWERED);
		goto failed;
	}

	conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &cp->addr.bdaddr);
	if (!conn) {
		err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_PIN_CODE_REPLY,
				      MGMT_STATUS_NOT_CONNECTED);
		goto failed;
	}

	if (conn->pending_sec_level == BT_SECURITY_HIGH && cp->pin_len != 16) {
		struct mgmt_cp_pin_code_neg_reply ncp;

		memcpy(&ncp.addr, &cp->addr, sizeof(ncp.addr));

		bt_dev_err(hdev, "PIN code is not 16 bytes long");

		err = send_pin_code_neg_reply(sk, hdev, &ncp);
		if (err >= 0)
			err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_PIN_CODE_REPLY,
					      MGMT_STATUS_INVALID_PARAMS);

		goto failed;
	}

	cmd = mgmt_pending_add(sk, MGMT_OP_PIN_CODE_REPLY, hdev, data, len);
	if (!cmd) {
		err = -ENOMEM;
		goto failed;
	}

	cmd->cmd_complete = addr_cmd_complete;
bacpy(&reply.bdaddr, &cp->addr.bdaddr);
	reply.pin_len = cp->pin_len;
	memcpy(reply.pin_code, cp->pin_code, sizeof(reply.pin_code));

	err = hci_send_cmd(hdev, HCI_OP_PIN_CODE_REPLY, sizeof(reply), &reply);
	if (err < 0)
		mgmt_pending_remove(cmd);

failed:
	hci_dev_unlock(hdev);
	return err;
}

/* MGMT_OP_SET_IO_CAPABILITY handler: stores cp->io_capability in hdev after
 * rejecting values above SMP_IO_KEYBOARD_DISPLAY.
 */
static int set_io_capability(struct sock *sk, struct hci_dev *hdev, void *data,
			     u16 len)
{
	struct mgmt_cp_set_io_capability *cp = data;

	bt_dev_dbg(hdev, "sock %p", sk);

	if (cp->io_capability > SMP_IO_KEYBOARD_DISPLAY)
		return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_IO_CAPABILITY,
				       MGMT_STATUS_INVALID_PARAMS);

	hci_dev_lock(hdev);

	hdev->io_capability = cp->io_capability;

	bt_dev_dbg(hdev, "IO capability set to 0x%02x", hdev->io_capability);

	hci_dev_unlock(hdev);

	return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_SET_IO_CAPABILITY, 0,
				 NULL, 0);
}

/* Returns the pending MGMT_OP_PAIR_DEVICE command whose user_data is @conn,
 * or NULL when there is none.
 */
static struct mgmt_pending_cmd *find_pairing(struct hci_conn *conn)
{
	struct hci_dev *hdev = conn->hdev;
	struct mgmt_pending_cmd *cmd;

	list_for_each_entry(cmd, &hdev->mgmt_pending, list) {
		if (cmd->opcode != MGMT_OP_PAIR_DEVICE)
			continue;

		if (cmd->user_data != conn)
			continue;

		return cmd;
	}

	return NULL;
}

/* cmd_complete callback for Pair Device.
 *
 * Sends the reply carrying the peer address and @status, detaches the
 * pairing callbacks from the connection, and releases what pair_device()
 * took: hci_conn_drop() for the connection use and hci_conn_put() for the
 * reference stored in cmd->user_data. Returns the result of the reply.
 */
static int pairing_complete(struct mgmt_pending_cmd *cmd, u8 status)
{
	struct mgmt_rp_pair_device rp;
	struct hci_conn *conn = cmd->user_data;
	int err;

	bacpy(&rp.addr.bdaddr, &conn->dst);
	rp.addr.type = link_to_bdaddr(conn->type, conn->dst_type);

	err = mgmt_cmd_complete(cmd->sk, cmd->index, MGMT_OP_PAIR_DEVICE,
				status, &rp, sizeof(rp));

	/* So we don't get further callbacks for this connection */
	conn->connect_cfm_cb = NULL;
	conn->security_cfm_cb = NULL;
	conn->disconn_cfm_cb = NULL;

	hci_conn_drop(conn);

	/* The device is paired so there is no need to remove
	 * its connection parameters anymore.
	 */
	clear_bit(HCI_CONN_PARAM_REMOVAL_PEND, &conn->flags);

	hci_conn_put(conn);

	return err;
}

/* Called when SMP pairing on @conn has finished; completes a matching
 * pending Pair Device command with success or failure.
 */
void mgmt_smp_complete(struct hci_conn *conn, bool complete)
{
	u8 status = complete ?
MGMT_STATUS_SUCCESS : MGMT_STATUS_FAILED;
	struct mgmt_pending_cmd *cmd;

	cmd = find_pairing(conn);
	if (cmd) {
		cmd->cmd_complete(cmd, status);
		mgmt_pending_remove(cmd);
	}
}

/* Connection callback installed by pair_device() for BR/EDR: completes the
 * pending Pair Device command with the translated @status.
 */
static void pairing_complete_cb(struct hci_conn *conn, u8 status)
{
	struct mgmt_pending_cmd *cmd;

	BT_DBG("status %u", status);

	cmd = find_pairing(conn);
	if (!cmd) {
		BT_DBG("Unable to find a pending command");
		return;
	}

	cmd->cmd_complete(cmd, mgmt_status(status));
	mgmt_pending_remove(cmd);
}

/* Connection callback installed by pair_device() for LE. A zero @status is
 * ignored here; only failures complete the pending Pair Device command.
 */
static void le_pairing_complete_cb(struct hci_conn *conn, u8 status)
{
	struct mgmt_pending_cmd *cmd;

	BT_DBG("status %u", status);

	if (!status)
		return;

	cmd = find_pairing(conn);
	if (!cmd) {
		BT_DBG("Unable to find a pending command");
		return;
	}

	cmd->cmd_complete(cmd, mgmt_status(status));
	mgmt_pending_remove(cmd);
}

/* MGMT_OP_PAIR_DEVICE handler.
 *
 * Validates the request, creates (or reuses) a connection to the peer,
 * registers a pending command and installs the pairing callbacks on the
 * connection.
 */
static int pair_device(struct sock *sk, struct hci_dev *hdev, void *data,
		       u16 len)
{
	struct mgmt_cp_pair_device *cp = data;
	struct mgmt_rp_pair_device rp;
	struct mgmt_pending_cmd *cmd;
	u8 sec_level, auth_type;
	struct hci_conn *conn;
	int err;

	bt_dev_dbg(hdev, "sock %p", sk);

	/* Every reply echoes the address from the request */
	memset(&rp, 0, sizeof(rp));
	bacpy(&rp.addr.bdaddr, &cp->addr.bdaddr);
	rp.addr.type = cp->addr.type;

	if (!bdaddr_type_is_valid(cp->addr.type))
		return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_PAIR_DEVICE,
					 MGMT_STATUS_INVALID_PARAMS,
					 &rp, sizeof(rp));

	if (cp->io_cap > SMP_IO_KEYBOARD_DISPLAY)
		return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_PAIR_DEVICE,
					 MGMT_STATUS_INVALID_PARAMS,
					 &rp, sizeof(rp));

	hci_dev_lock(hdev);

	if (!hdev_is_powered(hdev)) {
		err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_PAIR_DEVICE,
					MGMT_STATUS_NOT_POWERED, &rp,
					sizeof(rp));
		goto unlock;
	}

	if (hci_bdaddr_is_paired(hdev, &cp->addr.bdaddr, cp->addr.type)) {
		err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_PAIR_DEVICE,
					MGMT_STATUS_ALREADY_PAIRED, &rp,
					sizeof(rp));
		goto unlock;
	}

	sec_level = BT_SECURITY_MEDIUM;
	auth_type = HCI_AT_DEDICATED_BONDING;

	if (cp->addr.type == BDADDR_BREDR) {
		conn = hci_connect_acl(hdev, &cp->addr.bdaddr, sec_level,
				       auth_type,
CONN_REASON_PAIR_DEVICE, HCI_ACL_CONN_TIMEOUT);
	} else {
		u8 addr_type = le_addr_type(cp->addr.type);
		struct hci_conn_params *p;

		/* When pairing a new device, it is expected to remember
		 * this device for future connections. Adding the connection
		 * parameter information ahead of time allows tracking
		 * of the peripheral preferred values and will speed up any
		 * further connection establishment.
		 *
		 * If connection parameters already exist, then they
		 * will be kept and this function does nothing.
		 */
		p = hci_conn_params_add(hdev, &cp->addr.bdaddr, addr_type);
		if (!p) {
			err = -EIO;
			goto unlock;
		}

		if (p->auto_connect == HCI_AUTO_CONN_EXPLICIT)
			p->auto_connect = HCI_AUTO_CONN_DISABLED;

		conn = hci_connect_le_scan(hdev, &cp->addr.bdaddr, addr_type,
					   sec_level, HCI_LE_CONN_TIMEOUT,
					   CONN_REASON_PAIR_DEVICE);
	}

	/* Translate the connect error into a mgmt status for the reply */
	if (IS_ERR(conn)) {
		int status;

		if (PTR_ERR(conn) == -EBUSY)
			status = MGMT_STATUS_BUSY;
		else if (PTR_ERR(conn) == -EOPNOTSUPP)
			status = MGMT_STATUS_NOT_SUPPORTED;
		else if (PTR_ERR(conn) == -ECONNREFUSED)
			status = MGMT_STATUS_REJECTED;
		else
			status = MGMT_STATUS_CONNECT_FAILED;

		err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_PAIR_DEVICE,
					status, &rp, sizeof(rp));
		goto unlock;
	}

	/* A callback is already installed on this connection: report BUSY */
	if (conn->connect_cfm_cb) {
		hci_conn_drop(conn);
		err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_PAIR_DEVICE,
					MGMT_STATUS_BUSY, &rp, sizeof(rp));
		goto unlock;
	}

	cmd = mgmt_pending_add(sk, MGMT_OP_PAIR_DEVICE, hdev, data, len);
	if (!cmd) {
		err = -ENOMEM;
		hci_conn_drop(conn);
		goto unlock;
	}

	cmd->cmd_complete = pairing_complete;

	/* For LE, just connecting isn't a proof that the pairing finished */
	if (cp->addr.type == BDADDR_BREDR) {
		conn->connect_cfm_cb = pairing_complete_cb;
		conn->security_cfm_cb = pairing_complete_cb;
		conn->disconn_cfm_cb = pairing_complete_cb;
	} else {
		conn->connect_cfm_cb = le_pairing_complete_cb;
		conn->security_cfm_cb = le_pairing_complete_cb;
		conn->disconn_cfm_cb = le_pairing_complete_cb;
	}

	conn->io_capability = cp->io_cap;
	/* Reference released by hci_conn_put() in pairing_complete() */
	cmd->user_data = hci_conn_get(conn);

	if ((conn->state ==
BT_CONNECTED || conn->state == BT_CONFIG) &&
	    hci_conn_security(conn, sec_level, auth_type, true)) {
		cmd->cmd_complete(cmd, 0);
		mgmt_pending_remove(cmd);
	}

	err = 0;

unlock:
	hci_dev_unlock(hdev);
	return err;
}

/* MGMT_OP_CANCEL_PAIR_DEVICE handler.
 *
 * Requires a powered adapter and a pending Pair Device command whose
 * connection targets the given address. That command is completed with
 * MGMT_STATUS_CANCELLED, the cancel request is acknowledged, and the
 * pairing state for the address is cleaned up.
 */
static int cancel_pair_device(struct sock *sk, struct hci_dev *hdev, void *data,
			      u16 len)
{
	struct mgmt_addr_info *addr = data;
	struct mgmt_pending_cmd *cmd;
	struct hci_conn *conn;
	int err;

	bt_dev_dbg(hdev, "sock %p", sk);

	hci_dev_lock(hdev);

	if (!hdev_is_powered(hdev)) {
		err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_CANCEL_PAIR_DEVICE,
				      MGMT_STATUS_NOT_POWERED);
		goto unlock;
	}

	cmd = pending_find(MGMT_OP_PAIR_DEVICE, hdev);
	if (!cmd) {
		err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_CANCEL_PAIR_DEVICE,
				      MGMT_STATUS_INVALID_PARAMS);
		goto unlock;
	}

	conn = cmd->user_data;

	/* The pending pairing must be for the address being cancelled */
	if (bacmp(&addr->bdaddr, &conn->dst) != 0) {
		err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_CANCEL_PAIR_DEVICE,
				      MGMT_STATUS_INVALID_PARAMS);
		goto unlock;
	}

	cmd->cmd_complete(cmd, MGMT_STATUS_CANCELLED);
	mgmt_pending_remove(cmd);

	err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_CANCEL_PAIR_DEVICE, 0,
				addr, sizeof(*addr));

	/* Since user doesn't want to proceed with the connection, abort any
	 * ongoing pairing and then terminate the link if it was created
	 * because of the pair device action.
*/
	if (addr->type == BDADDR_BREDR)
		hci_remove_link_key(hdev, &addr->bdaddr);
	else
		smp_cancel_and_remove_pairing(hdev, &addr->bdaddr,
					      le_addr_type(addr->type));

	if (conn->conn_reason == CONN_REASON_PAIR_DEVICE)
		hci_abort_conn(conn, HCI_ERROR_REMOTE_USER_TERM);

unlock:
	hci_dev_unlock(hdev);
	return err;
}

/* Common implementation of the user pairing response commands.
 *
 * @mgmt_op is the mgmt opcode being answered and @hci_op the HCI command
 * used for BR/EDR; @passkey is only sent for HCI_OP_USER_PASSKEY_REPLY.
 * For LE addresses the response is passed to smp_user_confirm_reply() and
 * the command is completed immediately. For BR/EDR a pending command is
 * added and the HCI command is sent.
 */
static int user_pairing_resp(struct sock *sk, struct hci_dev *hdev,
			     struct mgmt_addr_info *addr, u16 mgmt_op,
			     u16 hci_op, __le32 passkey)
{
	struct mgmt_pending_cmd *cmd;
	struct hci_conn *conn;
	int err;

	hci_dev_lock(hdev);

	if (!hdev_is_powered(hdev)) {
		err = mgmt_cmd_complete(sk, hdev->id, mgmt_op,
					MGMT_STATUS_NOT_POWERED, addr,
					sizeof(*addr));
		goto done;
	}

	if (addr->type == BDADDR_BREDR)
		conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &addr->bdaddr);
	else
		conn = hci_conn_hash_lookup_le(hdev, &addr->bdaddr,
					       le_addr_type(addr->type));

	if (!conn) {
		err = mgmt_cmd_complete(sk, hdev->id, mgmt_op,
					MGMT_STATUS_NOT_CONNECTED, addr,
					sizeof(*addr));
		goto done;
	}

	/* LE responses go through SMP and are completed right here */
	if (addr->type == BDADDR_LE_PUBLIC || addr->type == BDADDR_LE_RANDOM) {
		err = smp_user_confirm_reply(conn, mgmt_op, passkey);
		if (!err)
			err = mgmt_cmd_complete(sk, hdev->id, mgmt_op,
						MGMT_STATUS_SUCCESS, addr,
						sizeof(*addr));
		else
			err = mgmt_cmd_complete(sk, hdev->id, mgmt_op,
						MGMT_STATUS_FAILED, addr,
						sizeof(*addr));

		goto done;
	}

	cmd = mgmt_pending_add(sk, mgmt_op, hdev, addr, sizeof(*addr));
	if (!cmd) {
		err = -ENOMEM;
		goto done;
	}

	cmd->cmd_complete = addr_cmd_complete;

	/* Continue with pairing via HCI */
	if (hci_op == HCI_OP_USER_PASSKEY_REPLY) {
		struct hci_cp_user_passkey_reply cp;

		bacpy(&cp.bdaddr, &addr->bdaddr);
		cp.passkey = passkey;
		err = hci_send_cmd(hdev, hci_op, sizeof(cp), &cp);
	} else
		err = hci_send_cmd(hdev, hci_op, sizeof(addr->bdaddr),
				   &addr->bdaddr);

	if (err < 0)
		mgmt_pending_remove(cmd);

done:
	hci_dev_unlock(hdev);
	return err;
}

/* MGMT_OP_PIN_CODE_NEG_REPLY handler: thin wrapper around
 * user_pairing_resp().
 */
static int pin_code_neg_reply(struct sock *sk, struct hci_dev *hdev,
			      void *data, u16 len)
{
	struct mgmt_cp_pin_code_neg_reply *cp = data;

	bt_dev_dbg(hdev, "sock %p", sk);

	return
user_pairing_resp(sk, hdev, &cp->addr,
				MGMT_OP_PIN_CODE_NEG_REPLY,
				HCI_OP_PIN_CODE_NEG_REPLY, 0);
}

/* MGMT_OP_USER_CONFIRM_REPLY handler: checks the exact request length and
 * forwards to user_pairing_resp().
 */
static int user_confirm_reply(struct sock *sk, struct hci_dev *hdev, void *data,
			      u16 len)
{
	struct mgmt_cp_user_confirm_reply *cp = data;

	bt_dev_dbg(hdev, "sock %p", sk);

	if (len != sizeof(*cp))
		return mgmt_cmd_status(sk, hdev->id, MGMT_OP_USER_CONFIRM_REPLY,
				       MGMT_STATUS_INVALID_PARAMS);

	return user_pairing_resp(sk, hdev, &cp->addr,
				 MGMT_OP_USER_CONFIRM_REPLY,
				 HCI_OP_USER_CONFIRM_REPLY, 0);
}

/* MGMT_OP_USER_CONFIRM_NEG_REPLY handler: wrapper around
 * user_pairing_resp().
 */
static int user_confirm_neg_reply(struct sock *sk, struct hci_dev *hdev,
				  void *data, u16 len)
{
	struct mgmt_cp_user_confirm_neg_reply *cp = data;

	bt_dev_dbg(hdev, "sock %p", sk);

	return user_pairing_resp(sk, hdev, &cp->addr,
				 MGMT_OP_USER_CONFIRM_NEG_REPLY,
				 HCI_OP_USER_CONFIRM_NEG_REPLY, 0);
}

/* MGMT_OP_USER_PASSKEY_REPLY handler: wrapper around user_pairing_resp()
 * that forwards cp->passkey.
 */
static int user_passkey_reply(struct sock *sk, struct hci_dev *hdev, void *data,
			      u16 len)
{
	struct mgmt_cp_user_passkey_reply *cp = data;

	bt_dev_dbg(hdev, "sock %p", sk);

	return user_pairing_resp(sk, hdev, &cp->addr,
				 MGMT_OP_USER_PASSKEY_REPLY,
				 HCI_OP_USER_PASSKEY_REPLY, cp->passkey);
}

/* MGMT_OP_USER_PASSKEY_NEG_REPLY handler: wrapper around
 * user_pairing_resp().
 */
static int user_passkey_neg_reply(struct sock *sk, struct hci_dev *hdev,
				  void *data, u16 len)
{
	struct mgmt_cp_user_passkey_neg_reply *cp = data;

	bt_dev_dbg(hdev, "sock %p", sk);

	return user_pairing_resp(sk, hdev, &cp->addr,
				 MGMT_OP_USER_PASSKEY_NEG_REPLY,
				 HCI_OP_USER_PASSKEY_NEG_REPLY, 0);
}

/* If the current advertising instance has any of @flags set, cancels its
 * timeout and schedules the next instance. Always returns 0.
 */
static int adv_expire_sync(struct hci_dev *hdev, u32 flags)
{
	struct adv_info *adv_instance;

	adv_instance = hci_find_adv_instance(hdev, hdev->cur_adv_instance);
	if (!adv_instance)
		return 0;

	/* stop if current instance doesn't need to be changed */
	if (!(adv_instance->flags & flags))
		return 0;

	cancel_adv_timeout(hdev);

	adv_instance = hci_get_next_instance(hdev, adv_instance->instance);
	if (!adv_instance)
		return 0;

	hci_schedule_adv_instance_sync(hdev, adv_instance->instance, true);

	return 0;
}

/* Sync worker queued after a local name change: expires the current
 * advertising instance if it has MGMT_ADV_FLAG_LOCAL_NAME set.
 */
static int name_changed_sync(struct hci_dev *hdev, void *data)
{
	return adv_expire_sync(hdev, MGMT_ADV_FLAG_LOCAL_NAME);
}

/* Completion callback for set_name_sync() */
static
void set_name_complete(struct hci_dev *hdev, void *data, int err) { struct mgmt_pending_cmd *cmd = data; struct mgmt_cp_set_local_name *cp = cmd->param; u8 status = mgmt_status(err); bt_dev_dbg(hdev, "err %d", err); if (err == -ECANCELED || cmd != pending_find(MGMT_OP_SET_LOCAL_NAME, hdev)) return; if (status) { mgmt_cmd_status(cmd->sk, hdev->id, MGMT_OP_SET_LOCAL_NAME, status); } else { mgmt_cmd_complete(cmd->sk, hdev->id, MGMT_OP_SET_LOCAL_NAME, 0, cp, sizeof(*cp)); if (hci_dev_test_flag(hdev, HCI_LE_ADV)) hci_cmd_sync_queue(hdev, name_changed_sync, NULL, NULL); } mgmt_pending_remove(cmd); } static int set_name_sync(struct hci_dev *hdev, void *data) { if (lmp_bredr_capable(hdev)) { hci_update_name_sync(hdev); hci_update_eir_sync(hdev); } /* The name is stored in the scan response data and so * no need to update the advertising data here. */ if (lmp_le_capable(hdev) && hci_dev_test_flag(hdev, HCI_ADVERTISING)) hci_update_scan_rsp_data_sync(hdev, hdev->cur_adv_instance); return 0; } static int set_local_name(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) { struct mgmt_cp_set_local_name *cp = data; struct mgmt_pending_cmd *cmd; int err; bt_dev_dbg(hdev, "sock %p", sk); hci_dev_lock(hdev); /* If the old values are the same as the new ones just return a * direct command complete event. 
*/ if (!memcmp(hdev->dev_name, cp->name, sizeof(hdev->dev_name)) && !memcmp(hdev->short_name, cp->short_name, sizeof(hdev->short_name))) { err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_SET_LOCAL_NAME, 0, data, len); goto failed; } memcpy(hdev->short_name, cp->short_name, sizeof(hdev->short_name)); if (!hdev_is_powered(hdev)) { memcpy(hdev->dev_name, cp->name, sizeof(hdev->dev_name)); err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_SET_LOCAL_NAME, 0, data, len); if (err < 0) goto failed; err = mgmt_limited_event(MGMT_EV_LOCAL_NAME_CHANGED, hdev, data, len, HCI_MGMT_LOCAL_NAME_EVENTS, sk); ext_info_changed(hdev, sk); goto failed; } cmd = mgmt_pending_add(sk, MGMT_OP_SET_LOCAL_NAME, hdev, data, len); if (!cmd) err = -ENOMEM; else err = hci_cmd_sync_queue(hdev, set_name_sync, cmd, set_name_complete); if (err < 0) { err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_LOCAL_NAME, MGMT_STATUS_FAILED); if (cmd) mgmt_pending_remove(cmd); goto failed; } memcpy(hdev->dev_name, cp->name, sizeof(hdev->dev_name)); failed: hci_dev_unlock(hdev); return err; } static int appearance_changed_sync(struct hci_dev *hdev, void *data) { return adv_expire_sync(hdev, MGMT_ADV_FLAG_APPEARANCE); } static int set_appearance(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) { struct mgmt_cp_set_appearance *cp = data; u16 appearance; int err; bt_dev_dbg(hdev, "sock %p", sk); if (!lmp_le_capable(hdev)) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_APPEARANCE, MGMT_STATUS_NOT_SUPPORTED); appearance = le16_to_cpu(cp->appearance); hci_dev_lock(hdev); if (hdev->appearance != appearance) { hdev->appearance = appearance; if (hci_dev_test_flag(hdev, HCI_LE_ADV)) hci_cmd_sync_queue(hdev, appearance_changed_sync, NULL, NULL); ext_info_changed(hdev, sk); } err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_SET_APPEARANCE, 0, NULL, 0); hci_dev_unlock(hdev); return err; } static int get_phy_configuration(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) { struct mgmt_rp_get_phy_configuration rp; 
bt_dev_dbg(hdev, "sock %p", sk); hci_dev_lock(hdev); memset(&rp, 0, sizeof(rp)); rp.supported_phys = cpu_to_le32(get_supported_phys(hdev)); rp.selected_phys = cpu_to_le32(get_selected_phys(hdev)); rp.configurable_phys = cpu_to_le32(get_configurable_phys(hdev)); hci_dev_unlock(hdev); return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_GET_PHY_CONFIGURATION, 0, &rp, sizeof(rp)); } int mgmt_phy_configuration_changed(struct hci_dev *hdev, struct sock *skip) { struct mgmt_ev_phy_configuration_changed ev; memset(&ev, 0, sizeof(ev)); ev.selected_phys = cpu_to_le32(get_selected_phys(hdev)); return mgmt_event(MGMT_EV_PHY_CONFIGURATION_CHANGED, hdev, &ev, sizeof(ev), skip); } static void set_default_phy_complete(struct hci_dev *hdev, void *data, int err) { struct mgmt_pending_cmd *cmd = data; struct sk_buff *skb = cmd->skb; u8 status = mgmt_status(err); if (err == -ECANCELED || cmd != pending_find(MGMT_OP_SET_PHY_CONFIGURATION, hdev)) return; if (!status) { if (!skb) status = MGMT_STATUS_FAILED; else if (IS_ERR(skb)) status = mgmt_status(PTR_ERR(skb)); else status = mgmt_status(skb->data[0]); } bt_dev_dbg(hdev, "status %d", status); if (status) { mgmt_cmd_status(cmd->sk, hdev->id, MGMT_OP_SET_PHY_CONFIGURATION, status); } else { mgmt_cmd_complete(cmd->sk, hdev->id, MGMT_OP_SET_PHY_CONFIGURATION, 0, NULL, 0); mgmt_phy_configuration_changed(hdev, cmd->sk); } if (skb && !IS_ERR(skb)) kfree_skb(skb); mgmt_pending_remove(cmd); } static int set_default_phy_sync(struct hci_dev *hdev, void *data) { struct mgmt_pending_cmd *cmd = data; struct mgmt_cp_set_phy_configuration *cp = cmd->param; struct hci_cp_le_set_default_phy cp_phy; u32 selected_phys = __le32_to_cpu(cp->selected_phys); memset(&cp_phy, 0, sizeof(cp_phy)); if (!(selected_phys & MGMT_PHY_LE_TX_MASK)) cp_phy.all_phys |= 0x01; if (!(selected_phys & MGMT_PHY_LE_RX_MASK)) cp_phy.all_phys |= 0x02; if (selected_phys & MGMT_PHY_LE_1M_TX) cp_phy.tx_phys |= HCI_LE_SET_PHY_1M; if (selected_phys & MGMT_PHY_LE_2M_TX) cp_phy.tx_phys |= 
HCI_LE_SET_PHY_2M; if (selected_phys & MGMT_PHY_LE_CODED_TX) cp_phy.tx_phys |= HCI_LE_SET_PHY_CODED; if (selected_phys & MGMT_PHY_LE_1M_RX) cp_phy.rx_phys |= HCI_LE_SET_PHY_1M; if (selected_phys & MGMT_PHY_LE_2M_RX) cp_phy.rx_phys |= HCI_LE_SET_PHY_2M; if (selected_phys & MGMT_PHY_LE_CODED_RX) cp_phy.rx_phys |= HCI_LE_SET_PHY_CODED; cmd->skb = __hci_cmd_sync(hdev, HCI_OP_LE_SET_DEFAULT_PHY, sizeof(cp_phy), &cp_phy, HCI_CMD_TIMEOUT); return 0; } static int set_phy_configuration(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) { struct mgmt_cp_set_phy_configuration *cp = data; struct mgmt_pending_cmd *cmd; u32 selected_phys, configurable_phys, supported_phys, unconfigure_phys; u16 pkt_type = (HCI_DH1 | HCI_DM1); bool changed = false; int err; bt_dev_dbg(hdev, "sock %p", sk); configurable_phys = get_configurable_phys(hdev); supported_phys = get_supported_phys(hdev); selected_phys = __le32_to_cpu(cp->selected_phys); if (selected_phys & ~supported_phys) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_PHY_CONFIGURATION, MGMT_STATUS_INVALID_PARAMS); unconfigure_phys = supported_phys & ~configurable_phys; if ((selected_phys & unconfigure_phys) != unconfigure_phys) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_PHY_CONFIGURATION, MGMT_STATUS_INVALID_PARAMS); if (selected_phys == get_selected_phys(hdev)) return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_SET_PHY_CONFIGURATION, 0, NULL, 0); hci_dev_lock(hdev); if (!hdev_is_powered(hdev)) { err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_PHY_CONFIGURATION, MGMT_STATUS_REJECTED); goto unlock; } if (pending_find(MGMT_OP_SET_PHY_CONFIGURATION, hdev)) { err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_PHY_CONFIGURATION, MGMT_STATUS_BUSY); goto unlock; } if (selected_phys & MGMT_PHY_BR_1M_3SLOT) pkt_type |= (HCI_DH3 | HCI_DM3); else pkt_type &= ~(HCI_DH3 | HCI_DM3); if (selected_phys & MGMT_PHY_BR_1M_5SLOT) pkt_type |= (HCI_DH5 | HCI_DM5); else pkt_type &= ~(HCI_DH5 | HCI_DM5); if (selected_phys & MGMT_PHY_EDR_2M_1SLOT) 
pkt_type &= ~HCI_2DH1; else pkt_type |= HCI_2DH1; if (selected_phys & MGMT_PHY_EDR_2M_3SLOT) pkt_type &= ~HCI_2DH3; else pkt_type |= HCI_2DH3; if (selected_phys & MGMT_PHY_EDR_2M_5SLOT) pkt_type &= ~HCI_2DH5; else pkt_type |= HCI_2DH5; if (selected_phys & MGMT_PHY_EDR_3M_1SLOT) pkt_type &= ~HCI_3DH1; else pkt_type |= HCI_3DH1; if (selected_phys & MGMT_PHY_EDR_3M_3SLOT) pkt_type &= ~HCI_3DH3; else pkt_type |= HCI_3DH3; if (selected_phys & MGMT_PHY_EDR_3M_5SLOT) pkt_type &= ~HCI_3DH5; else pkt_type |= HCI_3DH5; if (pkt_type != hdev->pkt_type) { hdev->pkt_type = pkt_type; changed = true; } if ((selected_phys & MGMT_PHY_LE_MASK) == (get_selected_phys(hdev) & MGMT_PHY_LE_MASK)) { if (changed) mgmt_phy_configuration_changed(hdev, sk); err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_SET_PHY_CONFIGURATION, 0, NULL, 0); goto unlock; } cmd = mgmt_pending_add(sk, MGMT_OP_SET_PHY_CONFIGURATION, hdev, data, len); if (!cmd) err = -ENOMEM; else err = hci_cmd_sync_queue(hdev, set_default_phy_sync, cmd, set_default_phy_complete); if (err < 0) { err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_PHY_CONFIGURATION, MGMT_STATUS_FAILED); if (cmd) mgmt_pending_remove(cmd); } unlock: hci_dev_unlock(hdev); return err; } static int set_blocked_keys(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) { int err = MGMT_STATUS_SUCCESS; struct mgmt_cp_set_blocked_keys *keys = data; const u16 max_key_count = ((U16_MAX - sizeof(*keys)) / sizeof(struct mgmt_blocked_key_info)); u16 key_count, expected_len; int i; bt_dev_dbg(hdev, "sock %p", sk); key_count = __le16_to_cpu(keys->key_count); if (key_count > max_key_count) { bt_dev_err(hdev, "too big key_count value %u", key_count); return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_BLOCKED_KEYS, MGMT_STATUS_INVALID_PARAMS); } expected_len = struct_size(keys, keys, key_count); if (expected_len != len) { bt_dev_err(hdev, "expected %u bytes, got %u bytes", expected_len, len); return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_BLOCKED_KEYS, 
MGMT_STATUS_INVALID_PARAMS); } hci_dev_lock(hdev); hci_blocked_keys_clear(hdev); for (i = 0; i < key_count; ++i) { struct blocked_key *b = kzalloc(sizeof(*b), GFP_KERNEL); if (!b) { err = MGMT_STATUS_NO_RESOURCES; break; } b->type = keys->keys[i].type; memcpy(b->val, keys->keys[i].val, sizeof(b->val)); list_add_rcu(&b->list, &hdev->blocked_keys); } hci_dev_unlock(hdev); return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_SET_BLOCKED_KEYS, err, NULL, 0); } static int set_wideband_speech(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) { struct mgmt_mode *cp = data; int err; bool changed = false; bt_dev_dbg(hdev, "sock %p", sk); if (!test_bit(HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED, &hdev->quirks)) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_WIDEBAND_SPEECH, MGMT_STATUS_NOT_SUPPORTED); if (cp->val != 0x00 && cp->val != 0x01) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_WIDEBAND_SPEECH, MGMT_STATUS_INVALID_PARAMS); hci_dev_lock(hdev); if (hdev_is_powered(hdev) && !!cp->val != hci_dev_test_flag(hdev, HCI_WIDEBAND_SPEECH_ENABLED)) { err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_WIDEBAND_SPEECH, MGMT_STATUS_REJECTED); goto unlock; } if (cp->val) changed = !hci_dev_test_and_set_flag(hdev, HCI_WIDEBAND_SPEECH_ENABLED); else changed = hci_dev_test_and_clear_flag(hdev, HCI_WIDEBAND_SPEECH_ENABLED); err = send_settings_rsp(sk, MGMT_OP_SET_WIDEBAND_SPEECH, hdev); if (err < 0) goto unlock; if (changed) err = new_settings(hdev, sk); unlock: hci_dev_unlock(hdev); return err; } static int read_controller_cap(struct sock *sk, struct hci_dev *hdev, void *data, u16 data_len) { char buf[20]; struct mgmt_rp_read_controller_cap *rp = (void *)buf; u16 cap_len = 0; u8 flags = 0; u8 tx_power_range[2]; bt_dev_dbg(hdev, "sock %p", sk); memset(&buf, 0, sizeof(buf)); hci_dev_lock(hdev); /* When the Read Simple Pairing Options command is supported, then * the remote public key validation is supported. 
*
	 * Alternatively, when Microsoft extensions are available, they can
	 * indicate support for public key validation as well.
	 */
	if ((hdev->commands[41] & 0x08) || msft_curve_validity(hdev))
		flags |= 0x01;	/* Remote public key validation (BR/EDR) */

	flags |= 0x02;		/* Remote public key validation (LE) */

	/* When the Read Encryption Key Size command is supported, then the
	 * encryption key size is enforced.
	 */
	if (hdev->commands[20] & 0x10)
		flags |= 0x04;	/* Encryption key size enforcement (BR/EDR) */

	flags |= 0x08;		/* Encryption key size enforcement (LE) */

	/* Each eir_append_*() call returns the new length of rp->cap */
	cap_len = eir_append_data(rp->cap, cap_len, MGMT_CAP_SEC_FLAGS,
				  &flags, 1);

	/* When the Read Simple Pairing Options command is supported, then
	 * also max encryption key size information is provided.
	 */
	if (hdev->commands[41] & 0x08)
		cap_len = eir_append_le16(rp->cap, cap_len,
					  MGMT_CAP_MAX_ENC_KEY_SIZE,
					  hdev->max_enc_key_size);

	cap_len = eir_append_le16(rp->cap, cap_len,
				  MGMT_CAP_SMP_MAX_ENC_KEY_SIZE,
				  SMP_MAX_ENC_KEY_SIZE);

	/* Append the min/max LE tx power parameters if we were able to fetch
	 * it from the controller
	 */
	if (hdev->commands[38] & 0x80) {
		memcpy(&tx_power_range[0], &hdev->min_le_tx_power, 1);
		memcpy(&tx_power_range[1], &hdev->max_le_tx_power, 1);
		cap_len = eir_append_data(rp->cap, cap_len, MGMT_CAP_LE_TX_PWR,
					  tx_power_range, 2);
	}

	rp->cap_len = cpu_to_le16(cap_len);

	hci_dev_unlock(hdev);

	/* Reply is the fixed header followed by the cap_len appended bytes */
	return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_READ_CONTROLLER_CAP, 0,
				 rp, sizeof(*rp) + cap_len);
}

/* Experimental feature UUIDs. Each array lists the bytes of the UUID given in
 * its comment in reverse order.
 */
#ifdef CONFIG_BT_FEATURE_DEBUG
/* d4992530-b9ec-469f-ab01-6c481c47da1c */
static const u8 debug_uuid[16] = {
	0x1c, 0xda, 0x47, 0x1c, 0x48, 0x6c, 0x01, 0xab,
	0x9f, 0x46, 0xec, 0xb9, 0x30, 0x25, 0x99, 0xd4,
};
#endif

/* 330859bc-7506-492d-9370-9a6f0614037f */
static const u8 quality_report_uuid[16] = {
	0x7f, 0x03, 0x14, 0x06, 0x6f, 0x9a, 0x70, 0x93,
	0x2d, 0x49, 0x06, 0x75, 0xbc, 0x59, 0x08, 0x33,
};

/* a6695ace-ee7f-4fb9-881a-5fac66c629af */
static const u8 offload_codecs_uuid[16] = {
	0xaf, 0x29, 0xc6, 0x66, 0xac, 0x5f,
0x1a, 0x88,
	0xb9, 0x4f, 0x7f, 0xee, 0xce, 0x5a, 0x69, 0xa6,
};

/* 671b10b5-42c0-4696-9227-eb28d1b049d6 */
static const u8 le_simultaneous_roles_uuid[16] = {
	0xd6, 0x49, 0xb0, 0xd1, 0x28, 0xeb, 0x27, 0x92,
	0x96, 0x46, 0xc0, 0x42, 0xb5, 0x10, 0x1b, 0x67,
};

/* 6fbaf188-05e0-496a-9885-d6ddfdb4e03e */
static const u8 iso_socket_uuid[16] = {
	0x3e, 0xe0, 0xb4, 0xfd, 0xdd, 0xd6, 0x85, 0x98,
	0x6a, 0x49, 0xe0, 0x05, 0x88, 0xf1, 0xba, 0x6f,
};

/* 2ce463d7-7a03-4d8d-bf05-5f24e8f36e76 */
static const u8 mgmt_mesh_uuid[16] = {
	0x76, 0x6e, 0xf3, 0xe8, 0x24, 0x5f, 0x05, 0xbf,
	0x8d, 0x4d, 0x03, 0x7a, 0xd7, 0x63, 0xe4, 0x2c,
};

/* MGMT_OP_READ_EXP_FEATURES_INFO handler: builds a list of (uuid, flags)
 * entries for the experimental features that apply to the addressed index.
 * hdev is NULL when the command targets the non-controller index; bit 0 of
 * each entry's flags reports whether the feature is currently enabled.
 */
static int read_exp_features_info(struct sock *sk, struct hci_dev *hdev,
				  void *data, u16 data_len)
{
	struct mgmt_rp_read_exp_features_info *rp;
	size_t len;
	u16 idx = 0;	/* number of entries filled in so far */
	u32 flags;
	int status;

	bt_dev_dbg(hdev, "sock %p", sk);

	/* Enough space for 7 features */
	len = sizeof(*rp) + (sizeof(rp->features[0]) * 7);
	rp = kzalloc(len, GFP_KERNEL);
	if (!rp)
		return -ENOMEM;

#ifdef CONFIG_BT_FEATURE_DEBUG
	/* Debug feature: only reported on the non-controller index */
	if (!hdev) {
		flags = bt_dbg_get() ? BIT(0) : 0;

		memcpy(rp->features[idx].uuid, debug_uuid, 16);
		rp->features[idx].flags = cpu_to_le32(flags);
		idx++;
	}
#endif

	/* LE simultaneous roles */
	if (hdev && hci_dev_le_state_simultaneous(hdev)) {
		if (hci_dev_test_flag(hdev, HCI_LE_SIMULTANEOUS_ROLES))
			flags = BIT(0);
		else
			flags = 0;

		memcpy(rp->features[idx].uuid, le_simultaneous_roles_uuid, 16);
		rp->features[idx].flags = cpu_to_le32(flags);
		idx++;
	}

	/* Quality report, via AOSP extension or a driver callback */
	if (hdev && (aosp_has_quality_report(hdev) ||
		     hdev->set_quality_report)) {
		if (hci_dev_test_flag(hdev, HCI_QUALITY_REPORT))
			flags = BIT(0);
		else
			flags = 0;

		memcpy(rp->features[idx].uuid, quality_report_uuid, 16);
		rp->features[idx].flags = cpu_to_le32(flags);
		idx++;
	}

	/* Offload codecs: needs the driver's get_data_path_id callback */
	if (hdev && hdev->get_data_path_id) {
		if (hci_dev_test_flag(hdev, HCI_OFFLOAD_CODECS_ENABLED))
			flags = BIT(0);
		else
			flags = 0;

		memcpy(rp->features[idx].uuid, offload_codecs_uuid, 16);
		rp->features[idx].flags = cpu_to_le32(flags);
		idx++;
	}

	/* ISO socket: reported whenever CONFIG_BT_LE is enabled */
	if (IS_ENABLED(CONFIG_BT_LE)) {
		flags = iso_enabled() ?
BIT(0) : 0; memcpy(rp->features[idx].uuid, iso_socket_uuid, 16); rp->features[idx].flags = cpu_to_le32(flags); idx++; } if (hdev && lmp_le_capable(hdev)) { if (hci_dev_test_flag(hdev, HCI_MESH_EXPERIMENTAL)) flags = BIT(0); else flags = 0; memcpy(rp->features[idx].uuid, mgmt_mesh_uuid, 16); rp->features[idx].flags = cpu_to_le32(flags); idx++; } rp->feature_count = cpu_to_le16(idx); /* After reading the experimental features information, enable * the events to update client on any future change. */ hci_sock_set_flag(sk, HCI_MGMT_EXP_FEATURE_EVENTS); status = mgmt_cmd_complete(sk, hdev ? hdev->id : MGMT_INDEX_NONE, MGMT_OP_READ_EXP_FEATURES_INFO, 0, rp, sizeof(*rp) + (20 * idx)); kfree(rp); return status; } static int exp_feature_changed(struct hci_dev *hdev, const u8 *uuid, bool enabled, struct sock *skip) { struct mgmt_ev_exp_feature_changed ev; memset(&ev, 0, sizeof(ev)); memcpy(ev.uuid, uuid, 16); ev.flags = cpu_to_le32(enabled ? BIT(0) : 0); return mgmt_limited_event(MGMT_EV_EXP_FEATURE_CHANGED, hdev, &ev, sizeof(ev), HCI_MGMT_EXP_FEATURE_EVENTS, skip); } #define EXP_FEAT(_uuid, _set_func) \ { \ .uuid = _uuid, \ .set_func = _set_func, \ } /* The zero key uuid is special. Multiple exp features are set through it. */ static int set_zero_key_func(struct sock *sk, struct hci_dev *hdev, struct mgmt_cp_set_exp_feature *cp, u16 data_len) { struct mgmt_rp_set_exp_feature rp; memset(rp.uuid, 0, 16); rp.flags = cpu_to_le32(0); #ifdef CONFIG_BT_FEATURE_DEBUG if (!hdev) { bool changed = bt_dbg_get(); bt_dbg_set(false); if (changed) exp_feature_changed(NULL, ZERO_KEY, false, sk); } #endif hci_sock_set_flag(sk, HCI_MGMT_EXP_FEATURE_EVENTS); return mgmt_cmd_complete(sk, hdev ? 
hdev->id : MGMT_INDEX_NONE, MGMT_OP_SET_EXP_FEATURE, 0, &rp, sizeof(rp)); } #ifdef CONFIG_BT_FEATURE_DEBUG static int set_debug_func(struct sock *sk, struct hci_dev *hdev, struct mgmt_cp_set_exp_feature *cp, u16 data_len) { struct mgmt_rp_set_exp_feature rp; bool val, changed; int err; /* Command requires to use the non-controller index */ if (hdev) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_EXP_FEATURE, MGMT_STATUS_INVALID_INDEX); /* Parameters are limited to a single octet */ if (data_len != MGMT_SET_EXP_FEATURE_SIZE + 1) return mgmt_cmd_status(sk, MGMT_INDEX_NONE, MGMT_OP_SET_EXP_FEATURE, MGMT_STATUS_INVALID_PARAMS); /* Only boolean on/off is supported */ if (cp->param[0] != 0x00 && cp->param[0] != 0x01) return mgmt_cmd_status(sk, MGMT_INDEX_NONE, MGMT_OP_SET_EXP_FEATURE, MGMT_STATUS_INVALID_PARAMS); val = !!cp->param[0]; changed = val ? !bt_dbg_get() : bt_dbg_get(); bt_dbg_set(val); memcpy(rp.uuid, debug_uuid, 16); rp.flags = cpu_to_le32(val ? BIT(0) : 0); hci_sock_set_flag(sk, HCI_MGMT_EXP_FEATURE_EVENTS); err = mgmt_cmd_complete(sk, MGMT_INDEX_NONE, MGMT_OP_SET_EXP_FEATURE, 0, &rp, sizeof(rp)); if (changed) exp_feature_changed(hdev, debug_uuid, val, sk); return err; } #endif static int set_mgmt_mesh_func(struct sock *sk, struct hci_dev *hdev, struct mgmt_cp_set_exp_feature *cp, u16 data_len) { struct mgmt_rp_set_exp_feature rp; bool val, changed; int err; /* Command requires to use the controller index */ if (!hdev) return mgmt_cmd_status(sk, MGMT_INDEX_NONE, MGMT_OP_SET_EXP_FEATURE, MGMT_STATUS_INVALID_INDEX); /* Parameters are limited to a single octet */ if (data_len != MGMT_SET_EXP_FEATURE_SIZE + 1) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_EXP_FEATURE, MGMT_STATUS_INVALID_PARAMS); /* Only boolean on/off is supported */ if (cp->param[0] != 0x00 && cp->param[0] != 0x01) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_EXP_FEATURE, MGMT_STATUS_INVALID_PARAMS); val = !!cp->param[0]; if (val) { changed = !hci_dev_test_and_set_flag(hdev, 
HCI_MESH_EXPERIMENTAL); } else { hci_dev_clear_flag(hdev, HCI_MESH); changed = hci_dev_test_and_clear_flag(hdev, HCI_MESH_EXPERIMENTAL); } memcpy(rp.uuid, mgmt_mesh_uuid, 16); rp.flags = cpu_to_le32(val ? BIT(0) : 0); hci_sock_set_flag(sk, HCI_MGMT_EXP_FEATURE_EVENTS); err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_SET_EXP_FEATURE, 0, &rp, sizeof(rp)); if (changed) exp_feature_changed(hdev, mgmt_mesh_uuid, val, sk); return err; } static int set_quality_report_func(struct sock *sk, struct hci_dev *hdev, struct mgmt_cp_set_exp_feature *cp, u16 data_len) { struct mgmt_rp_set_exp_feature rp; bool val, changed; int err; /* Command requires to use a valid controller index */ if (!hdev) return mgmt_cmd_status(sk, MGMT_INDEX_NONE, MGMT_OP_SET_EXP_FEATURE, MGMT_STATUS_INVALID_INDEX); /* Parameters are limited to a single octet */ if (data_len != MGMT_SET_EXP_FEATURE_SIZE + 1) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_EXP_FEATURE, MGMT_STATUS_INVALID_PARAMS); /* Only boolean on/off is supported */ if (cp->param[0] != 0x00 && cp->param[0] != 0x01) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_EXP_FEATURE, MGMT_STATUS_INVALID_PARAMS); hci_req_sync_lock(hdev); val = !!cp->param[0]; changed = (val != hci_dev_test_flag(hdev, HCI_QUALITY_REPORT)); if (!aosp_has_quality_report(hdev) && !hdev->set_quality_report) { err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_EXP_FEATURE, MGMT_STATUS_NOT_SUPPORTED); goto unlock_quality_report; } if (changed) { if (hdev->set_quality_report) err = hdev->set_quality_report(hdev, val); else err = aosp_set_quality_report(hdev, val); if (err) { err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_EXP_FEATURE, MGMT_STATUS_FAILED); goto unlock_quality_report; } if (val) hci_dev_set_flag(hdev, HCI_QUALITY_REPORT); else hci_dev_clear_flag(hdev, HCI_QUALITY_REPORT); } bt_dev_dbg(hdev, "quality report enable %d changed %d", val, changed); memcpy(rp.uuid, quality_report_uuid, 16); rp.flags = cpu_to_le32(val ? 
BIT(0) : 0); hci_sock_set_flag(sk, HCI_MGMT_EXP_FEATURE_EVENTS); err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_SET_EXP_FEATURE, 0, &rp, sizeof(rp)); if (changed) exp_feature_changed(hdev, quality_report_uuid, val, sk); unlock_quality_report: hci_req_sync_unlock(hdev); return err; } static int set_offload_codec_func(struct sock *sk, struct hci_dev *hdev, struct mgmt_cp_set_exp_feature *cp, u16 data_len) { bool val, changed; int err; struct mgmt_rp_set_exp_feature rp; /* Command requires to use a valid controller index */ if (!hdev) return mgmt_cmd_status(sk, MGMT_INDEX_NONE, MGMT_OP_SET_EXP_FEATURE, MGMT_STATUS_INVALID_INDEX); /* Parameters are limited to a single octet */ if (data_len != MGMT_SET_EXP_FEATURE_SIZE + 1) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_EXP_FEATURE, MGMT_STATUS_INVALID_PARAMS); /* Only boolean on/off is supported */ if (cp->param[0] != 0x00 && cp->param[0] != 0x01) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_EXP_FEATURE, MGMT_STATUS_INVALID_PARAMS); val = !!cp->param[0]; changed = (val != hci_dev_test_flag(hdev, HCI_OFFLOAD_CODECS_ENABLED)); if (!hdev->get_data_path_id) { return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_EXP_FEATURE, MGMT_STATUS_NOT_SUPPORTED); } if (changed) { if (val) hci_dev_set_flag(hdev, HCI_OFFLOAD_CODECS_ENABLED); else hci_dev_clear_flag(hdev, HCI_OFFLOAD_CODECS_ENABLED); } bt_dev_info(hdev, "offload codecs enable %d changed %d", val, changed); memcpy(rp.uuid, offload_codecs_uuid, 16); rp.flags = cpu_to_le32(val ? 
BIT(0) : 0); hci_sock_set_flag(sk, HCI_MGMT_EXP_FEATURE_EVENTS); err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_SET_EXP_FEATURE, 0, &rp, sizeof(rp)); if (changed) exp_feature_changed(hdev, offload_codecs_uuid, val, sk); return err; } static int set_le_simultaneous_roles_func(struct sock *sk, struct hci_dev *hdev, struct mgmt_cp_set_exp_feature *cp, u16 data_len) { bool val, changed; int err; struct mgmt_rp_set_exp_feature rp; /* Command requires to use a valid controller index */ if (!hdev) return mgmt_cmd_status(sk, MGMT_INDEX_NONE, MGMT_OP_SET_EXP_FEATURE, MGMT_STATUS_INVALID_INDEX); /* Parameters are limited to a single octet */ if (data_len != MGMT_SET_EXP_FEATURE_SIZE + 1) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_EXP_FEATURE, MGMT_STATUS_INVALID_PARAMS); /* Only boolean on/off is supported */ if (cp->param[0] != 0x00 && cp->param[0] != 0x01) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_EXP_FEATURE, MGMT_STATUS_INVALID_PARAMS); val = !!cp->param[0]; changed = (val != hci_dev_test_flag(hdev, HCI_LE_SIMULTANEOUS_ROLES)); if (!hci_dev_le_state_simultaneous(hdev)) { return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_EXP_FEATURE, MGMT_STATUS_NOT_SUPPORTED); } if (changed) { if (val) hci_dev_set_flag(hdev, HCI_LE_SIMULTANEOUS_ROLES); else hci_dev_clear_flag(hdev, HCI_LE_SIMULTANEOUS_ROLES); } bt_dev_info(hdev, "LE simultaneous roles enable %d changed %d", val, changed); memcpy(rp.uuid, le_simultaneous_roles_uuid, 16); rp.flags = cpu_to_le32(val ? 
BIT(0) : 0); hci_sock_set_flag(sk, HCI_MGMT_EXP_FEATURE_EVENTS); err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_SET_EXP_FEATURE, 0, &rp, sizeof(rp)); if (changed) exp_feature_changed(hdev, le_simultaneous_roles_uuid, val, sk); return err; } #ifdef CONFIG_BT_LE static int set_iso_socket_func(struct sock *sk, struct hci_dev *hdev, struct mgmt_cp_set_exp_feature *cp, u16 data_len) { struct mgmt_rp_set_exp_feature rp; bool val, changed = false; int err; /* Command requires to use the non-controller index */ if (hdev) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_EXP_FEATURE, MGMT_STATUS_INVALID_INDEX); /* Parameters are limited to a single octet */ if (data_len != MGMT_SET_EXP_FEATURE_SIZE + 1) return mgmt_cmd_status(sk, MGMT_INDEX_NONE, MGMT_OP_SET_EXP_FEATURE, MGMT_STATUS_INVALID_PARAMS); /* Only boolean on/off is supported */ if (cp->param[0] != 0x00 && cp->param[0] != 0x01) return mgmt_cmd_status(sk, MGMT_INDEX_NONE, MGMT_OP_SET_EXP_FEATURE, MGMT_STATUS_INVALID_PARAMS); val = cp->param[0] ? true : false; if (val) err = iso_init(); else err = iso_exit(); if (!err) changed = true; memcpy(rp.uuid, iso_socket_uuid, 16); rp.flags = cpu_to_le32(val ? 
BIT(0) : 0);

	hci_sock_set_flag(sk, HCI_MGMT_EXP_FEATURE_EVENTS);

	err = mgmt_cmd_complete(sk, MGMT_INDEX_NONE,
				MGMT_OP_SET_EXP_FEATURE, 0,
				&rp, sizeof(rp));

	/* Notify the other sockets only when the state really changed */
	if (changed)
		exp_feature_changed(hdev, iso_socket_uuid, val, sk);

	return err;
}
#endif

/* Dispatch table for MGMT_OP_SET_EXP_FEATURE: maps a feature UUID to the
 * function that sets it. set_exp_feature() walks the table until it reaches
 * the entry whose uuid is NULL.
 */
static const struct mgmt_exp_feature {
	const u8 *uuid;
	int (*set_func)(struct sock *sk, struct hci_dev *hdev,
			struct mgmt_cp_set_exp_feature *cp, u16 data_len);
} exp_features[] = {
	EXP_FEAT(ZERO_KEY, set_zero_key_func),
#ifdef CONFIG_BT_FEATURE_DEBUG
	EXP_FEAT(debug_uuid, set_debug_func),
#endif
	EXP_FEAT(mgmt_mesh_uuid, set_mgmt_mesh_func),
	EXP_FEAT(quality_report_uuid, set_quality_report_func),
	EXP_FEAT(offload_codecs_uuid, set_offload_codec_func),
	EXP_FEAT(le_simultaneous_roles_uuid, set_le_simultaneous_roles_func),
#ifdef CONFIG_BT_LE
	EXP_FEAT(iso_socket_uuid, set_iso_socket_func),
#endif

	/* end with a null feature */
	EXP_FEAT(NULL, NULL)
};

/* MGMT_OP_SET_EXP_FEATURE handler: hands the command to the table entry
 * whose UUID matches the first 16 bytes of the command; an unknown UUID is
 * answered with MGMT_STATUS_NOT_SUPPORTED. hdev may be NULL (non-controller
 * index).
 */
static int set_exp_feature(struct sock *sk, struct hci_dev *hdev,
			   void *data, u16 data_len)
{
	struct mgmt_cp_set_exp_feature *cp = data;
	size_t i = 0;

	bt_dev_dbg(hdev, "sock %p", sk);

	for (i = 0; exp_features[i].uuid; i++) {
		if (!memcmp(cp->uuid, exp_features[i].uuid, 16))
			return exp_features[i].set_func(sk, hdev, cp, data_len);
	}

	return mgmt_cmd_status(sk, hdev ?
hdev->id : MGMT_INDEX_NONE, MGMT_OP_SET_EXP_FEATURE, MGMT_STATUS_NOT_SUPPORTED); } static int get_device_flags(struct sock *sk, struct hci_dev *hdev, void *data, u16 data_len) { struct mgmt_cp_get_device_flags *cp = data; struct mgmt_rp_get_device_flags rp; struct bdaddr_list_with_flags *br_params; struct hci_conn_params *params; u32 supported_flags; u32 current_flags = 0; u8 status = MGMT_STATUS_INVALID_PARAMS; bt_dev_dbg(hdev, "Get device flags %pMR (type 0x%x)\n", &cp->addr.bdaddr, cp->addr.type); hci_dev_lock(hdev); supported_flags = hdev->conn_flags; memset(&rp, 0, sizeof(rp)); if (cp->addr.type == BDADDR_BREDR) { br_params = hci_bdaddr_list_lookup_with_flags(&hdev->accept_list, &cp->addr.bdaddr, cp->addr.type); if (!br_params) goto done; current_flags = br_params->flags; } else { params = hci_conn_params_lookup(hdev, &cp->addr.bdaddr, le_addr_type(cp->addr.type)); if (!params) goto done; current_flags = params->flags; } bacpy(&rp.addr.bdaddr, &cp->addr.bdaddr); rp.addr.type = cp->addr.type; rp.supported_flags = cpu_to_le32(supported_flags); rp.current_flags = cpu_to_le32(current_flags); status = MGMT_STATUS_SUCCESS; done: hci_dev_unlock(hdev); return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_GET_DEVICE_FLAGS, status, &rp, sizeof(rp)); } static void device_flags_changed(struct sock *sk, struct hci_dev *hdev, bdaddr_t *bdaddr, u8 bdaddr_type, u32 supported_flags, u32 current_flags) { struct mgmt_ev_device_flags_changed ev; bacpy(&ev.addr.bdaddr, bdaddr); ev.addr.type = bdaddr_type; ev.supported_flags = cpu_to_le32(supported_flags); ev.current_flags = cpu_to_le32(current_flags); mgmt_event(MGMT_EV_DEVICE_FLAGS_CHANGED, hdev, &ev, sizeof(ev), sk); } static int set_device_flags(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) { struct mgmt_cp_set_device_flags *cp = data; struct bdaddr_list_with_flags *br_params; struct hci_conn_params *params; u8 status = MGMT_STATUS_INVALID_PARAMS; u32 supported_flags; u32 current_flags = 
__le32_to_cpu(cp->current_flags); bt_dev_dbg(hdev, "Set device flags %pMR (type 0x%x) = 0x%x", &cp->addr.bdaddr, cp->addr.type, current_flags); // We should take hci_dev_lock() early, I think.. conn_flags can change supported_flags = hdev->conn_flags; if ((supported_flags | current_flags) != supported_flags) { bt_dev_warn(hdev, "Bad flag given (0x%x) vs supported (0x%0x)", current_flags, supported_flags); goto done; } hci_dev_lock(hdev); if (cp->addr.type == BDADDR_BREDR) { br_params = hci_bdaddr_list_lookup_with_flags(&hdev->accept_list, &cp->addr.bdaddr, cp->addr.type); if (br_params) { br_params->flags = current_flags; status = MGMT_STATUS_SUCCESS; } else { bt_dev_warn(hdev, "No such BR/EDR device %pMR (0x%x)", &cp->addr.bdaddr, cp->addr.type); } goto unlock; } params = hci_conn_params_lookup(hdev, &cp->addr.bdaddr, le_addr_type(cp->addr.type)); if (!params) { bt_dev_warn(hdev, "No such LE device %pMR (0x%x)", &cp->addr.bdaddr, le_addr_type(cp->addr.type)); goto unlock; } supported_flags = hdev->conn_flags; if ((supported_flags | current_flags) != supported_flags) { bt_dev_warn(hdev, "Bad flag given (0x%x) vs supported (0x%0x)", current_flags, supported_flags); goto unlock; } WRITE_ONCE(params->flags, current_flags); status = MGMT_STATUS_SUCCESS; /* Update passive scan if HCI_CONN_FLAG_DEVICE_PRIVACY * has been set. 
*/ if (params->flags & HCI_CONN_FLAG_DEVICE_PRIVACY) hci_update_passive_scan(hdev); unlock: hci_dev_unlock(hdev); done: if (status == MGMT_STATUS_SUCCESS) device_flags_changed(sk, hdev, &cp->addr.bdaddr, cp->addr.type, supported_flags, current_flags); return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_SET_DEVICE_FLAGS, status, &cp->addr, sizeof(cp->addr)); } static void mgmt_adv_monitor_added(struct sock *sk, struct hci_dev *hdev, u16 handle) { struct mgmt_ev_adv_monitor_added ev; ev.monitor_handle = cpu_to_le16(handle); mgmt_event(MGMT_EV_ADV_MONITOR_ADDED, hdev, &ev, sizeof(ev), sk); } void mgmt_adv_monitor_removed(struct hci_dev *hdev, u16 handle) { struct mgmt_ev_adv_monitor_removed ev; struct mgmt_pending_cmd *cmd; struct sock *sk_skip = NULL; struct mgmt_cp_remove_adv_monitor *cp; cmd = pending_find(MGMT_OP_REMOVE_ADV_MONITOR, hdev); if (cmd) { cp = cmd->param; if (cp->monitor_handle) sk_skip = cmd->sk; } ev.monitor_handle = cpu_to_le16(handle); mgmt_event(MGMT_EV_ADV_MONITOR_REMOVED, hdev, &ev, sizeof(ev), sk_skip); } static int read_adv_mon_features(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) { struct adv_monitor *monitor = NULL; struct mgmt_rp_read_adv_monitor_features *rp = NULL; int handle, err; size_t rp_size = 0; __u32 supported = 0; __u32 enabled = 0; __u16 num_handles = 0; __u16 handles[HCI_MAX_ADV_MONITOR_NUM_HANDLES]; BT_DBG("request for %s", hdev->name); hci_dev_lock(hdev); if (msft_monitor_supported(hdev)) supported |= MGMT_ADV_MONITOR_FEATURE_MASK_OR_PATTERNS; idr_for_each_entry(&hdev->adv_monitors_idr, monitor, handle) handles[num_handles++] = monitor->handle; hci_dev_unlock(hdev); rp_size = sizeof(*rp) + (num_handles * sizeof(u16)); rp = kmalloc(rp_size, GFP_KERNEL); if (!rp) return -ENOMEM; /* All supported features are currently enabled */ enabled = supported; rp->supported_features = cpu_to_le32(supported); rp->enabled_features = cpu_to_le32(enabled); rp->max_num_handles = cpu_to_le16(HCI_MAX_ADV_MONITOR_NUM_HANDLES); 
rp->max_num_patterns = HCI_MAX_ADV_MONITOR_NUM_PATTERNS; rp->num_handles = cpu_to_le16(num_handles); if (num_handles) memcpy(&rp->handles, &handles, (num_handles * sizeof(u16))); err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_READ_ADV_MONITOR_FEATURES, MGMT_STATUS_SUCCESS, rp, rp_size); kfree(rp); return err; } static void mgmt_add_adv_patterns_monitor_complete(struct hci_dev *hdev, void *data, int status) { struct mgmt_rp_add_adv_patterns_monitor rp; struct mgmt_pending_cmd *cmd = data; struct adv_monitor *monitor = cmd->user_data; hci_dev_lock(hdev); rp.monitor_handle = cpu_to_le16(monitor->handle); if (!status) { mgmt_adv_monitor_added(cmd->sk, hdev, monitor->handle); hdev->adv_monitors_cnt++; if (monitor->state == ADV_MONITOR_STATE_NOT_REGISTERED) monitor->state = ADV_MONITOR_STATE_REGISTERED; hci_update_passive_scan(hdev); } mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode, mgmt_status(status), &rp, sizeof(rp)); mgmt_pending_remove(cmd); hci_dev_unlock(hdev); bt_dev_dbg(hdev, "add monitor %d complete, status %d", rp.monitor_handle, status); } static int mgmt_add_adv_patterns_monitor_sync(struct hci_dev *hdev, void *data) { struct mgmt_pending_cmd *cmd = data; struct adv_monitor *monitor = cmd->user_data; return hci_add_adv_monitor(hdev, monitor); } static int __add_adv_patterns_monitor(struct sock *sk, struct hci_dev *hdev, struct adv_monitor *m, u8 status, void *data, u16 len, u16 op) { struct mgmt_pending_cmd *cmd; int err; hci_dev_lock(hdev); if (status) goto unlock; if (pending_find(MGMT_OP_SET_LE, hdev) || pending_find(MGMT_OP_ADD_ADV_PATTERNS_MONITOR, hdev) || pending_find(MGMT_OP_ADD_ADV_PATTERNS_MONITOR_RSSI, hdev) || pending_find(MGMT_OP_REMOVE_ADV_MONITOR, hdev)) { status = MGMT_STATUS_BUSY; goto unlock; } cmd = mgmt_pending_add(sk, op, hdev, data, len); if (!cmd) { status = MGMT_STATUS_NO_RESOURCES; goto unlock; } cmd->user_data = m; err = hci_cmd_sync_queue(hdev, mgmt_add_adv_patterns_monitor_sync, cmd, mgmt_add_adv_patterns_monitor_complete); if 
(err) { if (err == -ENOMEM) status = MGMT_STATUS_NO_RESOURCES; else status = MGMT_STATUS_FAILED; goto unlock; } hci_dev_unlock(hdev); return 0; unlock: hci_free_adv_monitor(hdev, m); hci_dev_unlock(hdev); return mgmt_cmd_status(sk, hdev->id, op, status); } static void parse_adv_monitor_rssi(struct adv_monitor *m, struct mgmt_adv_rssi_thresholds *rssi) { if (rssi) { m->rssi.low_threshold = rssi->low_threshold; m->rssi.low_threshold_timeout = __le16_to_cpu(rssi->low_threshold_timeout); m->rssi.high_threshold = rssi->high_threshold; m->rssi.high_threshold_timeout = __le16_to_cpu(rssi->high_threshold_timeout); m->rssi.sampling_period = rssi->sampling_period; } else { /* Default values. These numbers are the least constricting * parameters for MSFT API to work, so it behaves as if there * are no rssi parameter to consider. May need to be changed * if other API are to be supported. */ m->rssi.low_threshold = -127; m->rssi.low_threshold_timeout = 60; m->rssi.high_threshold = -127; m->rssi.high_threshold_timeout = 0; m->rssi.sampling_period = 0; } } static u8 parse_adv_monitor_pattern(struct adv_monitor *m, u8 pattern_count, struct mgmt_adv_pattern *patterns) { u8 offset = 0, length = 0; struct adv_pattern *p = NULL; int i; for (i = 0; i < pattern_count; i++) { offset = patterns[i].offset; length = patterns[i].length; if (offset >= HCI_MAX_EXT_AD_LENGTH || length > HCI_MAX_EXT_AD_LENGTH || (offset + length) > HCI_MAX_EXT_AD_LENGTH) return MGMT_STATUS_INVALID_PARAMS; p = kmalloc(sizeof(*p), GFP_KERNEL); if (!p) return MGMT_STATUS_NO_RESOURCES; p->ad_type = patterns[i].ad_type; p->offset = patterns[i].offset; p->length = patterns[i].length; memcpy(p->value, patterns[i].value, p->length); INIT_LIST_HEAD(&p->list); list_add(&p->list, &m->patterns); } return MGMT_STATUS_SUCCESS; } static int add_adv_patterns_monitor(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) { struct mgmt_cp_add_adv_patterns_monitor *cp = data; struct adv_monitor *m = NULL; u8 status = 
MGMT_STATUS_SUCCESS;
	size_t expected_size = sizeof(*cp);

	BT_DBG("request for %s", hdev->name);

	/* The request must carry more than the fixed header */
	if (len <= sizeof(*cp)) {
		status = MGMT_STATUS_INVALID_PARAMS;
		goto done;
	}

	/* ... and exactly pattern_count patterns after it */
	expected_size += cp->pattern_count * sizeof(struct mgmt_adv_pattern);
	if (len != expected_size) {
		status = MGMT_STATUS_INVALID_PARAMS;
		goto done;
	}

	m = kzalloc(sizeof(*m), GFP_KERNEL);
	if (!m) {
		status = MGMT_STATUS_NO_RESOURCES;
		goto done;
	}

	INIT_LIST_HEAD(&m->patterns);

	/* No thresholds in this command variant: use the defaults */
	parse_adv_monitor_rssi(m, NULL);
	status = parse_adv_monitor_pattern(m, cp->pattern_count, cp->patterns);

done:
	/* Success and failure both funnel through the common tail, which
	 * receives the status computed above.
	 */
	return __add_adv_patterns_monitor(sk, hdev, m, status, data, len,
					  MGMT_OP_ADD_ADV_PATTERNS_MONITOR);
}

/* Handler for MGMT_OP_ADD_ADV_PATTERNS_MONITOR_RSSI.
 *
 * Validates the request length against pattern_count, allocates a monitor,
 * parses the RSSI thresholds and patterns from the request and passes the
 * result (monitor plus parse status) to __add_adv_patterns_monitor().
 */
static int add_adv_patterns_monitor_rssi(struct sock *sk, struct hci_dev *hdev,
					 void *data, u16 len)
{
	struct mgmt_cp_add_adv_patterns_monitor_rssi *cp = data;
	struct adv_monitor *m = NULL;
	u8 status = MGMT_STATUS_SUCCESS;
	size_t expected_size = sizeof(*cp);

	BT_DBG("request for %s", hdev->name);

	if (len <= sizeof(*cp)) {
		status = MGMT_STATUS_INVALID_PARAMS;
		goto done;
	}

	expected_size += cp->pattern_count * sizeof(struct mgmt_adv_pattern);
	if (len != expected_size) {
		status = MGMT_STATUS_INVALID_PARAMS;
		goto done;
	}

	m = kzalloc(sizeof(*m), GFP_KERNEL);
	if (!m) {
		status = MGMT_STATUS_NO_RESOURCES;
		goto done;
	}

	INIT_LIST_HEAD(&m->patterns);

	parse_adv_monitor_rssi(m, &cp->rssi);
	status = parse_adv_monitor_pattern(m, cp->pattern_count, cp->patterns);

done:
	return __add_adv_patterns_monitor(sk, hdev, m, status, data, len,
					 MGMT_OP_ADD_ADV_PATTERNS_MONITOR_RSSI);
}

/* Completion callback for a queued Remove Adv Monitor request. */
static void mgmt_remove_adv_monitor_complete(struct hci_dev *hdev,
					     void *data, int status)
{
	struct mgmt_rp_remove_adv_monitor rp;
	struct mgmt_pending_cmd *cmd = data;
	struct mgmt_cp_remove_adv_monitor *cp;

	/* Do not touch cmd if the work was cancelled or cmd is no longer
	 * the pending remove command.
	 */
	if (status == -ECANCELED ||
	    cmd != pending_find(MGMT_OP_REMOVE_ADV_MONITOR, hdev))
		return;

	hci_dev_lock(hdev);

	cp = cmd->param;

	/* Echo the handle exactly as it was received (no byte swap) */
	rp.monitor_handle = cp->monitor_handle;

	if (!status)
		hci_update_passive_scan(hdev);

	mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode,
mgmt_status(status), &rp, sizeof(rp));
	mgmt_pending_remove(cmd);

	hci_dev_unlock(hdev);
	bt_dev_dbg(hdev, "remove monitor %d complete, status %d",
		   rp.monitor_handle, status);
}

/* Sync-queue work item for Remove Adv Monitor.
 *
 * Returns -ECANCELED when @data is no longer the pending remove command.
 * A handle of 0 removes all monitors, any other value removes that single
 * monitor.
 */
static int mgmt_remove_adv_monitor_sync(struct hci_dev *hdev, void *data)
{
	struct mgmt_pending_cmd *cmd = data;

	/* Validate cmd before dereferencing it below */
	if (cmd != pending_find(MGMT_OP_REMOVE_ADV_MONITOR, hdev))
		return -ECANCELED;

	struct mgmt_cp_remove_adv_monitor *cp = cmd->param;
	u16 handle = __le16_to_cpu(cp->monitor_handle);

	if (!handle)
		return hci_remove_all_adv_monitor(hdev);

	return hci_remove_single_adv_monitor(hdev, handle);
}

/* Handler for MGMT_OP_REMOVE_ADV_MONITOR.
 *
 * Rejects the request with BUSY while a Set LE or another monitor
 * add/remove command is pending, otherwise registers a pending command and
 * submits the removal work. Returns 0 once submitted; on failure returns
 * the result of sending the error status.
 */
static int remove_adv_monitor(struct sock *sk, struct hci_dev *hdev,
			      void *data, u16 len)
{
	struct mgmt_pending_cmd *cmd;
	int err, status;

	hci_dev_lock(hdev);

	if (pending_find(MGMT_OP_SET_LE, hdev) ||
	    pending_find(MGMT_OP_REMOVE_ADV_MONITOR, hdev) ||
	    pending_find(MGMT_OP_ADD_ADV_PATTERNS_MONITOR, hdev) ||
	    pending_find(MGMT_OP_ADD_ADV_PATTERNS_MONITOR_RSSI, hdev)) {
		status = MGMT_STATUS_BUSY;
		goto unlock;
	}

	cmd = mgmt_pending_add(sk, MGMT_OP_REMOVE_ADV_MONITOR, hdev, data, len);
	if (!cmd) {
		status = MGMT_STATUS_NO_RESOURCES;
		goto unlock;
	}

	err = hci_cmd_sync_submit(hdev, mgmt_remove_adv_monitor_sync, cmd,
				  mgmt_remove_adv_monitor_complete);

	if (err) {
		/* Submission failed: drop the pending entry ourselves */
		mgmt_pending_remove(cmd);

		if (err == -ENOMEM)
			status = MGMT_STATUS_NO_RESOURCES;
		else
			status = MGMT_STATUS_FAILED;

		goto unlock;
	}

	hci_dev_unlock(hdev);

	return 0;

unlock:
	hci_dev_unlock(hdev);
	return mgmt_cmd_status(sk, hdev->id, MGMT_OP_REMOVE_ADV_MONITOR,
			       status);
}

/* Completion callback for Read Local OOB Data: turns the HCI reply stored
 * in cmd->skb into the management reply.
 */
static void read_local_oob_data_complete(struct hci_dev *hdev, void *data, int err)
{
	struct mgmt_rp_read_local_oob_data mgmt_rp;
	size_t rp_size = sizeof(mgmt_rp);
	struct mgmt_pending_cmd *cmd = data;
	struct sk_buff *skb = cmd->skb;
	u8 status = mgmt_status(err);

	/* With no error from the queue, derive the status from the skb:
	 * missing, error pointer, or the first byte of the reply.
	 */
	if (!status) {
		if (!skb)
			status = MGMT_STATUS_FAILED;
		else if (IS_ERR(skb))
			status = mgmt_status(PTR_ERR(skb));
		else
			status = mgmt_status(skb->data[0]);
	}

	bt_dev_dbg(hdev, "status %d", status);

	if (status) {
		mgmt_cmd_status(cmd->sk,
hdev->id, MGMT_OP_READ_LOCAL_OOB_DATA, status);
		goto remove;
	}

	memset(&mgmt_rp, 0, sizeof(mgmt_rp));

	if (!bredr_sc_enabled(hdev)) {
		/* Legacy reply: only the 192-bit hash/rand pair is present */
		struct hci_rp_read_local_oob_data *rp = (void *) skb->data;

		if (skb->len < sizeof(*rp)) {
			mgmt_cmd_status(cmd->sk, hdev->id,
					MGMT_OP_READ_LOCAL_OOB_DATA,
					MGMT_STATUS_FAILED);
			goto remove;
		}

		memcpy(mgmt_rp.hash192, rp->hash, sizeof(rp->hash));
		memcpy(mgmt_rp.rand192, rp->rand, sizeof(rp->rand));

		/* Shorten the reply by the unused 256-bit fields */
		rp_size -= sizeof(mgmt_rp.hash256) + sizeof(mgmt_rp.rand256);
	} else {
		/* Extended reply: both 192-bit and 256-bit values */
		struct hci_rp_read_local_oob_ext_data *rp = (void *) skb->data;

		if (skb->len < sizeof(*rp)) {
			mgmt_cmd_status(cmd->sk, hdev->id,
					MGMT_OP_READ_LOCAL_OOB_DATA,
					MGMT_STATUS_FAILED);
			goto remove;
		}

		memcpy(mgmt_rp.hash192, rp->hash192, sizeof(rp->hash192));
		memcpy(mgmt_rp.rand192, rp->rand192, sizeof(rp->rand192));

		memcpy(mgmt_rp.hash256, rp->hash256, sizeof(rp->hash256));
		memcpy(mgmt_rp.rand256, rp->rand256, sizeof(rp->rand256));
	}

	mgmt_cmd_complete(cmd->sk, hdev->id, MGMT_OP_READ_LOCAL_OOB_DATA,
			  MGMT_STATUS_SUCCESS, &mgmt_rp, rp_size);

remove:
	/* Only a real skb (neither NULL nor an error pointer) is freed */
	if (skb && !IS_ERR(skb))
		kfree_skb(skb);

	mgmt_pending_free(cmd);
}

/* Sync-queue work item: read the local OOB data (extended variant when
 * bredr_sc_enabled()) and stash the result in cmd->skb for the completion
 * callback. Returns 0, or the error encoded in the returned pointer.
 */
static int read_local_oob_data_sync(struct hci_dev *hdev, void *data)
{
	struct mgmt_pending_cmd *cmd = data;

	if (bredr_sc_enabled(hdev))
		cmd->skb = hci_read_local_oob_data_sync(hdev, true, cmd->sk);
	else
		cmd->skb = hci_read_local_oob_data_sync(hdev, false, cmd->sk);

	if (IS_ERR(cmd->skb))
		return PTR_ERR(cmd->skb);
	else
		return 0;
}

/* Handler for MGMT_OP_READ_LOCAL_OOB_DATA.
 *
 * Replies NOT_POWERED / NOT_SUPPORTED when the controller is powered off
 * or is not SSP capable; otherwise queues read_local_oob_data_sync().
 */
static int read_local_oob_data(struct sock *sk, struct hci_dev *hdev,
			       void *data, u16 data_len)
{
	struct mgmt_pending_cmd *cmd;
	int err;

	bt_dev_dbg(hdev, "sock %p", sk);

	hci_dev_lock(hdev);

	if (!hdev_is_powered(hdev)) {
		err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_READ_LOCAL_OOB_DATA,
				      MGMT_STATUS_NOT_POWERED);
		goto unlock;
	}

	if (!lmp_ssp_capable(hdev)) {
		err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_READ_LOCAL_OOB_DATA,
				      MGMT_STATUS_NOT_SUPPORTED);
		goto unlock;
	}

	cmd = mgmt_pending_new(sk, MGMT_OP_READ_LOCAL_OOB_DATA, hdev, NULL, 0);
	if (!cmd)
		err =
-ENOMEM;
	else
		err = hci_cmd_sync_queue(hdev, read_local_oob_data_sync, cmd,
					 read_local_oob_data_complete);

	if (err < 0) {
		/* Allocation or queueing failed: report FAILED and release
		 * the command if one was allocated.
		 */
		err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_READ_LOCAL_OOB_DATA,
				      MGMT_STATUS_FAILED);

		if (cmd)
			mgmt_pending_free(cmd);
	}

unlock:
	hci_dev_unlock(hdev);
	return err;
}

/* Handler for MGMT_OP_ADD_REMOTE_OOB_DATA.
 *
 * The request length selects the format: the legacy size carries one
 * hash/rand pair and is accepted for BR/EDR addresses only; the extended
 * size carries P-192 and P-256 pairs. Any other length is rejected with
 * INVALID_PARAMS.
 */
static int add_remote_oob_data(struct sock *sk, struct hci_dev *hdev,
			       void *data, u16 len)
{
	struct mgmt_addr_info *addr = data;
	int err;

	bt_dev_dbg(hdev, "sock %p", sk);

	if (!bdaddr_type_is_valid(addr->type))
		return mgmt_cmd_complete(sk, hdev->id,
					 MGMT_OP_ADD_REMOTE_OOB_DATA,
					 MGMT_STATUS_INVALID_PARAMS,
					 addr, sizeof(*addr));

	hci_dev_lock(hdev);

	if (len == MGMT_ADD_REMOTE_OOB_DATA_SIZE) {
		struct mgmt_cp_add_remote_oob_data *cp = data;
		u8 status;

		if (cp->addr.type != BDADDR_BREDR) {
			err = mgmt_cmd_complete(sk, hdev->id,
						MGMT_OP_ADD_REMOTE_OOB_DATA,
						MGMT_STATUS_INVALID_PARAMS,
						&cp->addr, sizeof(cp->addr));
			goto unlock;
		}

		err = hci_add_remote_oob_data(hdev, &cp->addr.bdaddr,
					      cp->addr.type, cp->hash,
					      cp->rand, NULL, NULL);
		if (err < 0)
			status = MGMT_STATUS_FAILED;
		else
			status = MGMT_STATUS_SUCCESS;

		err = mgmt_cmd_complete(sk, hdev->id,
					MGMT_OP_ADD_REMOTE_OOB_DATA, status,
					&cp->addr, sizeof(cp->addr));
	} else if (len == MGMT_ADD_REMOTE_OOB_EXT_DATA_SIZE) {
		struct mgmt_cp_add_remote_oob_ext_data *cp = data;
		u8 *rand192, *hash192, *rand256, *hash256;
		u8 status;

		if (bdaddr_type_is_le(cp->addr.type)) {
			/* Enforce zero-valued 192-bit parameters as
			 * long as legacy SMP OOB isn't implemented.
			 */
			if (memcmp(cp->rand192, ZERO_KEY, 16) ||
			    memcmp(cp->hash192, ZERO_KEY, 16)) {
				err = mgmt_cmd_complete(sk, hdev->id,
							MGMT_OP_ADD_REMOTE_OOB_DATA,
							MGMT_STATUS_INVALID_PARAMS,
							addr, sizeof(*addr));
				goto unlock;
			}

			rand192 = NULL;
			hash192 = NULL;
		} else {
			/* In case one of the P-192 values is set to zero,
			 * then just disable OOB data for P-192.
			 */
			if (!memcmp(cp->rand192, ZERO_KEY, 16) ||
			    !memcmp(cp->hash192, ZERO_KEY, 16)) {
				rand192 = NULL;
				hash192 = NULL;
			} else {
				rand192 = cp->rand192;
				hash192 = cp->hash192;
			}
		}

		/* In case one of the P-256 values is set to zero, then just
		 * disable OOB data for P-256.
		 */
		if (!memcmp(cp->rand256, ZERO_KEY, 16) ||
		    !memcmp(cp->hash256, ZERO_KEY, 16)) {
			rand256 = NULL;
			hash256 = NULL;
		} else {
			rand256 = cp->rand256;
			hash256 = cp->hash256;
		}

		err = hci_add_remote_oob_data(hdev, &cp->addr.bdaddr,
					      cp->addr.type, hash192, rand192,
					      hash256, rand256);
		if (err < 0)
			status = MGMT_STATUS_FAILED;
		else
			status = MGMT_STATUS_SUCCESS;

		err = mgmt_cmd_complete(sk, hdev->id,
					MGMT_OP_ADD_REMOTE_OOB_DATA,
					status, &cp->addr, sizeof(cp->addr));
	} else {
		bt_dev_err(hdev, "add_remote_oob_data: invalid len of %u bytes",
			   len);
		err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_REMOTE_OOB_DATA,
				      MGMT_STATUS_INVALID_PARAMS);
	}

unlock:
	hci_dev_unlock(hdev);
	return err;
}

/* Handler for MGMT_OP_REMOVE_REMOTE_OOB_DATA.
 *
 * Only BR/EDR addresses are accepted. BDADDR_ANY clears all stored remote
 * OOB data; any other address removes that single entry, and a failed
 * removal is reported as INVALID_PARAMS.
 */
static int remove_remote_oob_data(struct sock *sk, struct hci_dev *hdev,
				  void *data, u16 len)
{
	struct mgmt_cp_remove_remote_oob_data *cp = data;
	u8 status;
	int err;

	bt_dev_dbg(hdev, "sock %p", sk);

	if (cp->addr.type != BDADDR_BREDR)
		return mgmt_cmd_complete(sk, hdev->id,
					 MGMT_OP_REMOVE_REMOTE_OOB_DATA,
					 MGMT_STATUS_INVALID_PARAMS,
					 &cp->addr, sizeof(cp->addr));

	hci_dev_lock(hdev);

	if (!bacmp(&cp->addr.bdaddr, BDADDR_ANY)) {
		hci_remote_oob_data_clear(hdev);
		status = MGMT_STATUS_SUCCESS;
		goto done;
	}

	err = hci_remove_remote_oob_data(hdev, &cp->addr.bdaddr, cp->addr.type);
	if (err < 0)
		status = MGMT_STATUS_INVALID_PARAMS;
	else
		status = MGMT_STATUS_SUCCESS;

done:
	err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_REMOVE_REMOTE_OOB_DATA,
				status, &cp->addr, sizeof(cp->addr));

	hci_dev_unlock(hdev);
	return err;
}

/* Check that discovery @type is known and supported by @hdev.
 *
 * Returns true when usable. Otherwise returns false with *mgmt_status set
 * to the status to report. LE requires LE support, BR/EDR requires BR/EDR
 * support, interleaved requires both.
 */
static bool discovery_type_is_valid(struct hci_dev *hdev, uint8_t type,
				    uint8_t *mgmt_status)
{
	switch (type) {
	case DISCOV_TYPE_LE:
		*mgmt_status = mgmt_le_support(hdev);
		if (*mgmt_status)
			return false;
		break;
	case DISCOV_TYPE_INTERLEAVED:
*mgmt_status = mgmt_le_support(hdev);
		if (*mgmt_status)
			return false;
		fallthrough;
	case DISCOV_TYPE_BREDR:
		*mgmt_status = mgmt_bredr_support(hdev);
		if (*mgmt_status)
			return false;
		break;
	default:
		*mgmt_status = MGMT_STATUS_INVALID_PARAMS;
		return false;
	}

	return true;
}

/* Completion callback shared by the three start-discovery commands.
 *
 * Ignores cancelled work and commands that are no longer pending under any
 * of the start-discovery opcodes. Otherwise replies with the first byte of
 * the request parameters, drops the pending command and moves discovery to
 * FINDING on success or STOPPED on error.
 */
static void start_discovery_complete(struct hci_dev *hdev, void *data, int err)
{
	struct mgmt_pending_cmd *cmd = data;

	bt_dev_dbg(hdev, "err %d", err);

	if (err == -ECANCELED)
		return;

	if (cmd != pending_find(MGMT_OP_START_DISCOVERY, hdev) &&
	    cmd != pending_find(MGMT_OP_START_LIMITED_DISCOVERY, hdev) &&
	    cmd != pending_find(MGMT_OP_START_SERVICE_DISCOVERY, hdev))
		return;

	mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode, mgmt_status(err),
			  cmd->param, 1);
	mgmt_pending_remove(cmd);

	hci_discovery_set_state(hdev, err ? DISCOVERY_STOPPED:
				DISCOVERY_FINDING);
}

/* Sync-queue work item: start discovery (the pending command is unused) */
static int start_discovery_sync(struct hci_dev *hdev, void *data)
{
	return hci_start_discovery_sync(hdev);
}

/* Shared implementation of Start Discovery and Start Limited Discovery.
 *
 * @op selects the opcode used in replies and whether the "limited" flag is
 * set. Precondition failures are answered directly with the request's type
 * byte; otherwise the discovery parameters are reset, the work is queued
 * and the state becomes STARTING.
 */
static int start_discovery_internal(struct sock *sk, struct hci_dev *hdev,
				    u16 op, void *data, u16 len)
{
	struct mgmt_cp_start_discovery *cp = data;
	struct mgmt_pending_cmd *cmd;
	u8 status;
	int err;

	bt_dev_dbg(hdev, "sock %p", sk);

	hci_dev_lock(hdev);

	if (!hdev_is_powered(hdev)) {
		err = mgmt_cmd_complete(sk, hdev->id, op,
					MGMT_STATUS_NOT_POWERED,
					&cp->type, sizeof(cp->type));
		goto failed;
	}

	if (hdev->discovery.state != DISCOVERY_STOPPED ||
	    hci_dev_test_flag(hdev, HCI_PERIODIC_INQ)) {
		err = mgmt_cmd_complete(sk, hdev->id, op, MGMT_STATUS_BUSY,
					&cp->type, sizeof(cp->type));
		goto failed;
	}

	if (!discovery_type_is_valid(hdev, cp->type, &status)) {
		err = mgmt_cmd_complete(sk, hdev->id, op, status,
					&cp->type, sizeof(cp->type));
		goto failed;
	}

	/* Can't start discovery when it is paused */
	if (hdev->discovery_paused) {
		err = mgmt_cmd_complete(sk, hdev->id, op, MGMT_STATUS_BUSY,
					&cp->type, sizeof(cp->type));
		goto failed;
	}

	/* Clear the discovery filter first to free any previously
	 * allocated memory for the UUID list.
	 */
	hci_discovery_filter_clear(hdev);

	hdev->discovery.type = cp->type;
	hdev->discovery.report_invalid_rssi = false;
	if (op == MGMT_OP_START_LIMITED_DISCOVERY)
		hdev->discovery.limited = true;
	else
		hdev->discovery.limited = false;

	cmd = mgmt_pending_add(sk, op, hdev, data, len);
	if (!cmd) {
		err = -ENOMEM;
		goto failed;
	}

	err = hci_cmd_sync_queue(hdev, start_discovery_sync, cmd,
				 start_discovery_complete);
	if (err < 0) {
		mgmt_pending_remove(cmd);
		goto failed;
	}

	hci_discovery_set_state(hdev, DISCOVERY_STARTING);

failed:
	/* Also reached on success; err then holds the queueing result */
	hci_dev_unlock(hdev);
	return err;
}

/* Handler for MGMT_OP_START_DISCOVERY */
static int start_discovery(struct sock *sk, struct hci_dev *hdev,
			   void *data, u16 len)
{
	return start_discovery_internal(sk, hdev, MGMT_OP_START_DISCOVERY,
					data, len);
}

/* Handler for MGMT_OP_START_LIMITED_DISCOVERY */
static int start_limited_discovery(struct sock *sk, struct hci_dev *hdev,
				   void *data, u16 len)
{
	return start_discovery_internal(sk, hdev,
					MGMT_OP_START_LIMITED_DISCOVERY,
					data, len);
}

/* Handler for MGMT_OP_START_SERVICE_DISCOVERY: discovery with result
 * filtering by RSSI and an optional list of 16-byte UUIDs.
 */
static int start_service_discovery(struct sock *sk, struct hci_dev *hdev,
				   void *data, u16 len)
{
	struct mgmt_cp_start_service_discovery *cp = data;
	struct mgmt_pending_cmd *cmd;
	/* Largest UUID count whose total request size still fits in a u16 */
	const u16 max_uuid_count = ((U16_MAX - sizeof(*cp)) / 16);
	u16 uuid_count, expected_len;
	u8 status;
	int err;

	bt_dev_dbg(hdev, "sock %p", sk);

	hci_dev_lock(hdev);

	if (!hdev_is_powered(hdev)) {
		err = mgmt_cmd_complete(sk, hdev->id,
					MGMT_OP_START_SERVICE_DISCOVERY,
					MGMT_STATUS_NOT_POWERED,
					&cp->type, sizeof(cp->type));
		goto failed;
	}

	if (hdev->discovery.state != DISCOVERY_STOPPED ||
	    hci_dev_test_flag(hdev, HCI_PERIODIC_INQ)) {
		err = mgmt_cmd_complete(sk, hdev->id,
					MGMT_OP_START_SERVICE_DISCOVERY,
					MGMT_STATUS_BUSY, &cp->type,
					sizeof(cp->type));
		goto failed;
	}

	if (hdev->discovery_paused) {
		err = mgmt_cmd_complete(sk, hdev->id,
					MGMT_OP_START_SERVICE_DISCOVERY,
					MGMT_STATUS_BUSY, &cp->type,
					sizeof(cp->type));
		goto failed;
	}

	uuid_count = __le16_to_cpu(cp->uuid_count);
	if (uuid_count > max_uuid_count) {
		bt_dev_err(hdev, "service_discovery: too big uuid_count value %u",
			   uuid_count);
		err =
mgmt_cmd_complete(sk, hdev->id,
					MGMT_OP_START_SERVICE_DISCOVERY,
					MGMT_STATUS_INVALID_PARAMS, &cp->type,
					sizeof(cp->type));
		goto failed;
	}

	/* The request must be exactly the header plus uuid_count UUIDs */
	expected_len = sizeof(*cp) + uuid_count * 16;
	if (expected_len != len) {
		bt_dev_err(hdev, "service_discovery: expected %u bytes, got %u bytes",
			   expected_len, len);
		err = mgmt_cmd_complete(sk, hdev->id,
					MGMT_OP_START_SERVICE_DISCOVERY,
					MGMT_STATUS_INVALID_PARAMS, &cp->type,
					sizeof(cp->type));
		goto failed;
	}

	if (!discovery_type_is_valid(hdev, cp->type, &status)) {
		err = mgmt_cmd_complete(sk, hdev->id,
					MGMT_OP_START_SERVICE_DISCOVERY,
					status, &cp->type, sizeof(cp->type));
		goto failed;
	}

	cmd = mgmt_pending_add(sk, MGMT_OP_START_SERVICE_DISCOVERY,
			       hdev, data, len);
	if (!cmd) {
		err = -ENOMEM;
		goto failed;
	}

	/* Clear the discovery filter first to free any previously
	 * allocated memory for the UUID list.
	 */
	hci_discovery_filter_clear(hdev);

	hdev->discovery.result_filtering = true;
	hdev->discovery.type = cp->type;
	hdev->discovery.rssi = cp->rssi;
	hdev->discovery.uuid_count = uuid_count;

	if (uuid_count > 0) {
		/* Keep a private copy of the UUID filter list */
		hdev->discovery.uuids = kmemdup(cp->uuids, uuid_count * 16,
						GFP_KERNEL);
		if (!hdev->discovery.uuids) {
			err = mgmt_cmd_complete(sk, hdev->id,
						MGMT_OP_START_SERVICE_DISCOVERY,
						MGMT_STATUS_FAILED,
						&cp->type, sizeof(cp->type));
			mgmt_pending_remove(cmd);
			goto failed;
		}
	}

	err = hci_cmd_sync_queue(hdev, start_discovery_sync, cmd,
				 start_discovery_complete);
	if (err < 0) {
		mgmt_pending_remove(cmd);
		goto failed;
	}

	hci_discovery_set_state(hdev, DISCOVERY_STARTING);

failed:
	/* Also reached on success; err then holds the queueing result */
	hci_dev_unlock(hdev);
	return err;
}

/* Completion callback for Stop Discovery.
 *
 * Ignores cancelled work and commands that are no longer the pending stop
 * command. Otherwise replies with the first byte of the request
 * parameters, drops the pending command and, on success only, marks
 * discovery STOPPED.
 */
static void stop_discovery_complete(struct hci_dev *hdev, void *data, int err)
{
	struct mgmt_pending_cmd *cmd = data;

	if (err == -ECANCELED ||
	    cmd != pending_find(MGMT_OP_STOP_DISCOVERY, hdev))
		return;

	bt_dev_dbg(hdev, "err %d", err);

	mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode, mgmt_status(err),
			  cmd->param, 1);
	mgmt_pending_remove(cmd);

	if (!err)
		hci_discovery_set_state(hdev, DISCOVERY_STOPPED);
}

static int
stop_discovery_sync(struct hci_dev *hdev, void *data)
{
	/* Sync-queue work item; the pending command is not needed here */
	return hci_stop_discovery_sync(hdev);
}

/* Handler for MGMT_OP_STOP_DISCOVERY.
 *
 * Replies REJECTED when no discovery is active and INVALID_PARAMS when the
 * requested type differs from the running one. Otherwise queues the stop
 * work and moves the discovery state to STOPPING.
 */
static int stop_discovery(struct sock *sk, struct hci_dev *hdev, void *data,
			  u16 len)
{
	struct mgmt_cp_stop_discovery *mgmt_cp = data;
	struct mgmt_pending_cmd *cmd;
	int err;

	bt_dev_dbg(hdev, "sock %p", sk);

	hci_dev_lock(hdev);

	if (!hci_discovery_active(hdev)) {
		err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_STOP_DISCOVERY,
					MGMT_STATUS_REJECTED, &mgmt_cp->type,
					sizeof(mgmt_cp->type));
		goto unlock;
	}

	if (hdev->discovery.type != mgmt_cp->type) {
		err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_STOP_DISCOVERY,
					MGMT_STATUS_INVALID_PARAMS,
					&mgmt_cp->type, sizeof(mgmt_cp->type));
		goto unlock;
	}

	cmd = mgmt_pending_add(sk, MGMT_OP_STOP_DISCOVERY, hdev, data, len);
	if (!cmd) {
		err = -ENOMEM;
		goto unlock;
	}

	err = hci_cmd_sync_queue(hdev, stop_discovery_sync, cmd,
				 stop_discovery_complete);
	if (err < 0) {
		mgmt_pending_remove(cmd);
		goto unlock;
	}

	hci_discovery_set_state(hdev, DISCOVERY_STOPPING);

unlock:
	hci_dev_unlock(hdev);
	return err;
}

/* Handler for MGMT_OP_CONFIRM_NAME.
 *
 * Looks the address up among inquiry cache entries whose name state is
 * unknown and records whether user space already knows the name: known
 * entries are unlinked from their list, the others are marked as needing
 * resolution. Fails when discovery is not active or no entry is found.
 */
static int confirm_name(struct sock *sk, struct hci_dev *hdev, void *data,
			u16 len)
{
	struct mgmt_cp_confirm_name *cp = data;
	struct inquiry_entry *e;
	int err;

	bt_dev_dbg(hdev, "sock %p", sk);

	hci_dev_lock(hdev);

	if (!hci_discovery_active(hdev)) {
		err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_CONFIRM_NAME,
					MGMT_STATUS_FAILED, &cp->addr,
					sizeof(cp->addr));
		goto failed;
	}

	e = hci_inquiry_cache_lookup_unknown(hdev, &cp->addr.bdaddr);
	if (!e) {
		err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_CONFIRM_NAME,
					MGMT_STATUS_INVALID_PARAMS, &cp->addr,
					sizeof(cp->addr));
		goto failed;
	}

	if (cp->name_known) {
		e->name_state = NAME_KNOWN;
		list_del(&e->list);
	} else {
		e->name_state = NAME_NEEDED;
		hci_inquiry_cache_update_resolve(hdev, e);
	}

	err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_CONFIRM_NAME, 0,
				&cp->addr, sizeof(cp->addr));

failed:
	/* Also reached on success */
	hci_dev_unlock(hdev);
	return err;
}

/* Handler for MGMT_OP_BLOCK_DEVICE */
static int block_device(struct sock *sk, struct hci_dev *hdev, void *data,
			u16 len)
{
	struct
mgmt_cp_block_device *cp = data;
	u8 status;
	int err;

	bt_dev_dbg(hdev, "sock %p", sk);

	if (!bdaddr_type_is_valid(cp->addr.type))
		return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_BLOCK_DEVICE,
					 MGMT_STATUS_INVALID_PARAMS,
					 &cp->addr, sizeof(cp->addr));

	hci_dev_lock(hdev);

	err = hci_bdaddr_list_add(&hdev->reject_list, &cp->addr.bdaddr,
				  cp->addr.type);
	if (err < 0) {
		status = MGMT_STATUS_FAILED;
		goto done;
	}

	/* Emit the event, passing the requesting socket as the last
	 * argument.
	 */
	mgmt_event(MGMT_EV_DEVICE_BLOCKED, hdev, &cp->addr, sizeof(cp->addr),
		   sk);
	status = MGMT_STATUS_SUCCESS;

done:
	err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_BLOCK_DEVICE, status,
				&cp->addr, sizeof(cp->addr));

	hci_dev_unlock(hdev);

	return err;
}

/* Handler for MGMT_OP_UNBLOCK_DEVICE.
 *
 * Removes the address from the reject list and emits
 * MGMT_EV_DEVICE_UNBLOCKED. A failed removal is reported as
 * INVALID_PARAMS.
 */
static int unblock_device(struct sock *sk, struct hci_dev *hdev, void *data,
			  u16 len)
{
	struct mgmt_cp_unblock_device *cp = data;
	u8 status;
	int err;

	bt_dev_dbg(hdev, "sock %p", sk);

	if (!bdaddr_type_is_valid(cp->addr.type))
		return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_UNBLOCK_DEVICE,
					 MGMT_STATUS_INVALID_PARAMS,
					 &cp->addr, sizeof(cp->addr));

	hci_dev_lock(hdev);

	err = hci_bdaddr_list_del(&hdev->reject_list, &cp->addr.bdaddr,
				  cp->addr.type);
	if (err < 0) {
		status = MGMT_STATUS_INVALID_PARAMS;
		goto done;
	}

	mgmt_event(MGMT_EV_DEVICE_UNBLOCKED, hdev, &cp->addr, sizeof(cp->addr),
		   sk);
	status = MGMT_STATUS_SUCCESS;

done:
	err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_UNBLOCK_DEVICE, status,
				&cp->addr, sizeof(cp->addr));

	hci_dev_unlock(hdev);

	return err;
}

/* Sync-queue work item: refresh the EIR data after a Device ID change */
static int set_device_id_sync(struct hci_dev *hdev, void *data)
{
	return hci_update_eir_sync(hdev);
}

/* Handler for MGMT_OP_SET_DEVICE_ID.
 *
 * Rejects source values above 0x0002, otherwise stores the Device ID
 * fields (converted from little endian) in hdev.
 */
static int set_device_id(struct sock *sk, struct hci_dev *hdev, void *data,
			 u16 len)
{
	struct mgmt_cp_set_device_id *cp = data;
	int err;
	__u16 source;

	bt_dev_dbg(hdev, "sock %p", sk);

	source = __le16_to_cpu(cp->source);

	if (source > 0x0002)
		return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_DEVICE_ID,
				       MGMT_STATUS_INVALID_PARAMS);

	hci_dev_lock(hdev);

	hdev->devid_source = source;
	hdev->devid_vendor = __le16_to_cpu(cp->vendor);
	hdev->devid_product = __le16_to_cpu(cp->product);
hdev->devid_version = __le16_to_cpu(cp->version);

	err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_SET_DEVICE_ID, 0,
				NULL, 0);

	/* The reply has already been sent; the result of queueing the EIR
	 * refresh is not checked.
	 */
	hci_cmd_sync_queue(hdev, set_device_id_sync, NULL, NULL);

	hci_dev_unlock(hdev);

	return err;
}

/* Log the outcome of re-scheduling an advertising instance */
static void enable_advertising_instance(struct hci_dev *hdev, int err)
{
	if (err)
		bt_dev_err(hdev, "failed to re-configure advertising %d", err);
	else
		bt_dev_dbg(hdev, "status %d", err);
}

/* Completion callback for Set Advertising.
 *
 * On error every pending Set Advertising command is answered with the
 * status. Otherwise HCI_ADVERTISING is made to mirror HCI_LE_ADV, the
 * pending commands are answered, new settings are announced and, if
 * advertising ended up disabled while instances exist, an instance is
 * scheduled again.
 */
static void set_advertising_complete(struct hci_dev *hdev, void *data, int err)
{
	struct cmd_lookup match = { NULL, hdev };
	u8 instance;
	struct adv_info *adv_instance;
	u8 status = mgmt_status(err);

	if (status) {
		mgmt_pending_foreach(MGMT_OP_SET_ADVERTISING, hdev,
				     cmd_status_rsp, &status);
		return;
	}

	if (hci_dev_test_flag(hdev, HCI_LE_ADV))
		hci_dev_set_flag(hdev, HCI_ADVERTISING);
	else
		hci_dev_clear_flag(hdev, HCI_ADVERTISING);

	mgmt_pending_foreach(MGMT_OP_SET_ADVERTISING, hdev, settings_rsp,
			     &match);

	new_settings(hdev, match.sk);

	if (match.sk)
		sock_put(match.sk);

	/* If "Set Advertising" was just disabled and instance advertising was
	 * set up earlier, then re-enable multi-instance advertising.
	 */
	if (hci_dev_test_flag(hdev, HCI_ADVERTISING) ||
	    list_empty(&hdev->adv_instances))
		return;

	/* Prefer the current instance, else fall back to the first one */
	instance = hdev->cur_adv_instance;
	if (!instance) {
		adv_instance = list_first_entry_or_null(&hdev->adv_instances,
							struct adv_info, list);
		if (!adv_instance)
			return;

		instance = adv_instance->instance;
	}

	err = hci_schedule_adv_instance_sync(hdev, instance, true);

	enable_advertising_instance(hdev, err);
}

/* Sync-queue work item for Set Advertising.
 *
 * A request value of 0x02 sets HCI_ADVERTISING_CONNECTABLE, any other
 * value clears it. Advertising for instance 0 is then started, or
 * advertising is disabled. Always returns 0; the results of the HCI
 * helpers are not propagated.
 */
static int set_adv_sync(struct hci_dev *hdev, void *data)
{
	struct mgmt_pending_cmd *cmd = data;
	struct mgmt_mode *cp = cmd->param;
	u8 val = !!cp->val;

	if (cp->val == 0x02)
		hci_dev_set_flag(hdev, HCI_ADVERTISING_CONNECTABLE);
	else
		hci_dev_clear_flag(hdev, HCI_ADVERTISING_CONNECTABLE);

	cancel_adv_timeout(hdev);

	if (val) {
		/* Switch to instance "0" for the Set Advertising setting.
		 * We cannot use update_[adv|scan_rsp]_data() here as the
		 * HCI_ADVERTISING flag is not yet set.
		 */
		hdev->cur_adv_instance = 0x00;

		if (ext_adv_capable(hdev)) {
			hci_start_ext_adv_sync(hdev, 0x00);
		} else {
			hci_update_adv_data_sync(hdev, 0x00);
			hci_update_scan_rsp_data_sync(hdev, 0x00);
			hci_enable_advertising_sync(hdev);
		}
	} else {
		hci_disable_advertising_sync(hdev);
	}

	return 0;
}

/* Handler for MGMT_OP_SET_ADVERTISING.
 *
 * Accepted values are 0x00, 0x01 and 0x02. Depending on controller state
 * the setting is either applied to the flags only and answered directly,
 * or queued as set_adv_sync().
 */
static int set_advertising(struct sock *sk, struct hci_dev *hdev, void *data,
			   u16 len)
{
	struct mgmt_mode *cp = data;
	struct mgmt_pending_cmd *cmd;
	u8 val, status;
	int err;

	bt_dev_dbg(hdev, "sock %p", sk);

	status = mgmt_le_support(hdev);
	if (status)
		return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_ADVERTISING,
				       status);

	if (cp->val != 0x00 && cp->val != 0x01 && cp->val != 0x02)
		return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_ADVERTISING,
				       MGMT_STATUS_INVALID_PARAMS);

	if (hdev->advertising_paused)
		return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_ADVERTISING,
				       MGMT_STATUS_BUSY);

	hci_dev_lock(hdev);

	val = !!cp->val;

	/* The following conditions are ones which mean that we should
	 * not do any HCI communication but directly send a mgmt
	 * response to user space (after toggling the flag if
	 * necessary).
	 */
	if (!hdev_is_powered(hdev) ||
	    (val == hci_dev_test_flag(hdev, HCI_ADVERTISING) &&
	     (cp->val == 0x02) == hci_dev_test_flag(hdev, HCI_ADVERTISING_CONNECTABLE)) ||
	    hci_dev_test_flag(hdev, HCI_MESH) ||
	    hci_conn_num(hdev, LE_LINK) > 0 ||
	    (hci_dev_test_flag(hdev, HCI_LE_SCAN) &&
	     hdev->le_scan_type == LE_SCAN_ACTIVE)) {
		bool changed;

		if (cp->val) {
			hdev->cur_adv_instance = 0x00;
			changed = !hci_dev_test_and_set_flag(hdev, HCI_ADVERTISING);
			if (cp->val == 0x02)
				hci_dev_set_flag(hdev, HCI_ADVERTISING_CONNECTABLE);
			else
				hci_dev_clear_flag(hdev, HCI_ADVERTISING_CONNECTABLE);
		} else {
			changed = hci_dev_test_and_clear_flag(hdev, HCI_ADVERTISING);
			hci_dev_clear_flag(hdev, HCI_ADVERTISING_CONNECTABLE);
		}

		err = send_settings_rsp(sk, MGMT_OP_SET_ADVERTISING, hdev);
		if (err < 0)
			goto unlock;

		/* Announce new settings only if HCI_ADVERTISING flipped */
		if (changed)
			err = new_settings(hdev, sk);

		goto unlock;
	}

	if (pending_find(MGMT_OP_SET_ADVERTISING, hdev) ||
	    pending_find(MGMT_OP_SET_LE, hdev)) {
		err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_ADVERTISING,
				      MGMT_STATUS_BUSY);
		goto unlock;
	}

	cmd = mgmt_pending_add(sk, MGMT_OP_SET_ADVERTISING, hdev, data, len);
	if (!cmd)
		err = -ENOMEM;
	else
		err = hci_cmd_sync_queue(hdev, set_adv_sync, cmd,
					 set_advertising_complete);

	if (err < 0 && cmd)
		mgmt_pending_remove(cmd);

unlock:
	hci_dev_unlock(hdev);
	return err;
}

/* Handler for MGMT_OP_SET_STATIC_ADDRESS.
 *
 * Requires an LE capable, powered-off controller. BDADDR_ANY is accepted
 * as is; any other address must differ from BDADDR_NONE and have its two
 * most significant bits set.
 */
static int set_static_address(struct sock *sk, struct hci_dev *hdev,
			      void *data, u16 len)
{
	struct mgmt_cp_set_static_address *cp = data;
	int err;

	bt_dev_dbg(hdev, "sock %p", sk);

	if (!lmp_le_capable(hdev))
		return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_STATIC_ADDRESS,
				       MGMT_STATUS_NOT_SUPPORTED);

	if (hdev_is_powered(hdev))
		return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_STATIC_ADDRESS,
				       MGMT_STATUS_REJECTED);

	if (bacmp(&cp->bdaddr, BDADDR_ANY)) {
		if (!bacmp(&cp->bdaddr, BDADDR_NONE))
			return mgmt_cmd_status(sk, hdev->id,
					       MGMT_OP_SET_STATIC_ADDRESS,
					       MGMT_STATUS_INVALID_PARAMS);

		/* Two most significant bits shall be set */
		if ((cp->bdaddr.b[5] & 0xc0) != 0xc0)
			return mgmt_cmd_status(sk, hdev->id,
MGMT_OP_SET_STATIC_ADDRESS,
					       MGMT_STATUS_INVALID_PARAMS);
	}

	hci_dev_lock(hdev);

	bacpy(&hdev->static_addr, &cp->bdaddr);

	err = send_settings_rsp(sk, MGMT_OP_SET_STATIC_ADDRESS, hdev);
	if (err < 0)
		goto unlock;

	err = new_settings(hdev, sk);

unlock:
	hci_dev_unlock(hdev);
	return err;
}

/* Handler for MGMT_OP_SET_SCAN_PARAMS.
 *
 * Interval and window must each lie in 0x0004..0x4000 and the window may
 * not exceed the interval. The values are stored in hdev and, if an LE
 * scan is running while discovery is stopped, passive scanning is updated.
 */
static int set_scan_params(struct sock *sk, struct hci_dev *hdev,
			   void *data, u16 len)
{
	struct mgmt_cp_set_scan_params *cp = data;
	__u16 interval, window;
	int err;

	bt_dev_dbg(hdev, "sock %p", sk);

	if (!lmp_le_capable(hdev))
		return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_SCAN_PARAMS,
				       MGMT_STATUS_NOT_SUPPORTED);

	interval = __le16_to_cpu(cp->interval);

	if (interval < 0x0004 || interval > 0x4000)
		return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_SCAN_PARAMS,
				       MGMT_STATUS_INVALID_PARAMS);

	window = __le16_to_cpu(cp->window);

	if (window < 0x0004 || window > 0x4000)
		return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_SCAN_PARAMS,
				       MGMT_STATUS_INVALID_PARAMS);

	if (window > interval)
		return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_SCAN_PARAMS,
				       MGMT_STATUS_INVALID_PARAMS);

	hci_dev_lock(hdev);

	hdev->le_scan_interval = interval;
	hdev->le_scan_window = window;

	err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_SET_SCAN_PARAMS, 0,
				NULL, 0);

	/* If background scan is running, restart it so new parameters are
	 * loaded.
	 */
	if (hci_dev_test_flag(hdev, HCI_LE_SCAN) &&
	    hdev->discovery.state == DISCOVERY_STOPPED)
		hci_update_passive_scan(hdev);

	hci_dev_unlock(hdev);

	return err;
}

/* Completion callback for Set Fast Connectable.
 *
 * On error the requester gets the translated status. On success the
 * HCI_FAST_CONNECTABLE flag is set or cleared to match the requested
 * value, then the settings reply and new-settings event are sent. The
 * pending command is freed in both cases.
 */
static void fast_connectable_complete(struct hci_dev *hdev, void *data, int err)
{
	struct mgmt_pending_cmd *cmd = data;

	bt_dev_dbg(hdev, "err %d", err);

	if (err) {
		mgmt_cmd_status(cmd->sk, hdev->id, MGMT_OP_SET_FAST_CONNECTABLE,
				mgmt_status(err));
	} else {
		struct mgmt_mode *cp = cmd->param;

		if (cp->val)
			hci_dev_set_flag(hdev, HCI_FAST_CONNECTABLE);
		else
			hci_dev_clear_flag(hdev, HCI_FAST_CONNECTABLE);

		send_settings_rsp(cmd->sk, MGMT_OP_SET_FAST_CONNECTABLE, hdev);
		new_settings(hdev, cmd->sk);
	}

	mgmt_pending_free(cmd);
}

/* Sync-queue work item: write the requested fast connectable value */
static int write_fast_connectable_sync(struct hci_dev *hdev, void *data)
{
	struct mgmt_pending_cmd *cmd = data;
	struct mgmt_mode *cp = cmd->param;

	return hci_write_fast_connectable_sync(hdev, cp->val);
}

/* Handler for MGMT_OP_SET_FAST_CONNECTABLE.
 *
 * Requires BR/EDR to be enabled and an HCI version of at least
 * BLUETOOTH_VER_1_2; the value must be 0x00 or 0x01. Requests that do not
 * change the flag are answered directly; when powered off only the flag is
 * toggled; otherwise the write is queued.
 */
static int set_fast_connectable(struct sock *sk, struct hci_dev *hdev,
				void *data, u16 len)
{
	struct mgmt_mode *cp = data;
	struct mgmt_pending_cmd *cmd;
	int err;

	bt_dev_dbg(hdev, "sock %p", sk);

	if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED) ||
	    hdev->hci_ver < BLUETOOTH_VER_1_2)
		return mgmt_cmd_status(sk, hdev->id,
				       MGMT_OP_SET_FAST_CONNECTABLE,
				       MGMT_STATUS_NOT_SUPPORTED);

	if (cp->val != 0x00 && cp->val != 0x01)
		return mgmt_cmd_status(sk, hdev->id,
				       MGMT_OP_SET_FAST_CONNECTABLE,
				       MGMT_STATUS_INVALID_PARAMS);

	hci_dev_lock(hdev);

	if (!!cp->val == hci_dev_test_flag(hdev, HCI_FAST_CONNECTABLE)) {
		err = send_settings_rsp(sk, MGMT_OP_SET_FAST_CONNECTABLE, hdev);
		goto unlock;
	}

	if (!hdev_is_powered(hdev)) {
		hci_dev_change_flag(hdev, HCI_FAST_CONNECTABLE);
		err = send_settings_rsp(sk, MGMT_OP_SET_FAST_CONNECTABLE, hdev);
		new_settings(hdev, sk);
		goto unlock;
	}

	cmd = mgmt_pending_new(sk, MGMT_OP_SET_FAST_CONNECTABLE, hdev, data,
			       len);
	if (!cmd)
		err = -ENOMEM;
	else
		err = hci_cmd_sync_queue(hdev, write_fast_connectable_sync, cmd,
					 fast_connectable_complete);

	if (err < 0) {
		mgmt_cmd_status(sk, hdev->id,
MGMT_OP_SET_FAST_CONNECTABLE,
				MGMT_STATUS_FAILED);

		if (cmd)
			mgmt_pending_free(cmd);
	}

unlock:
	hci_dev_unlock(hdev);

	return err;
}

/* Completion callback for Set BR/EDR.
 *
 * On error HCI_BREDR_ENABLED is cleared again and the requester gets the
 * translated status; on success the settings reply and new-settings event
 * are sent. The pending command is freed in both cases.
 */
static void set_bredr_complete(struct hci_dev *hdev, void *data, int err)
{
	struct mgmt_pending_cmd *cmd = data;

	bt_dev_dbg(hdev, "err %d", err);

	if (err) {
		u8 mgmt_err = mgmt_status(err);

		/* We need to restore the flag if related HCI commands
		 * failed.
		 */
		hci_dev_clear_flag(hdev, HCI_BREDR_ENABLED);

		mgmt_cmd_status(cmd->sk, cmd->index, cmd->opcode, mgmt_err);
	} else {
		send_settings_rsp(cmd->sk, MGMT_OP_SET_BREDR, hdev);
		new_settings(hdev, cmd->sk);
	}

	mgmt_pending_free(cmd);
}

/* Sync-queue work item for Set BR/EDR: disable fast connectable, update
 * the scan state, then update the advertising data of the current
 * instance. Stops at, and returns, the first non-zero status.
 */
static int set_bredr_sync(struct hci_dev *hdev, void *data)
{
	int status;

	status = hci_write_fast_connectable_sync(hdev, false);

	if (!status)
		status = hci_update_scan_sync(hdev);

	/* Since only the advertising data flags will change, there
	 * is no need to update the scan response data.
	 */
	if (!status)
		status = hci_update_adv_data_sync(hdev, hdev->cur_adv_instance);

	return status;
}

/* Handler for MGMT_OP_SET_BREDR.
 *
 * Requires a controller capable of both BR/EDR and LE with LE enabled; the
 * value must be 0x00 or 0x01.
 */
static int set_bredr(struct sock *sk, struct hci_dev *hdev, void *data, u16 len)
{
	struct mgmt_mode *cp = data;
	struct mgmt_pending_cmd *cmd;
	int err;

	bt_dev_dbg(hdev, "sock %p", sk);

	if (!lmp_bredr_capable(hdev) || !lmp_le_capable(hdev))
		return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_BREDR,
				       MGMT_STATUS_NOT_SUPPORTED);

	if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED))
		return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_BREDR,
				       MGMT_STATUS_REJECTED);

	if (cp->val != 0x00 && cp->val != 0x01)
		return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_BREDR,
				       MGMT_STATUS_INVALID_PARAMS);

	hci_dev_lock(hdev);

	/* Nothing to change: just confirm the current settings */
	if (cp->val == hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) {
		err = send_settings_rsp(sk, MGMT_OP_SET_BREDR, hdev);
		goto unlock;
	}

	if (!hdev_is_powered(hdev)) {
		/* Disabling BR/EDR also clears these four settings */
		if (!cp->val) {
			hci_dev_clear_flag(hdev, HCI_DISCOVERABLE);
			hci_dev_clear_flag(hdev, HCI_SSP_ENABLED);
			hci_dev_clear_flag(hdev, HCI_LINK_SECURITY);
			hci_dev_clear_flag(hdev, HCI_FAST_CONNECTABLE);
		}

		hci_dev_change_flag(hdev, HCI_BREDR_ENABLED);

		err =
send_settings_rsp(sk, MGMT_OP_SET_BREDR, hdev); if (err < 0) goto unlock; err = new_settings(hdev, sk); goto unlock; } /* Reject disabling when powered on */ if (!cp->val) { err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_BREDR, MGMT_STATUS_REJECTED); goto unlock; } else { /* When configuring a dual-mode controller to operate * with LE only and using a static address, then switching * BR/EDR back on is not allowed. * * Dual-mode controllers shall operate with the public * address as its identity address for BR/EDR and LE. So * reject the attempt to create an invalid configuration. * * The same restrictions applies when secure connections * has been enabled. For BR/EDR this is a controller feature * while for LE it is a host stack feature. This means that * switching BR/EDR back on when secure connections has been * enabled is not a supported transaction. */ if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED) && (bacmp(&hdev->static_addr, BDADDR_ANY) || hci_dev_test_flag(hdev, HCI_SC_ENABLED))) { err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_BREDR, MGMT_STATUS_REJECTED); goto unlock; } } cmd = mgmt_pending_new(sk, MGMT_OP_SET_BREDR, hdev, data, len); if (!cmd) err = -ENOMEM; else err = hci_cmd_sync_queue(hdev, set_bredr_sync, cmd, set_bredr_complete); if (err < 0) { mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_BREDR, MGMT_STATUS_FAILED); if (cmd) mgmt_pending_free(cmd); goto unlock; } /* We need to flip the bit already here so that * hci_req_update_adv_data generates the correct flags. 
*/ hci_dev_set_flag(hdev, HCI_BREDR_ENABLED); unlock: hci_dev_unlock(hdev); return err; } static void set_secure_conn_complete(struct hci_dev *hdev, void *data, int err) { struct mgmt_pending_cmd *cmd = data; struct mgmt_mode *cp; bt_dev_dbg(hdev, "err %d", err); if (err) { u8 mgmt_err = mgmt_status(err); mgmt_cmd_status(cmd->sk, cmd->index, cmd->opcode, mgmt_err); goto done; } cp = cmd->param; switch (cp->val) { case 0x00: hci_dev_clear_flag(hdev, HCI_SC_ENABLED); hci_dev_clear_flag(hdev, HCI_SC_ONLY); break; case 0x01: hci_dev_set_flag(hdev, HCI_SC_ENABLED); hci_dev_clear_flag(hdev, HCI_SC_ONLY); break; case 0x02: hci_dev_set_flag(hdev, HCI_SC_ENABLED); hci_dev_set_flag(hdev, HCI_SC_ONLY); break; } send_settings_rsp(cmd->sk, cmd->opcode, hdev); new_settings(hdev, cmd->sk); done: mgmt_pending_free(cmd); } static int set_secure_conn_sync(struct hci_dev *hdev, void *data) { struct mgmt_pending_cmd *cmd = data; struct mgmt_mode *cp = cmd->param; u8 val = !!cp->val; /* Force write of val */ hci_dev_set_flag(hdev, HCI_SC_ENABLED); return hci_write_sc_support_sync(hdev, val); } static int set_secure_conn(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) { struct mgmt_mode *cp = data; struct mgmt_pending_cmd *cmd; u8 val; int err; bt_dev_dbg(hdev, "sock %p", sk); if (!lmp_sc_capable(hdev) && !hci_dev_test_flag(hdev, HCI_LE_ENABLED)) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_SECURE_CONN, MGMT_STATUS_NOT_SUPPORTED); if (hci_dev_test_flag(hdev, HCI_BREDR_ENABLED) && lmp_sc_capable(hdev) && !hci_dev_test_flag(hdev, HCI_SSP_ENABLED)) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_SECURE_CONN, MGMT_STATUS_REJECTED); if (cp->val != 0x00 && cp->val != 0x01 && cp->val != 0x02) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_SECURE_CONN, MGMT_STATUS_INVALID_PARAMS); hci_dev_lock(hdev); if (!hdev_is_powered(hdev) || !lmp_sc_capable(hdev) || !hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) { bool changed; if (cp->val) { changed = !hci_dev_test_and_set_flag(hdev, 
HCI_SC_ENABLED); if (cp->val == 0x02) hci_dev_set_flag(hdev, HCI_SC_ONLY); else hci_dev_clear_flag(hdev, HCI_SC_ONLY); } else { changed = hci_dev_test_and_clear_flag(hdev, HCI_SC_ENABLED); hci_dev_clear_flag(hdev, HCI_SC_ONLY); } err = send_settings_rsp(sk, MGMT_OP_SET_SECURE_CONN, hdev); if (err < 0) goto failed; if (changed) err = new_settings(hdev, sk); goto failed; } val = !!cp->val; if (val == hci_dev_test_flag(hdev, HCI_SC_ENABLED) && (cp->val == 0x02) == hci_dev_test_flag(hdev, HCI_SC_ONLY)) { err = send_settings_rsp(sk, MGMT_OP_SET_SECURE_CONN, hdev); goto failed; } cmd = mgmt_pending_new(sk, MGMT_OP_SET_SECURE_CONN, hdev, data, len); if (!cmd) err = -ENOMEM; else err = hci_cmd_sync_queue(hdev, set_secure_conn_sync, cmd, set_secure_conn_complete); if (err < 0) { mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_SECURE_CONN, MGMT_STATUS_FAILED); if (cmd) mgmt_pending_free(cmd); } failed: hci_dev_unlock(hdev); return err; } static int set_debug_keys(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) { struct mgmt_mode *cp = data; bool changed, use_changed; int err; bt_dev_dbg(hdev, "sock %p", sk); if (cp->val != 0x00 && cp->val != 0x01 && cp->val != 0x02) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_DEBUG_KEYS, MGMT_STATUS_INVALID_PARAMS); hci_dev_lock(hdev); if (cp->val) changed = !hci_dev_test_and_set_flag(hdev, HCI_KEEP_DEBUG_KEYS); else changed = hci_dev_test_and_clear_flag(hdev, HCI_KEEP_DEBUG_KEYS); if (cp->val == 0x02) use_changed = !hci_dev_test_and_set_flag(hdev, HCI_USE_DEBUG_KEYS); else use_changed = hci_dev_test_and_clear_flag(hdev, HCI_USE_DEBUG_KEYS); if (hdev_is_powered(hdev) && use_changed && hci_dev_test_flag(hdev, HCI_SSP_ENABLED)) { u8 mode = (cp->val == 0x02) ? 
0x01 : 0x00; hci_send_cmd(hdev, HCI_OP_WRITE_SSP_DEBUG_MODE, sizeof(mode), &mode); } err = send_settings_rsp(sk, MGMT_OP_SET_DEBUG_KEYS, hdev); if (err < 0) goto unlock; if (changed) err = new_settings(hdev, sk); unlock: hci_dev_unlock(hdev); return err; } static int set_privacy(struct sock *sk, struct hci_dev *hdev, void *cp_data, u16 len) { struct mgmt_cp_set_privacy *cp = cp_data; bool changed; int err; bt_dev_dbg(hdev, "sock %p", sk); if (!lmp_le_capable(hdev)) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_PRIVACY, MGMT_STATUS_NOT_SUPPORTED); if (cp->privacy != 0x00 && cp->privacy != 0x01 && cp->privacy != 0x02) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_PRIVACY, MGMT_STATUS_INVALID_PARAMS); if (hdev_is_powered(hdev)) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_PRIVACY, MGMT_STATUS_REJECTED); hci_dev_lock(hdev); /* If user space supports this command it is also expected to * handle IRKs. Therefore, set the HCI_RPA_RESOLVING flag. */ hci_dev_set_flag(hdev, HCI_RPA_RESOLVING); if (cp->privacy) { changed = !hci_dev_test_and_set_flag(hdev, HCI_PRIVACY); memcpy(hdev->irk, cp->irk, sizeof(hdev->irk)); hci_dev_set_flag(hdev, HCI_RPA_EXPIRED); hci_adv_instances_set_rpa_expired(hdev, true); if (cp->privacy == 0x02) hci_dev_set_flag(hdev, HCI_LIMITED_PRIVACY); else hci_dev_clear_flag(hdev, HCI_LIMITED_PRIVACY); } else { changed = hci_dev_test_and_clear_flag(hdev, HCI_PRIVACY); memset(hdev->irk, 0, sizeof(hdev->irk)); hci_dev_clear_flag(hdev, HCI_RPA_EXPIRED); hci_adv_instances_set_rpa_expired(hdev, false); hci_dev_clear_flag(hdev, HCI_LIMITED_PRIVACY); } err = send_settings_rsp(sk, MGMT_OP_SET_PRIVACY, hdev); if (err < 0) goto unlock; if (changed) err = new_settings(hdev, sk); unlock: hci_dev_unlock(hdev); return err; } static bool irk_is_valid(struct mgmt_irk_info *irk) { switch (irk->addr.type) { case BDADDR_LE_PUBLIC: return true; case BDADDR_LE_RANDOM: /* Two most significant bits shall be set */ if ((irk->addr.bdaddr.b[5] & 0xc0) != 0xc0) return false; 
return true; } return false; } static int load_irks(struct sock *sk, struct hci_dev *hdev, void *cp_data, u16 len) { struct mgmt_cp_load_irks *cp = cp_data; const u16 max_irk_count = ((U16_MAX - sizeof(*cp)) / sizeof(struct mgmt_irk_info)); u16 irk_count, expected_len; int i, err; bt_dev_dbg(hdev, "sock %p", sk); if (!lmp_le_capable(hdev)) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_IRKS, MGMT_STATUS_NOT_SUPPORTED); irk_count = __le16_to_cpu(cp->irk_count); if (irk_count > max_irk_count) { bt_dev_err(hdev, "load_irks: too big irk_count value %u", irk_count); return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_IRKS, MGMT_STATUS_INVALID_PARAMS); } expected_len = struct_size(cp, irks, irk_count); if (expected_len != len) { bt_dev_err(hdev, "load_irks: expected %u bytes, got %u bytes", expected_len, len); return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_IRKS, MGMT_STATUS_INVALID_PARAMS); } bt_dev_dbg(hdev, "irk_count %u", irk_count); for (i = 0; i < irk_count; i++) { struct mgmt_irk_info *key = &cp->irks[i]; if (!irk_is_valid(key)) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_IRKS, MGMT_STATUS_INVALID_PARAMS); } hci_dev_lock(hdev); hci_smp_irks_clear(hdev); for (i = 0; i < irk_count; i++) { struct mgmt_irk_info *irk = &cp->irks[i]; if (hci_is_blocked_key(hdev, HCI_BLOCKED_KEY_TYPE_IRK, irk->val)) { bt_dev_warn(hdev, "Skipping blocked IRK for %pMR", &irk->addr.bdaddr); continue; } hci_add_irk(hdev, &irk->addr.bdaddr, le_addr_type(irk->addr.type), irk->val, BDADDR_ANY); } hci_dev_set_flag(hdev, HCI_RPA_RESOLVING); err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_LOAD_IRKS, 0, NULL, 0); hci_dev_unlock(hdev); return err; } static bool ltk_is_valid(struct mgmt_ltk_info *key) { if (key->initiator != 0x00 && key->initiator != 0x01) return false; switch (key->addr.type) { case BDADDR_LE_PUBLIC: return true; case BDADDR_LE_RANDOM: /* Two most significant bits shall be set */ if ((key->addr.bdaddr.b[5] & 0xc0) != 0xc0) return false; return true; } return false; } static 
int load_long_term_keys(struct sock *sk, struct hci_dev *hdev, void *cp_data, u16 len) { struct mgmt_cp_load_long_term_keys *cp = cp_data; const u16 max_key_count = ((U16_MAX - sizeof(*cp)) / sizeof(struct mgmt_ltk_info)); u16 key_count, expected_len; int i, err; bt_dev_dbg(hdev, "sock %p", sk); if (!lmp_le_capable(hdev)) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_LONG_TERM_KEYS, MGMT_STATUS_NOT_SUPPORTED); key_count = __le16_to_cpu(cp->key_count); if (key_count > max_key_count) { bt_dev_err(hdev, "load_ltks: too big key_count value %u", key_count); return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_LONG_TERM_KEYS, MGMT_STATUS_INVALID_PARAMS); } expected_len = struct_size(cp, keys, key_count); if (expected_len != len) { bt_dev_err(hdev, "load_keys: expected %u bytes, got %u bytes", expected_len, len); return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_LONG_TERM_KEYS, MGMT_STATUS_INVALID_PARAMS); } bt_dev_dbg(hdev, "key_count %u", key_count); hci_dev_lock(hdev); hci_smp_ltks_clear(hdev); for (i = 0; i < key_count; i++) { struct mgmt_ltk_info *key = &cp->keys[i]; u8 type, authenticated; if (hci_is_blocked_key(hdev, HCI_BLOCKED_KEY_TYPE_LTK, key->val)) { bt_dev_warn(hdev, "Skipping blocked LTK for %pMR", &key->addr.bdaddr); continue; } if (!ltk_is_valid(key)) { bt_dev_warn(hdev, "Invalid LTK for %pMR", &key->addr.bdaddr); continue; } switch (key->type) { case MGMT_LTK_UNAUTHENTICATED: authenticated = 0x00; type = key->initiator ? SMP_LTK : SMP_LTK_RESPONDER; break; case MGMT_LTK_AUTHENTICATED: authenticated = 0x01; type = key->initiator ? 
SMP_LTK : SMP_LTK_RESPONDER; break; case MGMT_LTK_P256_UNAUTH: authenticated = 0x00; type = SMP_LTK_P256; break; case MGMT_LTK_P256_AUTH: authenticated = 0x01; type = SMP_LTK_P256; break; case MGMT_LTK_P256_DEBUG: authenticated = 0x00; type = SMP_LTK_P256_DEBUG; fallthrough; default: continue; } hci_add_ltk(hdev, &key->addr.bdaddr, le_addr_type(key->addr.type), type, authenticated, key->val, key->enc_size, key->ediv, key->rand); } err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_LOAD_LONG_TERM_KEYS, 0, NULL, 0); hci_dev_unlock(hdev); return err; } static void get_conn_info_complete(struct hci_dev *hdev, void *data, int err) { struct mgmt_pending_cmd *cmd = data; struct hci_conn *conn = cmd->user_data; struct mgmt_cp_get_conn_info *cp = cmd->param; struct mgmt_rp_get_conn_info rp; u8 status; bt_dev_dbg(hdev, "err %d", err); memcpy(&rp.addr, &cp->addr, sizeof(rp.addr)); status = mgmt_status(err); if (status == MGMT_STATUS_SUCCESS) { rp.rssi = conn->rssi; rp.tx_power = conn->tx_power; rp.max_tx_power = conn->max_tx_power; } else { rp.rssi = HCI_RSSI_INVALID; rp.tx_power = HCI_TX_POWER_INVALID; rp.max_tx_power = HCI_TX_POWER_INVALID; } mgmt_cmd_complete(cmd->sk, cmd->index, MGMT_OP_GET_CONN_INFO, status, &rp, sizeof(rp)); mgmt_pending_free(cmd); } static int get_conn_info_sync(struct hci_dev *hdev, void *data) { struct mgmt_pending_cmd *cmd = data; struct mgmt_cp_get_conn_info *cp = cmd->param; struct hci_conn *conn; int err; __le16 handle; /* Make sure we are still connected */ if (cp->addr.type == BDADDR_BREDR) conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &cp->addr.bdaddr); else conn = hci_conn_hash_lookup_ba(hdev, LE_LINK, &cp->addr.bdaddr); if (!conn || conn->state != BT_CONNECTED) return MGMT_STATUS_NOT_CONNECTED; cmd->user_data = conn; handle = cpu_to_le16(conn->handle); /* Refresh RSSI each time */ err = hci_read_rssi_sync(hdev, handle); /* For LE links TX power does not change thus we don't need to * query for it once value is known. 
*/ if (!err && (!bdaddr_type_is_le(cp->addr.type) || conn->tx_power == HCI_TX_POWER_INVALID)) err = hci_read_tx_power_sync(hdev, handle, 0x00); /* Max TX power needs to be read only once per connection */ if (!err && conn->max_tx_power == HCI_TX_POWER_INVALID) err = hci_read_tx_power_sync(hdev, handle, 0x01); return err; } static int get_conn_info(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) { struct mgmt_cp_get_conn_info *cp = data; struct mgmt_rp_get_conn_info rp; struct hci_conn *conn; unsigned long conn_info_age; int err = 0; bt_dev_dbg(hdev, "sock %p", sk); memset(&rp, 0, sizeof(rp)); bacpy(&rp.addr.bdaddr, &cp->addr.bdaddr); rp.addr.type = cp->addr.type; if (!bdaddr_type_is_valid(cp->addr.type)) return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_GET_CONN_INFO, MGMT_STATUS_INVALID_PARAMS, &rp, sizeof(rp)); hci_dev_lock(hdev); if (!hdev_is_powered(hdev)) { err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_GET_CONN_INFO, MGMT_STATUS_NOT_POWERED, &rp, sizeof(rp)); goto unlock; } if (cp->addr.type == BDADDR_BREDR) conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &cp->addr.bdaddr); else conn = hci_conn_hash_lookup_ba(hdev, LE_LINK, &cp->addr.bdaddr); if (!conn || conn->state != BT_CONNECTED) { err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_GET_CONN_INFO, MGMT_STATUS_NOT_CONNECTED, &rp, sizeof(rp)); goto unlock; } /* To avoid client trying to guess when to poll again for information we * calculate conn info age as random value between min/max set in hdev. */ conn_info_age = get_random_u32_inclusive(hdev->conn_info_min_age, hdev->conn_info_max_age - 1); /* Query controller to refresh cached values if they are too old or were * never read. 
*/ if (time_after(jiffies, conn->conn_info_timestamp + msecs_to_jiffies(conn_info_age)) || !conn->conn_info_timestamp) { struct mgmt_pending_cmd *cmd; cmd = mgmt_pending_new(sk, MGMT_OP_GET_CONN_INFO, hdev, data, len); if (!cmd) { err = -ENOMEM; } else { err = hci_cmd_sync_queue(hdev, get_conn_info_sync, cmd, get_conn_info_complete); } if (err < 0) { mgmt_cmd_complete(sk, hdev->id, MGMT_OP_GET_CONN_INFO, MGMT_STATUS_FAILED, &rp, sizeof(rp)); if (cmd) mgmt_pending_free(cmd); goto unlock; } conn->conn_info_timestamp = jiffies; } else { /* Cache is valid, just reply with values cached in hci_conn */ rp.rssi = conn->rssi; rp.tx_power = conn->tx_power; rp.max_tx_power = conn->max_tx_power; err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_GET_CONN_INFO, MGMT_STATUS_SUCCESS, &rp, sizeof(rp)); } unlock: hci_dev_unlock(hdev); return err; } static void get_clock_info_complete(struct hci_dev *hdev, void *data, int err) { struct mgmt_pending_cmd *cmd = data; struct mgmt_cp_get_clock_info *cp = cmd->param; struct mgmt_rp_get_clock_info rp; struct hci_conn *conn = cmd->user_data; u8 status = mgmt_status(err); bt_dev_dbg(hdev, "err %d", err); memset(&rp, 0, sizeof(rp)); bacpy(&rp.addr.bdaddr, &cp->addr.bdaddr); rp.addr.type = cp->addr.type; if (err) goto complete; rp.local_clock = cpu_to_le32(hdev->clock); if (conn) { rp.piconet_clock = cpu_to_le32(conn->clock); rp.accuracy = cpu_to_le16(conn->clock_accuracy); } complete: mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode, status, &rp, sizeof(rp)); mgmt_pending_free(cmd); } static int get_clock_info_sync(struct hci_dev *hdev, void *data) { struct mgmt_pending_cmd *cmd = data; struct mgmt_cp_get_clock_info *cp = cmd->param; struct hci_cp_read_clock hci_cp; struct hci_conn *conn; memset(&hci_cp, 0, sizeof(hci_cp)); hci_read_clock_sync(hdev, &hci_cp); /* Make sure connection still exists */ conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &cp->addr.bdaddr); if (!conn || conn->state != BT_CONNECTED) return MGMT_STATUS_NOT_CONNECTED; 
cmd->user_data = conn; hci_cp.handle = cpu_to_le16(conn->handle); hci_cp.which = 0x01; /* Piconet clock */ return hci_read_clock_sync(hdev, &hci_cp); } static int get_clock_info(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) { struct mgmt_cp_get_clock_info *cp = data; struct mgmt_rp_get_clock_info rp; struct mgmt_pending_cmd *cmd; struct hci_conn *conn; int err; bt_dev_dbg(hdev, "sock %p", sk); memset(&rp, 0, sizeof(rp)); bacpy(&rp.addr.bdaddr, &cp->addr.bdaddr); rp.addr.type = cp->addr.type; if (cp->addr.type != BDADDR_BREDR) return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_GET_CLOCK_INFO, MGMT_STATUS_INVALID_PARAMS, &rp, sizeof(rp)); hci_dev_lock(hdev); if (!hdev_is_powered(hdev)) { err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_GET_CLOCK_INFO, MGMT_STATUS_NOT_POWERED, &rp, sizeof(rp)); goto unlock; } if (bacmp(&cp->addr.bdaddr, BDADDR_ANY)) { conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &cp->addr.bdaddr); if (!conn || conn->state != BT_CONNECTED) { err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_GET_CLOCK_INFO, MGMT_STATUS_NOT_CONNECTED, &rp, sizeof(rp)); goto unlock; } } else { conn = NULL; } cmd = mgmt_pending_new(sk, MGMT_OP_GET_CLOCK_INFO, hdev, data, len); if (!cmd) err = -ENOMEM; else err = hci_cmd_sync_queue(hdev, get_clock_info_sync, cmd, get_clock_info_complete); if (err < 0) { err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_GET_CLOCK_INFO, MGMT_STATUS_FAILED, &rp, sizeof(rp)); if (cmd) mgmt_pending_free(cmd); } unlock: hci_dev_unlock(hdev); return err; } static bool is_connected(struct hci_dev *hdev, bdaddr_t *addr, u8 type) { struct hci_conn *conn; conn = hci_conn_hash_lookup_ba(hdev, LE_LINK, addr); if (!conn) return false; if (conn->dst_type != type) return false; if (conn->state != BT_CONNECTED) return false; return true; } /* This function requires the caller holds hdev->lock */ static int hci_conn_params_set(struct hci_dev *hdev, bdaddr_t *addr, u8 addr_type, u8 auto_connect) { struct hci_conn_params *params; params = 
hci_conn_params_add(hdev, addr, addr_type); if (!params) return -EIO; if (params->auto_connect == auto_connect) return 0; hci_pend_le_list_del_init(params); switch (auto_connect) { case HCI_AUTO_CONN_DISABLED: case HCI_AUTO_CONN_LINK_LOSS: /* If auto connect is being disabled when we're trying to * connect to device, keep connecting. */ if (params->explicit_connect) hci_pend_le_list_add(params, &hdev->pend_le_conns); break; case HCI_AUTO_CONN_REPORT: if (params->explicit_connect) hci_pend_le_list_add(params, &hdev->pend_le_conns); else hci_pend_le_list_add(params, &hdev->pend_le_reports); break; case HCI_AUTO_CONN_DIRECT: case HCI_AUTO_CONN_ALWAYS: if (!is_connected(hdev, addr, addr_type)) hci_pend_le_list_add(params, &hdev->pend_le_conns); break; } params->auto_connect = auto_connect; bt_dev_dbg(hdev, "addr %pMR (type %u) auto_connect %u", addr, addr_type, auto_connect); return 0; } static void device_added(struct sock *sk, struct hci_dev *hdev, bdaddr_t *bdaddr, u8 type, u8 action) { struct mgmt_ev_device_added ev; bacpy(&ev.addr.bdaddr, bdaddr); ev.addr.type = type; ev.action = action; mgmt_event(MGMT_EV_DEVICE_ADDED, hdev, &ev, sizeof(ev), sk); } static void add_device_complete(struct hci_dev *hdev, void *data, int err) { struct mgmt_pending_cmd *cmd = data; struct mgmt_cp_add_device *cp = cmd->param; if (!err) { struct hci_conn_params *params; params = hci_conn_params_lookup(hdev, &cp->addr.bdaddr, le_addr_type(cp->addr.type)); device_added(cmd->sk, hdev, &cp->addr.bdaddr, cp->addr.type, cp->action); device_flags_changed(NULL, hdev, &cp->addr.bdaddr, cp->addr.type, hdev->conn_flags, params ? 
params->flags : 0); } mgmt_cmd_complete(cmd->sk, hdev->id, MGMT_OP_ADD_DEVICE, mgmt_status(err), &cp->addr, sizeof(cp->addr)); mgmt_pending_free(cmd); } static int add_device_sync(struct hci_dev *hdev, void *data) { return hci_update_passive_scan_sync(hdev); } static int add_device(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) { struct mgmt_pending_cmd *cmd; struct mgmt_cp_add_device *cp = data; u8 auto_conn, addr_type; struct hci_conn_params *params; int err; u32 current_flags = 0; u32 supported_flags; bt_dev_dbg(hdev, "sock %p", sk); if (!bdaddr_type_is_valid(cp->addr.type) || !bacmp(&cp->addr.bdaddr, BDADDR_ANY)) return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_ADD_DEVICE, MGMT_STATUS_INVALID_PARAMS, &cp->addr, sizeof(cp->addr)); if (cp->action != 0x00 && cp->action != 0x01 && cp->action != 0x02) return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_ADD_DEVICE, MGMT_STATUS_INVALID_PARAMS, &cp->addr, sizeof(cp->addr)); hci_dev_lock(hdev); if (cp->addr.type == BDADDR_BREDR) { /* Only incoming connections action is supported for now */ if (cp->action != 0x01) { err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_ADD_DEVICE, MGMT_STATUS_INVALID_PARAMS, &cp->addr, sizeof(cp->addr)); goto unlock; } err = hci_bdaddr_list_add_with_flags(&hdev->accept_list, &cp->addr.bdaddr, cp->addr.type, 0); if (err) goto unlock; hci_update_scan(hdev); goto added; } addr_type = le_addr_type(cp->addr.type); if (cp->action == 0x02) auto_conn = HCI_AUTO_CONN_ALWAYS; else if (cp->action == 0x01) auto_conn = HCI_AUTO_CONN_DIRECT; else auto_conn = HCI_AUTO_CONN_REPORT; /* Kernel internally uses conn_params with resolvable private * address, but Add Device allows only identity addresses. * Make sure it is enforced before calling * hci_conn_params_lookup. 
*/ if (!hci_is_identity_address(&cp->addr.bdaddr, addr_type)) { err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_ADD_DEVICE, MGMT_STATUS_INVALID_PARAMS, &cp->addr, sizeof(cp->addr)); goto unlock; } /* If the connection parameters don't exist for this device, * they will be created and configured with defaults. */ if (hci_conn_params_set(hdev, &cp->addr.bdaddr, addr_type, auto_conn) < 0) { err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_ADD_DEVICE, MGMT_STATUS_FAILED, &cp->addr, sizeof(cp->addr)); goto unlock; } else { params = hci_conn_params_lookup(hdev, &cp->addr.bdaddr, addr_type); if (params) current_flags = params->flags; } cmd = mgmt_pending_new(sk, MGMT_OP_ADD_DEVICE, hdev, data, len); if (!cmd) { err = -ENOMEM; goto unlock; } err = hci_cmd_sync_queue(hdev, add_device_sync, cmd, add_device_complete); if (err < 0) { err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_ADD_DEVICE, MGMT_STATUS_FAILED, &cp->addr, sizeof(cp->addr)); mgmt_pending_free(cmd); } goto unlock; added: device_added(sk, hdev, &cp->addr.bdaddr, cp->addr.type, cp->action); supported_flags = hdev->conn_flags; device_flags_changed(NULL, hdev, &cp->addr.bdaddr, cp->addr.type, supported_flags, current_flags); err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_ADD_DEVICE, MGMT_STATUS_SUCCESS, &cp->addr, sizeof(cp->addr)); unlock: hci_dev_unlock(hdev); return err; } static void device_removed(struct sock *sk, struct hci_dev *hdev, bdaddr_t *bdaddr, u8 type) { struct mgmt_ev_device_removed ev; bacpy(&ev.addr.bdaddr, bdaddr); ev.addr.type = type; mgmt_event(MGMT_EV_DEVICE_REMOVED, hdev, &ev, sizeof(ev), sk); } static int remove_device_sync(struct hci_dev *hdev, void *data) { return hci_update_passive_scan_sync(hdev); } static int remove_device(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) { struct mgmt_cp_remove_device *cp = data; int err; bt_dev_dbg(hdev, "sock %p", sk); hci_dev_lock(hdev); if (bacmp(&cp->addr.bdaddr, BDADDR_ANY)) { struct hci_conn_params *params; u8 addr_type; if 
(!bdaddr_type_is_valid(cp->addr.type)) { err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_REMOVE_DEVICE, MGMT_STATUS_INVALID_PARAMS, &cp->addr, sizeof(cp->addr)); goto unlock; } if (cp->addr.type == BDADDR_BREDR) { err = hci_bdaddr_list_del(&hdev->accept_list, &cp->addr.bdaddr, cp->addr.type); if (err) { err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_REMOVE_DEVICE, MGMT_STATUS_INVALID_PARAMS, &cp->addr, sizeof(cp->addr)); goto unlock; } hci_update_scan(hdev); device_removed(sk, hdev, &cp->addr.bdaddr, cp->addr.type); goto complete; } addr_type = le_addr_type(cp->addr.type); /* Kernel internally uses conn_params with resolvable private * address, but Remove Device allows only identity addresses. * Make sure it is enforced before calling * hci_conn_params_lookup. */ if (!hci_is_identity_address(&cp->addr.bdaddr, addr_type)) { err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_REMOVE_DEVICE, MGMT_STATUS_INVALID_PARAMS, &cp->addr, sizeof(cp->addr)); goto unlock; } params = hci_conn_params_lookup(hdev, &cp->addr.bdaddr, addr_type); if (!params) { err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_REMOVE_DEVICE, MGMT_STATUS_INVALID_PARAMS, &cp->addr, sizeof(cp->addr)); goto unlock; } if (params->auto_connect == HCI_AUTO_CONN_DISABLED || params->auto_connect == HCI_AUTO_CONN_EXPLICIT) { err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_REMOVE_DEVICE, MGMT_STATUS_INVALID_PARAMS, &cp->addr, sizeof(cp->addr)); goto unlock; } hci_conn_params_free(params); device_removed(sk, hdev, &cp->addr.bdaddr, cp->addr.type); } else { struct hci_conn_params *p, *tmp; struct bdaddr_list *b, *btmp; if (cp->addr.type) { err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_REMOVE_DEVICE, MGMT_STATUS_INVALID_PARAMS, &cp->addr, sizeof(cp->addr)); goto unlock; } list_for_each_entry_safe(b, btmp, &hdev->accept_list, list) { device_removed(sk, hdev, &b->bdaddr, b->bdaddr_type); list_del(&b->list); kfree(b); } hci_update_scan(hdev); list_for_each_entry_safe(p, tmp, &hdev->le_conn_params, list) { if (p->auto_connect == 
HCI_AUTO_CONN_DISABLED) continue; device_removed(sk, hdev, &p->addr, p->addr_type); if (p->explicit_connect) { p->auto_connect = HCI_AUTO_CONN_EXPLICIT; continue; } hci_conn_params_free(p); } bt_dev_dbg(hdev, "All LE connection parameters were removed"); } hci_cmd_sync_queue(hdev, remove_device_sync, NULL, NULL); complete: err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_REMOVE_DEVICE, MGMT_STATUS_SUCCESS, &cp->addr, sizeof(cp->addr)); unlock: hci_dev_unlock(hdev); return err; } static int conn_update_sync(struct hci_dev *hdev, void *data) { struct hci_conn_params *params = data; struct hci_conn *conn; conn = hci_conn_hash_lookup_le(hdev, ¶ms->addr, params->addr_type); if (!conn) return -ECANCELED; return hci_le_conn_update_sync(hdev, conn, params); } static int load_conn_param(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) { struct mgmt_cp_load_conn_param *cp = data; const u16 max_param_count = ((U16_MAX - sizeof(*cp)) / sizeof(struct mgmt_conn_param)); u16 param_count, expected_len; int i; if (!lmp_le_capable(hdev)) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_CONN_PARAM, MGMT_STATUS_NOT_SUPPORTED); param_count = __le16_to_cpu(cp->param_count); if (param_count > max_param_count) { bt_dev_err(hdev, "load_conn_param: too big param_count value %u", param_count); return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_CONN_PARAM, MGMT_STATUS_INVALID_PARAMS); } expected_len = struct_size(cp, params, param_count); if (expected_len != len) { bt_dev_err(hdev, "load_conn_param: expected %u bytes, got %u bytes", expected_len, len); return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_CONN_PARAM, MGMT_STATUS_INVALID_PARAMS); } bt_dev_dbg(hdev, "param_count %u", param_count); hci_dev_lock(hdev); if (param_count > 1) hci_conn_params_clear_disabled(hdev); for (i = 0; i < param_count; i++) { struct mgmt_conn_param *param = &cp->params[i]; struct hci_conn_params *hci_param; u16 min, max, latency, timeout; bool update = false; u8 addr_type; bt_dev_dbg(hdev, "Adding %pMR 
(type %u)", ¶m->addr.bdaddr, param->addr.type); if (param->addr.type == BDADDR_LE_PUBLIC) { addr_type = ADDR_LE_DEV_PUBLIC; } else if (param->addr.type == BDADDR_LE_RANDOM) { addr_type = ADDR_LE_DEV_RANDOM; } else { bt_dev_err(hdev, "ignoring invalid connection parameters"); continue; } min = le16_to_cpu(param->min_interval); max = le16_to_cpu(param->max_interval); latency = le16_to_cpu(param->latency); timeout = le16_to_cpu(param->timeout); bt_dev_dbg(hdev, "min 0x%04x max 0x%04x latency 0x%04x timeout 0x%04x", min, max, latency, timeout); if (hci_check_conn_params(min, max, latency, timeout) < 0) { bt_dev_err(hdev, "ignoring invalid connection parameters"); continue; } /* Detect when the loading is for an existing parameter then * attempt to trigger the connection update procedure. */ if (!i && param_count == 1) { hci_param = hci_conn_params_lookup(hdev, ¶m->addr.bdaddr, addr_type); if (hci_param) update = true; else hci_conn_params_clear_disabled(hdev); } hci_param = hci_conn_params_add(hdev, ¶m->addr.bdaddr, addr_type); if (!hci_param) { bt_dev_err(hdev, "failed to add connection parameters"); continue; } hci_param->conn_min_interval = min; hci_param->conn_max_interval = max; hci_param->conn_latency = latency; hci_param->supervision_timeout = timeout; /* Check if we need to trigger a connection update */ if (update) { struct hci_conn *conn; /* Lookup for existing connection as central and check * if parameters match and if they don't then trigger * a connection update. 
*/ conn = hci_conn_hash_lookup_le(hdev, &hci_param->addr, addr_type); if (conn && conn->role == HCI_ROLE_MASTER && (conn->le_conn_min_interval != min || conn->le_conn_max_interval != max || conn->le_conn_latency != latency || conn->le_supv_timeout != timeout)) hci_cmd_sync_queue(hdev, conn_update_sync, hci_param, NULL); } } hci_dev_unlock(hdev); return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_LOAD_CONN_PARAM, 0, NULL, 0); } static int set_external_config(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) { struct mgmt_cp_set_external_config *cp = data; bool changed; int err; bt_dev_dbg(hdev, "sock %p", sk); if (hdev_is_powered(hdev)) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_EXTERNAL_CONFIG, MGMT_STATUS_REJECTED); if (cp->config != 0x00 && cp->config != 0x01) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_EXTERNAL_CONFIG, MGMT_STATUS_INVALID_PARAMS); if (!test_bit(HCI_QUIRK_EXTERNAL_CONFIG, &hdev->quirks)) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_EXTERNAL_CONFIG, MGMT_STATUS_NOT_SUPPORTED); hci_dev_lock(hdev); if (cp->config) changed = !hci_dev_test_and_set_flag(hdev, HCI_EXT_CONFIGURED); else changed = hci_dev_test_and_clear_flag(hdev, HCI_EXT_CONFIGURED); err = send_options_rsp(sk, MGMT_OP_SET_EXTERNAL_CONFIG, hdev); if (err < 0) goto unlock; if (!changed) goto unlock; err = new_options(hdev, sk); if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED) == is_configured(hdev)) { mgmt_index_removed(hdev); if (hci_dev_test_and_change_flag(hdev, HCI_UNCONFIGURED)) { hci_dev_set_flag(hdev, HCI_CONFIG); hci_dev_set_flag(hdev, HCI_AUTO_OFF); queue_work(hdev->req_workqueue, &hdev->power_on); } else { set_bit(HCI_RAW, &hdev->flags); mgmt_index_added(hdev); } } unlock: hci_dev_unlock(hdev); return err; } static int set_public_address(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) { struct mgmt_cp_set_public_address *cp = data; bool changed; int err; bt_dev_dbg(hdev, "sock %p", sk); if (hdev_is_powered(hdev)) return mgmt_cmd_status(sk, hdev->id, 
MGMT_OP_SET_PUBLIC_ADDRESS, MGMT_STATUS_REJECTED); if (!bacmp(&cp->bdaddr, BDADDR_ANY)) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_PUBLIC_ADDRESS, MGMT_STATUS_INVALID_PARAMS); if (!hdev->set_bdaddr) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_PUBLIC_ADDRESS, MGMT_STATUS_NOT_SUPPORTED); hci_dev_lock(hdev); changed = !!bacmp(&hdev->public_addr, &cp->bdaddr); bacpy(&hdev->public_addr, &cp->bdaddr); err = send_options_rsp(sk, MGMT_OP_SET_PUBLIC_ADDRESS, hdev); if (err < 0) goto unlock; if (!changed) goto unlock; if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) err = new_options(hdev, sk); if (is_configured(hdev)) { mgmt_index_removed(hdev); hci_dev_clear_flag(hdev, HCI_UNCONFIGURED); hci_dev_set_flag(hdev, HCI_CONFIG); hci_dev_set_flag(hdev, HCI_AUTO_OFF); queue_work(hdev->req_workqueue, &hdev->power_on); } unlock: hci_dev_unlock(hdev); return err; } static void read_local_oob_ext_data_complete(struct hci_dev *hdev, void *data, int err) { const struct mgmt_cp_read_local_oob_ext_data *mgmt_cp; struct mgmt_rp_read_local_oob_ext_data *mgmt_rp; u8 *h192, *r192, *h256, *r256; struct mgmt_pending_cmd *cmd = data; struct sk_buff *skb = cmd->skb; u8 status = mgmt_status(err); u16 eir_len; if (err == -ECANCELED || cmd != pending_find(MGMT_OP_READ_LOCAL_OOB_EXT_DATA, hdev)) return; if (!status) { if (!skb) status = MGMT_STATUS_FAILED; else if (IS_ERR(skb)) status = mgmt_status(PTR_ERR(skb)); else status = mgmt_status(skb->data[0]); } bt_dev_dbg(hdev, "status %u", status); mgmt_cp = cmd->param; if (status) { status = mgmt_status(status); eir_len = 0; h192 = NULL; r192 = NULL; h256 = NULL; r256 = NULL; } else if (!bredr_sc_enabled(hdev)) { struct hci_rp_read_local_oob_data *rp; if (skb->len != sizeof(*rp)) { status = MGMT_STATUS_FAILED; eir_len = 0; } else { status = MGMT_STATUS_SUCCESS; rp = (void *)skb->data; eir_len = 5 + 18 + 18; h192 = rp->hash; r192 = rp->rand; h256 = NULL; r256 = NULL; } } else { struct hci_rp_read_local_oob_ext_data *rp; if (skb->len != 
sizeof(*rp)) { status = MGMT_STATUS_FAILED; eir_len = 0; } else { status = MGMT_STATUS_SUCCESS; rp = (void *)skb->data; if (hci_dev_test_flag(hdev, HCI_SC_ONLY)) { eir_len = 5 + 18 + 18; h192 = NULL; r192 = NULL; } else { eir_len = 5 + 18 + 18 + 18 + 18; h192 = rp->hash192; r192 = rp->rand192; } h256 = rp->hash256; r256 = rp->rand256; } } mgmt_rp = kmalloc(sizeof(*mgmt_rp) + eir_len, GFP_KERNEL); if (!mgmt_rp) goto done; if (eir_len == 0) goto send_rsp; eir_len = eir_append_data(mgmt_rp->eir, 0, EIR_CLASS_OF_DEV, hdev->dev_class, 3); if (h192 && r192) { eir_len = eir_append_data(mgmt_rp->eir, eir_len, EIR_SSP_HASH_C192, h192, 16); eir_len = eir_append_data(mgmt_rp->eir, eir_len, EIR_SSP_RAND_R192, r192, 16); } if (h256 && r256) { eir_len = eir_append_data(mgmt_rp->eir, eir_len, EIR_SSP_HASH_C256, h256, 16); eir_len = eir_append_data(mgmt_rp->eir, eir_len, EIR_SSP_RAND_R256, r256, 16); } send_rsp: mgmt_rp->type = mgmt_cp->type; mgmt_rp->eir_len = cpu_to_le16(eir_len); err = mgmt_cmd_complete(cmd->sk, hdev->id, MGMT_OP_READ_LOCAL_OOB_EXT_DATA, status, mgmt_rp, sizeof(*mgmt_rp) + eir_len); if (err < 0 || status) goto done; hci_sock_set_flag(cmd->sk, HCI_MGMT_OOB_DATA_EVENTS); err = mgmt_limited_event(MGMT_EV_LOCAL_OOB_DATA_UPDATED, hdev, mgmt_rp, sizeof(*mgmt_rp) + eir_len, HCI_MGMT_OOB_DATA_EVENTS, cmd->sk); done: if (skb && !IS_ERR(skb)) kfree_skb(skb); kfree(mgmt_rp); mgmt_pending_remove(cmd); } static int read_local_ssp_oob_req(struct hci_dev *hdev, struct sock *sk, struct mgmt_cp_read_local_oob_ext_data *cp) { struct mgmt_pending_cmd *cmd; int err; cmd = mgmt_pending_add(sk, MGMT_OP_READ_LOCAL_OOB_EXT_DATA, hdev, cp, sizeof(*cp)); if (!cmd) return -ENOMEM; err = hci_cmd_sync_queue(hdev, read_local_oob_data_sync, cmd, read_local_oob_ext_data_complete); if (err < 0) { mgmt_pending_remove(cmd); return err; } return 0; } static int read_local_oob_ext_data(struct sock *sk, struct hci_dev *hdev, void *data, u16 data_len) { struct mgmt_cp_read_local_oob_ext_data *cp = 
data;
	struct mgmt_rp_read_local_oob_ext_data *rp;
	size_t rp_len;
	u16 eir_len;
	u8 status, flags, role, addr[7], hash[16], rand[16];
	int err;

	bt_dev_dbg(hdev, "sock %p", sk);

	/* First pass: validate the request and size the reply buffer */
	if (hdev_is_powered(hdev)) {
		switch (cp->type) {
		case BIT(BDADDR_BREDR):
			status = mgmt_bredr_support(hdev);
			if (status)
				eir_len = 0;
			else
				eir_len = 5;
			break;
		case (BIT(BDADDR_LE_PUBLIC) | BIT(BDADDR_LE_RANDOM)):
			status = mgmt_le_support(hdev);
			if (status)
				eir_len = 0;
			else
				eir_len = 9 + 3 + 18 + 18 + 3;
			break;
		default:
			status = MGMT_STATUS_INVALID_PARAMS;
			eir_len = 0;
			break;
		}
	} else {
		status = MGMT_STATUS_NOT_POWERED;
		eir_len = 0;
	}

	rp_len = sizeof(*rp) + eir_len;
	rp = kmalloc(rp_len, GFP_ATOMIC);
	if (!rp)
		return -ENOMEM;

	if (!status && !lmp_ssp_capable(hdev)) {
		status = MGMT_STATUS_NOT_SUPPORTED;
		eir_len = 0;
	}

	if (status)
		goto complete;

	hci_dev_lock(hdev);

	/* Second pass: fill rp->eir; eir_len now tracks the bytes written */
	eir_len = 0;
	switch (cp->type) {
	case BIT(BDADDR_BREDR):
		if (hci_dev_test_flag(hdev, HCI_SSP_ENABLED)) {
			/* The reply is sent from the completion callback; on
			 * success only the local buffer is released here.
			 */
			err = read_local_ssp_oob_req(hdev, sk, cp);
			hci_dev_unlock(hdev);
			if (!err)
				goto done;

			status = MGMT_STATUS_FAILED;
			goto complete;
		} else {
			eir_len = eir_append_data(rp->eir, eir_len,
						  EIR_CLASS_OF_DEV,
						  hdev->dev_class, 3);
		}
		break;
	case (BIT(BDADDR_LE_PUBLIC) | BIT(BDADDR_LE_RANDOM)):
		if (hci_dev_test_flag(hdev, HCI_SC_ENABLED) &&
		    smp_generate_oob(hdev, hash, rand) < 0) {
			hci_dev_unlock(hdev);
			status = MGMT_STATUS_FAILED;
			goto complete;
		}

		/* This should return the active RPA, but since the RPA
		 * is only programmed on demand, it is really hard to fill
		 * this in at the moment. For now disallow retrieving
		 * local out-of-band data when privacy is in use.
		 *
		 * Returning the identity address will not help here since
		 * pairing happens before the identity resolving key is
		 * known and thus the connection establishment happens
		 * based on the RPA and not the identity address.
		 */
		if (hci_dev_test_flag(hdev, HCI_PRIVACY)) {
			hci_dev_unlock(hdev);
			status = MGMT_STATUS_REJECTED;
			goto complete;
		}

		/* addr[0..5] is the address, addr[6] its type: 0x01 when the
		 * static address is used, 0x00 for hdev->bdaddr.
		 */
		if (hci_dev_test_flag(hdev, HCI_FORCE_STATIC_ADDR) ||
		   !bacmp(&hdev->bdaddr, BDADDR_ANY) ||
		   (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED) &&
		    bacmp(&hdev->static_addr, BDADDR_ANY))) {
			memcpy(addr, &hdev->static_addr, 6);
			addr[6] = 0x01;
		} else {
			memcpy(addr, &hdev->bdaddr, 6);
			addr[6] = 0x00;
		}

		eir_len = eir_append_data(rp->eir, eir_len, EIR_LE_BDADDR,
					  addr, sizeof(addr));

		/* role is 0x02 while HCI_ADVERTISING is set, 0x01 otherwise */
		if (hci_dev_test_flag(hdev, HCI_ADVERTISING))
			role = 0x02;
		else
			role = 0x01;

		eir_len = eir_append_data(rp->eir, eir_len, EIR_LE_ROLE,
					  &role, sizeof(role));

		if (hci_dev_test_flag(hdev, HCI_SC_ENABLED)) {
			eir_len = eir_append_data(rp->eir, eir_len,
						  EIR_LE_SC_CONFIRM,
						  hash, sizeof(hash));

			eir_len = eir_append_data(rp->eir, eir_len,
						  EIR_LE_SC_RANDOM,
						  rand, sizeof(rand));
		}

		flags = mgmt_get_adv_discov_flags(hdev);

		if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED))
			flags |= LE_AD_NO_BREDR;

		eir_len = eir_append_data(rp->eir, eir_len, EIR_FLAGS,
					  &flags, sizeof(flags));
		break;
	}

	hci_dev_unlock(hdev);

	hci_sock_set_flag(sk, HCI_MGMT_OOB_DATA_EVENTS);

	status = MGMT_STATUS_SUCCESS;

complete:
	rp->type = cp->type;
	rp->eir_len = cpu_to_le16(eir_len);

	err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_READ_LOCAL_OOB_EXT_DATA,
				status, rp, sizeof(*rp) + eir_len);
	if (err < 0 || status)
		goto done;

	err = mgmt_limited_event(MGMT_EV_LOCAL_OOB_DATA_UPDATED, hdev,
				 rp, sizeof(*rp) + eir_len,
				 HCI_MGMT_OOB_DATA_EVENTS, sk);

done:
	kfree(rp);

	return err;
}

/* Build the bitmask of advertising flags reported as supported for @hdev.
 *
 * A fixed set of flags is always included; TX power and the secondary PHY /
 * offload flags depend on the controller state and capabilities checked
 * below.
 */
static u32 get_supported_adv_flags(struct hci_dev *hdev)
{
	u32 flags = 0;

	flags |= MGMT_ADV_FLAG_CONNECTABLE;
	flags |= MGMT_ADV_FLAG_DISCOV;
	flags |= MGMT_ADV_FLAG_LIMITED_DISCOV;
	flags |= MGMT_ADV_FLAG_MANAGED_FLAGS;
	flags |= MGMT_ADV_FLAG_APPEARANCE;
	flags |= MGMT_ADV_FLAG_LOCAL_NAME;
	flags |= MGMT_ADV_PARAM_DURATION;
	flags |= MGMT_ADV_PARAM_TIMEOUT;
	flags |= MGMT_ADV_PARAM_INTERVALS;
	flags |= MGMT_ADV_PARAM_TX_POWER;
	flags |= MGMT_ADV_PARAM_SCAN_RSP;

	/* In extended
adv TX_POWER returned from Set Adv Param * will be always valid. */ if (hdev->adv_tx_power != HCI_TX_POWER_INVALID || ext_adv_capable(hdev)) flags |= MGMT_ADV_FLAG_TX_POWER; if (ext_adv_capable(hdev)) { flags |= MGMT_ADV_FLAG_SEC_1M; flags |= MGMT_ADV_FLAG_HW_OFFLOAD; flags |= MGMT_ADV_FLAG_CAN_SET_TX_POWER; if (le_2m_capable(hdev)) flags |= MGMT_ADV_FLAG_SEC_2M; if (le_coded_capable(hdev)) flags |= MGMT_ADV_FLAG_SEC_CODED; } return flags; } static int read_adv_features(struct sock *sk, struct hci_dev *hdev, void *data, u16 data_len) { struct mgmt_rp_read_adv_features *rp; size_t rp_len; int err; struct adv_info *adv_instance; u32 supported_flags; u8 *instance; bt_dev_dbg(hdev, "sock %p", sk); if (!lmp_le_capable(hdev)) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_READ_ADV_FEATURES, MGMT_STATUS_REJECTED); hci_dev_lock(hdev); rp_len = sizeof(*rp) + hdev->adv_instance_cnt; rp = kmalloc(rp_len, GFP_ATOMIC); if (!rp) { hci_dev_unlock(hdev); return -ENOMEM; } supported_flags = get_supported_adv_flags(hdev); rp->supported_flags = cpu_to_le32(supported_flags); rp->max_adv_data_len = max_adv_len(hdev); rp->max_scan_rsp_len = max_adv_len(hdev); rp->max_instances = hdev->le_num_of_adv_sets; rp->num_instances = hdev->adv_instance_cnt; instance = rp->instance; list_for_each_entry(adv_instance, &hdev->adv_instances, list) { /* Only instances 1-le_num_of_adv_sets are externally visible */ if (adv_instance->instance <= hdev->adv_instance_cnt) { *instance = adv_instance->instance; instance++; } else { rp->num_instances--; rp_len--; } } hci_dev_unlock(hdev); err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_READ_ADV_FEATURES, MGMT_STATUS_SUCCESS, rp, rp_len); kfree(rp); return err; } static u8 calculate_name_len(struct hci_dev *hdev) { u8 buf[HCI_MAX_SHORT_NAME_LENGTH + 2]; /* len + type + name */ return eir_append_local_name(hdev, buf, 0); } static u8 tlv_data_max_len(struct hci_dev *hdev, u32 adv_flags, bool is_adv_data) { u8 max_len = max_adv_len(hdev); if (is_adv_data) { if 
(adv_flags & (MGMT_ADV_FLAG_DISCOV | MGMT_ADV_FLAG_LIMITED_DISCOV | MGMT_ADV_FLAG_MANAGED_FLAGS)) max_len -= 3; if (adv_flags & MGMT_ADV_FLAG_TX_POWER) max_len -= 3; } else { if (adv_flags & MGMT_ADV_FLAG_LOCAL_NAME) max_len -= calculate_name_len(hdev); if (adv_flags & (MGMT_ADV_FLAG_APPEARANCE)) max_len -= 4; } return max_len; } static bool flags_managed(u32 adv_flags) { return adv_flags & (MGMT_ADV_FLAG_DISCOV | MGMT_ADV_FLAG_LIMITED_DISCOV | MGMT_ADV_FLAG_MANAGED_FLAGS); } static bool tx_power_managed(u32 adv_flags) { return adv_flags & MGMT_ADV_FLAG_TX_POWER; } static bool name_managed(u32 adv_flags) { return adv_flags & MGMT_ADV_FLAG_LOCAL_NAME; } static bool appearance_managed(u32 adv_flags) { return adv_flags & MGMT_ADV_FLAG_APPEARANCE; } static bool tlv_data_is_valid(struct hci_dev *hdev, u32 adv_flags, u8 *data, u8 len, bool is_adv_data) { int i, cur_len; u8 max_len; max_len = tlv_data_max_len(hdev, adv_flags, is_adv_data); if (len > max_len) return false; /* Make sure that the data is correctly formatted. */ for (i = 0; i < len; i += (cur_len + 1)) { cur_len = data[i]; if (!cur_len) continue; if (data[i + 1] == EIR_FLAGS && (!is_adv_data || flags_managed(adv_flags))) return false; if (data[i + 1] == EIR_TX_POWER && tx_power_managed(adv_flags)) return false; if (data[i + 1] == EIR_NAME_COMPLETE && name_managed(adv_flags)) return false; if (data[i + 1] == EIR_NAME_SHORT && name_managed(adv_flags)) return false; if (data[i + 1] == EIR_APPEARANCE && appearance_managed(adv_flags)) return false; /* If the current field length would exceed the total data * length, then it's invalid. */ if (i + cur_len >= len) return false; } return true; } static bool requested_adv_flags_are_valid(struct hci_dev *hdev, u32 adv_flags) { u32 supported_flags, phy_flags; /* The current implementation only supports a subset of the specified * flags. Also need to check mutual exclusiveness of sec flags. 
*/
	supported_flags = get_supported_adv_flags(hdev);
	phy_flags = adv_flags & MGMT_ADV_FLAG_SEC_MASK;
	/* phy_flags & -phy_flags isolates the lowest set bit; the XOR is
	 * non-zero only when a second bit is set as well.
	 */
	if (adv_flags & ~supported_flags ||
	    ((phy_flags && (phy_flags ^ (phy_flags & -phy_flags)))))
		return false;

	return true;
}

/* Advertising changes are refused while a MGMT_OP_SET_LE command is
 * pending for @hdev.
 */
static bool adv_busy(struct hci_dev *hdev)
{
	return pending_find(MGMT_OP_SET_LE, hdev);
}

/* Settle every advertising instance still marked pending.
 *
 * On success (@err == 0) the pending mark is cleared. On failure each
 * pending instance is removed and MGMT_EV advertising-removed is signalled
 * through mgmt_advertising_removed(), cancelling the advertising timeout
 * first if it is the current instance.
 *
 * NOTE(review): the @instance parameter is never read; the loop-local
 * variable of the same name shadows it.
 */
static void add_adv_complete(struct hci_dev *hdev, struct sock *sk,
			     u8 instance, int err)
{
	struct adv_info *adv, *n;

	bt_dev_dbg(hdev, "err %d", err);

	hci_dev_lock(hdev);

	/* _safe variant: entries may be removed while iterating */
	list_for_each_entry_safe(adv, n, &hdev->adv_instances, list) {
		u8 instance;

		if (!adv->pending)
			continue;

		if (!err) {
			adv->pending = false;
			continue;
		}

		instance = adv->instance;

		if (hdev->cur_adv_instance == instance)
			cancel_adv_timeout(hdev);

		hci_remove_adv_instance(hdev, instance);
		mgmt_advertising_removed(sk, hdev, instance);
	}

	hci_dev_unlock(hdev);
}

/* Completion callback for add_advertising_sync(): answer the pending
 * MGMT_OP_ADD_ADVERTISING command, settle pending instances and free the
 * command.
 */
static void add_advertising_complete(struct hci_dev *hdev, void *data, int err)
{
	struct mgmt_pending_cmd *cmd = data;
	struct mgmt_cp_add_advertising *cp = cmd->param;
	struct mgmt_rp_add_advertising rp;

	memset(&rp, 0, sizeof(rp));

	rp.instance = cp->instance;

	if (err)
		mgmt_cmd_status(cmd->sk, cmd->index, cmd->opcode,
				mgmt_status(err));
	else
		mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode,
				  mgmt_status(err), &rp, sizeof(rp));

	add_adv_complete(hdev, cmd->sk, cp->instance, err);

	mgmt_pending_free(cmd);
}

/* Work function queued by add_advertising(): schedule the instance stored
 * in the pending command's parameters.
 */
static int add_advertising_sync(struct hci_dev *hdev, void *data)
{
	struct mgmt_pending_cmd *cmd = data;
	struct mgmt_cp_add_advertising *cp = cmd->param;

	return hci_schedule_adv_instance_sync(hdev, cp->instance, true);
}

/* Handle MGMT_OP_ADD_ADVERTISING.
 *
 * Validates the request (LE support, instance range, exact payload length,
 * flags, TLV contents), registers or updates the advertising instance and,
 * when HCI traffic is required, queues add_advertising_sync().
 */
static int add_advertising(struct sock *sk, struct hci_dev *hdev,
			   void *data, u16 data_len)
{
	struct mgmt_cp_add_advertising *cp = data;
	struct mgmt_rp_add_advertising rp;
	u32 flags;
	u8 status;
	u16 timeout, duration;
	unsigned int prev_instance_cnt;
	u8 schedule_instance = 0;
	struct adv_info *adv, *next_instance;
	int err;
	struct mgmt_pending_cmd *cmd;

	bt_dev_dbg(hdev, "sock %p", sk);

	status = mgmt_le_support(hdev);
	if (status)
return mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING,
				       status);

	if (cp->instance < 1 || cp->instance > hdev->le_num_of_adv_sets)
		return mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING,
				       MGMT_STATUS_INVALID_PARAMS);

	/* The payload must be exactly header + adv data + scan rsp data */
	if (data_len != sizeof(*cp) + cp->adv_data_len + cp->scan_rsp_len)
		return mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING,
				       MGMT_STATUS_INVALID_PARAMS);

	flags = __le32_to_cpu(cp->flags);
	timeout = __le16_to_cpu(cp->timeout);
	duration = __le16_to_cpu(cp->duration);

	if (!requested_adv_flags_are_valid(hdev, flags))
		return mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING,
				       MGMT_STATUS_INVALID_PARAMS);

	hci_dev_lock(hdev);

	/* A timeout is only accepted while the controller is powered */
	if (timeout && !hdev_is_powered(hdev)) {
		err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING,
				      MGMT_STATUS_REJECTED);
		goto unlock;
	}

	if (adv_busy(hdev)) {
		err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING,
				      MGMT_STATUS_BUSY);
		goto unlock;
	}

	/* cp->data holds the advertising data followed by the scan response
	 * data.
	 */
	if (!tlv_data_is_valid(hdev, flags, cp->data, cp->adv_data_len, true) ||
	    !tlv_data_is_valid(hdev, flags, cp->data + cp->adv_data_len,
			       cp->scan_rsp_len, false)) {
		err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING,
				      MGMT_STATUS_INVALID_PARAMS);
		goto unlock;
	}

	prev_instance_cnt = hdev->adv_instance_cnt;

	adv = hci_add_adv_instance(hdev, cp->instance, flags,
				   cp->adv_data_len, cp->data,
				   cp->scan_rsp_len,
				   cp->data + cp->adv_data_len,
				   timeout, duration,
				   HCI_ADV_TX_POWER_NO_PREFERENCE,
				   hdev->le_adv_min_interval,
				   hdev->le_adv_max_interval, 0);
	if (IS_ERR(adv)) {
		err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING,
				      MGMT_STATUS_FAILED);
		goto unlock;
	}

	/* Only trigger an advertising added event if a new instance was
	 * actually added.
	 */
	if (hdev->adv_instance_cnt > prev_instance_cnt)
		mgmt_advertising_added(sk, hdev, cp->instance);

	if (hdev->cur_adv_instance == cp->instance) {
		/* If the currently advertised instance is being changed then
		 * cancel the current advertising and schedule the next
		 * instance. If there is only one instance then the overridden
		 * advertising data will be visible right away.
		 */
		cancel_adv_timeout(hdev);

		next_instance = hci_get_next_instance(hdev, cp->instance);
		if (next_instance)
			schedule_instance = next_instance->instance;
	} else if (!hdev->adv_instance_timeout) {
		/* Immediately advertise the new instance if no other
		 * instance is currently being advertised.
		 */
		schedule_instance = cp->instance;
	}

	/* If the HCI_ADVERTISING flag is set or the device isn't powered or
	 * there is no instance to be advertised then we have no HCI
	 * communication to make. Simply return.
	 */
	if (!hdev_is_powered(hdev) ||
	    hci_dev_test_flag(hdev, HCI_ADVERTISING) ||
	    !schedule_instance) {
		rp.instance = cp->instance;
		err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_ADD_ADVERTISING,
					MGMT_STATUS_SUCCESS, &rp, sizeof(rp));
		goto unlock;
	}

	/* We're good to go, update advertising data, parameters, and start
	 * advertising.
	 */
	cmd = mgmt_pending_new(sk, MGMT_OP_ADD_ADVERTISING, hdev, data,
			       data_len);
	if (!cmd) {
		err = -ENOMEM;
		goto unlock;
	}

	/* NOTE(review): this writes to the request buffer after the pending
	 * command was created from it; whether cmd->param (read by
	 * add_advertising_sync()) observes the new value depends on
	 * mgmt_pending_new() copying or referencing @data - confirm.
	 */
	cp->instance = schedule_instance;

	err = hci_cmd_sync_queue(hdev, add_advertising_sync, cmd,
				 add_advertising_complete);
	if (err < 0)
		mgmt_pending_free(cmd);

unlock:
	hci_dev_unlock(hdev);

	return err;
}

/* Completion callback for add_ext_adv_params_sync(): answer the pending
 * MGMT_OP_ADD_EXT_ADV_PARAMS command with the instance's TX power and the
 * data space available for it, or drop the instance again on failure.
 *
 * No reply is sent when the instance can no longer be found.
 */
static void add_ext_adv_params_complete(struct hci_dev *hdev, void *data,
					int err)
{
	struct mgmt_pending_cmd *cmd = data;
	struct mgmt_cp_add_ext_adv_params *cp = cmd->param;
	struct mgmt_rp_add_ext_adv_params rp;
	struct adv_info *adv;
	u32 flags;

	BT_DBG("%s", hdev->name);

	hci_dev_lock(hdev);

	adv = hci_find_adv_instance(hdev, cp->instance);
	if (!adv)
		goto unlock;

	rp.instance = cp->instance;
	rp.tx_power = adv->tx_power;

	/* While we're at it, inform userspace of the available space for this
	 * advertisement, given the flags that will be used.
*/
	flags = __le32_to_cpu(cp->flags);
	rp.max_adv_data_len = tlv_data_max_len(hdev, flags, true);
	rp.max_scan_rsp_len = tlv_data_max_len(hdev, flags, false);

	if (err) {
		/* If this advertisement was previously advertising and we
		 * failed to update it, we signal that it has been removed and
		 * delete its structure
		 */
		if (!adv->pending)
			mgmt_advertising_removed(cmd->sk, hdev, cp->instance);

		hci_remove_adv_instance(hdev, cp->instance);

		mgmt_cmd_status(cmd->sk, cmd->index, cmd->opcode,
				mgmt_status(err));
	} else {
		mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode,
				  mgmt_status(err), &rp, sizeof(rp));
	}

unlock:
	mgmt_pending_free(cmd);

	hci_dev_unlock(hdev);
}

/* Work function queued by add_ext_adv_params(): program the extended
 * advertising parameters of the instance named in the pending command.
 */
static int add_ext_adv_params_sync(struct hci_dev *hdev, void *data)
{
	struct mgmt_pending_cmd *cmd = data;
	struct mgmt_cp_add_ext_adv_params *cp = cmd->param;

	return hci_setup_ext_adv_instance_sync(hdev, cp->instance);
}

/* Handle MGMT_OP_ADD_EXT_ADV_PARAMS.
 *
 * Creates an advertising instance without any advertising or scan response
 * data. Parameters not selected by the request flags fall back to the
 * defaults visible below. With extended advertising the reply is sent from
 * add_ext_adv_params_complete(); otherwise it is sent immediately.
 */
static int add_ext_adv_params(struct sock *sk, struct hci_dev *hdev,
			      void *data, u16 data_len)
{
	struct mgmt_cp_add_ext_adv_params *cp = data;
	struct mgmt_rp_add_ext_adv_params rp;
	struct mgmt_pending_cmd *cmd = NULL;
	struct adv_info *adv;
	u32 flags, min_interval, max_interval;
	u16 timeout, duration;
	u8 status;
	s8 tx_power;
	int err;

	BT_DBG("%s", hdev->name);

	status = mgmt_le_support(hdev);
	if (status)
		return mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_EXT_ADV_PARAMS,
				       status);

	if (cp->instance < 1 || cp->instance > hdev->le_num_of_adv_sets)
		return mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_EXT_ADV_PARAMS,
				       MGMT_STATUS_INVALID_PARAMS);

	/* The purpose of breaking add_advertising into two separate MGMT calls
	 * for params and data is to allow more parameters to be added to this
	 * structure in the future. For this reason, we verify that we have the
	 * bare minimum structure we know of when the interface was defined. Any
	 * extra parameters we don't know about will be ignored in this request.
	 */
	if (data_len < MGMT_ADD_EXT_ADV_PARAMS_MIN_SIZE)
		return mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_EXT_ADV_PARAMS,
				       MGMT_STATUS_INVALID_PARAMS);

	flags = __le32_to_cpu(cp->flags);

	if (!requested_adv_flags_are_valid(hdev, flags))
		return mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_EXT_ADV_PARAMS,
				       MGMT_STATUS_INVALID_PARAMS);

	hci_dev_lock(hdev);

	/* In new interface, we require that we are powered to register */
	if (!hdev_is_powered(hdev)) {
		err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_EXT_ADV_PARAMS,
				      MGMT_STATUS_REJECTED);
		goto unlock;
	}

	if (adv_busy(hdev)) {
		err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_EXT_ADV_PARAMS,
				      MGMT_STATUS_BUSY);
		goto unlock;
	}

	/* Parse defined parameters from request, use defaults otherwise */
	timeout = (flags & MGMT_ADV_PARAM_TIMEOUT) ?
		  __le16_to_cpu(cp->timeout) : 0;

	duration = (flags & MGMT_ADV_PARAM_DURATION) ?
		   __le16_to_cpu(cp->duration) :
		   hdev->def_multi_adv_rotation_duration;

	min_interval = (flags & MGMT_ADV_PARAM_INTERVALS) ?
		       __le32_to_cpu(cp->min_interval) :
		       hdev->le_adv_min_interval;

	max_interval = (flags & MGMT_ADV_PARAM_INTERVALS) ?
		       __le32_to_cpu(cp->max_interval) :
		       hdev->le_adv_max_interval;

	tx_power = (flags & MGMT_ADV_PARAM_TX_POWER) ?
		   cp->tx_power :
		   HCI_ADV_TX_POWER_NO_PREFERENCE;

	/* Create advertising instance with no advertising or response data */
	adv = hci_add_adv_instance(hdev, cp->instance, flags, 0, NULL, 0, NULL,
				   timeout, duration, tx_power, min_interval,
				   max_interval, 0);

	if (IS_ERR(adv)) {
		err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_EXT_ADV_PARAMS,
				      MGMT_STATUS_FAILED);
		goto unlock;
	}

	/* Submit request for advertising params if ext adv available */
	if (ext_adv_capable(hdev)) {
		cmd = mgmt_pending_new(sk, MGMT_OP_ADD_EXT_ADV_PARAMS, hdev,
				       data, data_len);
		if (!cmd) {
			err = -ENOMEM;
			hci_remove_adv_instance(hdev, cp->instance);
			goto unlock;
		}

		err = hci_cmd_sync_queue(hdev, add_ext_adv_params_sync, cmd,
					 add_ext_adv_params_complete);
		if (err < 0)
			mgmt_pending_free(cmd);
	} else {
		/* No HCI traffic needed: answer right away */
		rp.instance = cp->instance;
		rp.tx_power = HCI_ADV_TX_POWER_NO_PREFERENCE;
		rp.max_adv_data_len = tlv_data_max_len(hdev, flags, true);
		rp.max_scan_rsp_len = tlv_data_max_len(hdev, flags, false);
		err = mgmt_cmd_complete(sk, hdev->id,
					MGMT_OP_ADD_EXT_ADV_PARAMS,
					MGMT_STATUS_SUCCESS, &rp, sizeof(rp));
	}

unlock:
	hci_dev_unlock(hdev);

	return err;
}

/* Completion callback for add_ext_adv_data_sync(): settle pending
 * instances, answer the pending MGMT_OP_ADD_EXT_ADV_DATA command and free
 * it.
 */
static void add_ext_adv_data_complete(struct hci_dev *hdev, void *data, int err)
{
	struct mgmt_pending_cmd *cmd = data;
	struct mgmt_cp_add_ext_adv_data *cp = cmd->param;
	struct mgmt_rp_add_advertising rp;

	add_adv_complete(hdev, cmd->sk, cp->instance, err);

	memset(&rp, 0, sizeof(rp));

	rp.instance = cp->instance;

	if (err)
		mgmt_cmd_status(cmd->sk, cmd->index, cmd->opcode,
				mgmt_status(err));
	else
		mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode,
				  mgmt_status(err), &rp, sizeof(rp));

	mgmt_pending_free(cmd);
}

/* Work function queued by add_ext_adv_data(). With extended advertising
 * the advertising data, scan response data and enable steps are issued in
 * that order, stopping at the first error; otherwise the instance is
 * scheduled for software rotation.
 */
static int add_ext_adv_data_sync(struct hci_dev *hdev, void *data)
{
	struct mgmt_pending_cmd *cmd = data;
	struct mgmt_cp_add_ext_adv_data *cp = cmd->param;
	int err;

	if (ext_adv_capable(hdev)) {
		err = hci_update_adv_data_sync(hdev, cp->instance);
		if (err)
			return err;

		err = hci_update_scan_rsp_data_sync(hdev, cp->instance);
		if (err)
			return err;

		return
hci_enable_ext_advertising_sync(hdev, cp->instance); } return hci_schedule_adv_instance_sync(hdev, cp->instance, true); } static int add_ext_adv_data(struct sock *sk, struct hci_dev *hdev, void *data, u16 data_len) { struct mgmt_cp_add_ext_adv_data *cp = data; struct mgmt_rp_add_ext_adv_data rp; u8 schedule_instance = 0; struct adv_info *next_instance; struct adv_info *adv_instance; int err = 0; struct mgmt_pending_cmd *cmd; BT_DBG("%s", hdev->name); hci_dev_lock(hdev); adv_instance = hci_find_adv_instance(hdev, cp->instance); if (!adv_instance) { err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_EXT_ADV_DATA, MGMT_STATUS_INVALID_PARAMS); goto unlock; } /* In new interface, we require that we are powered to register */ if (!hdev_is_powered(hdev)) { err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_EXT_ADV_DATA, MGMT_STATUS_REJECTED); goto clear_new_instance; } if (adv_busy(hdev)) { err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_EXT_ADV_DATA, MGMT_STATUS_BUSY); goto clear_new_instance; } /* Validate new data */ if (!tlv_data_is_valid(hdev, adv_instance->flags, cp->data, cp->adv_data_len, true) || !tlv_data_is_valid(hdev, adv_instance->flags, cp->data + cp->adv_data_len, cp->scan_rsp_len, false)) { err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_EXT_ADV_DATA, MGMT_STATUS_INVALID_PARAMS); goto clear_new_instance; } /* Set the data in the advertising instance */ hci_set_adv_instance_data(hdev, cp->instance, cp->adv_data_len, cp->data, cp->scan_rsp_len, cp->data + cp->adv_data_len); /* If using software rotation, determine next instance to use */ if (hdev->cur_adv_instance == cp->instance) { /* If the currently advertised instance is being changed * then cancel the current advertising and schedule the * next instance. 
If there is only one instance then the * overridden advertising data will be visible right * away */ cancel_adv_timeout(hdev); next_instance = hci_get_next_instance(hdev, cp->instance); if (next_instance) schedule_instance = next_instance->instance; } else if (!hdev->adv_instance_timeout) { /* Immediately advertise the new instance if no other * instance is currently being advertised. */ schedule_instance = cp->instance; } /* If the HCI_ADVERTISING flag is set or there is no instance to * be advertised then we have no HCI communication to make. * Simply return. */ if (hci_dev_test_flag(hdev, HCI_ADVERTISING) || !schedule_instance) { if (adv_instance->pending) { mgmt_advertising_added(sk, hdev, cp->instance); adv_instance->pending = false; } rp.instance = cp->instance; err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_ADD_EXT_ADV_DATA, MGMT_STATUS_SUCCESS, &rp, sizeof(rp)); goto unlock; } cmd = mgmt_pending_new(sk, MGMT_OP_ADD_EXT_ADV_DATA, hdev, data, data_len); if (!cmd) { err = -ENOMEM; goto clear_new_instance; } err = hci_cmd_sync_queue(hdev, add_ext_adv_data_sync, cmd, add_ext_adv_data_complete); if (err < 0) { mgmt_pending_free(cmd); goto clear_new_instance; } /* We were successful in updating data, so trigger advertising_added * event if this is an instance that wasn't previously advertising. 
If * a failure occurs in the requests we initiated, we will remove the * instance again in add_advertising_complete */ if (adv_instance->pending) mgmt_advertising_added(sk, hdev, cp->instance); goto unlock; clear_new_instance: hci_remove_adv_instance(hdev, cp->instance); unlock: hci_dev_unlock(hdev); return err; } static void remove_advertising_complete(struct hci_dev *hdev, void *data, int err) { struct mgmt_pending_cmd *cmd = data; struct mgmt_cp_remove_advertising *cp = cmd->param; struct mgmt_rp_remove_advertising rp; bt_dev_dbg(hdev, "err %d", err); memset(&rp, 0, sizeof(rp)); rp.instance = cp->instance; if (err) mgmt_cmd_status(cmd->sk, cmd->index, cmd->opcode, mgmt_status(err)); else mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode, MGMT_STATUS_SUCCESS, &rp, sizeof(rp)); mgmt_pending_free(cmd); } static int remove_advertising_sync(struct hci_dev *hdev, void *data) { struct mgmt_pending_cmd *cmd = data; struct mgmt_cp_remove_advertising *cp = cmd->param; int err; err = hci_remove_advertising_sync(hdev, cmd->sk, cp->instance, true); if (err) return err; if (list_empty(&hdev->adv_instances)) err = hci_disable_advertising_sync(hdev); return err; } static int remove_advertising(struct sock *sk, struct hci_dev *hdev, void *data, u16 data_len) { struct mgmt_cp_remove_advertising *cp = data; struct mgmt_pending_cmd *cmd; int err; bt_dev_dbg(hdev, "sock %p", sk); hci_dev_lock(hdev); if (cp->instance && !hci_find_adv_instance(hdev, cp->instance)) { err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_REMOVE_ADVERTISING, MGMT_STATUS_INVALID_PARAMS); goto unlock; } if (pending_find(MGMT_OP_SET_LE, hdev)) { err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_REMOVE_ADVERTISING, MGMT_STATUS_BUSY); goto unlock; } if (list_empty(&hdev->adv_instances)) { err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_REMOVE_ADVERTISING, MGMT_STATUS_INVALID_PARAMS); goto unlock; } cmd = mgmt_pending_new(sk, MGMT_OP_REMOVE_ADVERTISING, hdev, data, data_len); if (!cmd) { err = -ENOMEM; goto unlock; } err = 
hci_cmd_sync_queue(hdev, remove_advertising_sync, cmd,
				 remove_advertising_complete);
	if (err < 0)
		mgmt_pending_free(cmd);

unlock:
	hci_dev_unlock(hdev);

	return err;
}

/* Handle MGMT_OP_GET_ADV_SIZE_INFO.
 *
 * Reports how much advertising and scan response data a caller may supply
 * for the given instance number and flags. Replies with REJECTED when the
 * controller is not LE capable and INVALID_PARAMS for an out-of-range
 * instance or unsupported flags.
 */
static int get_adv_size_info(struct sock *sk, struct hci_dev *hdev,
			     void *data, u16 data_len)
{
	struct mgmt_cp_get_adv_size_info *cp = data;
	struct mgmt_rp_get_adv_size_info rp;
	u32 flags, supported_flags;

	bt_dev_dbg(hdev, "sock %p", sk);

	if (!lmp_le_capable(hdev))
		return mgmt_cmd_status(sk, hdev->id, MGMT_OP_GET_ADV_SIZE_INFO,
				       MGMT_STATUS_REJECTED);

	if (cp->instance < 1 || cp->instance > hdev->le_num_of_adv_sets)
		return mgmt_cmd_status(sk, hdev->id, MGMT_OP_GET_ADV_SIZE_INFO,
				       MGMT_STATUS_INVALID_PARAMS);

	flags = __le32_to_cpu(cp->flags);

	/* The current implementation only supports a subset of the specified
	 * flags.
	 */
	supported_flags = get_supported_adv_flags(hdev);
	if (flags & ~supported_flags)
		return mgmt_cmd_status(sk, hdev->id, MGMT_OP_GET_ADV_SIZE_INFO,
				       MGMT_STATUS_INVALID_PARAMS);

	rp.instance = cp->instance;
	/* flags are echoed back in the wire format they arrived in */
	rp.flags = cp->flags;
	rp.max_adv_data_len = tlv_data_max_len(hdev, flags, true);
	rp.max_scan_rsp_len = tlv_data_max_len(hdev, flags, false);

	return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_GET_ADV_SIZE_INFO,
				 MGMT_STATUS_SUCCESS, &rp, sizeof(rp));
}

/* Command handler table. Each entry lists the handler function, a size
 * constant for the command parameters and optional HCI_MGMT_* flags.
 *
 * NOTE(review): the "0x0000 (no command)" placeholder suggests the array is
 * indexed by opcode, which would make the entry order significant - confirm
 * against the dispatcher before reordering or inserting entries.
 */
static const struct hci_mgmt_handler mgmt_handlers[] = {
	{ NULL }, /* 0x0000 (no command) */
	{ read_version,            MGMT_READ_VERSION_SIZE,
						HCI_MGMT_NO_HDEV |
						HCI_MGMT_UNTRUSTED },
	{ read_commands,           MGMT_READ_COMMANDS_SIZE,
						HCI_MGMT_NO_HDEV |
						HCI_MGMT_UNTRUSTED },
	{ read_index_list,         MGMT_READ_INDEX_LIST_SIZE,
						HCI_MGMT_NO_HDEV |
						HCI_MGMT_UNTRUSTED },
	{ read_controller_info,    MGMT_READ_INFO_SIZE,
						HCI_MGMT_UNTRUSTED },
	{ set_powered,             MGMT_SETTING_SIZE },
	{ set_discoverable,        MGMT_SET_DISCOVERABLE_SIZE },
	{ set_connectable,         MGMT_SETTING_SIZE },
	{ set_fast_connectable,    MGMT_SETTING_SIZE },
	{ set_bondable,            MGMT_SETTING_SIZE },
	{ set_link_security,       MGMT_SETTING_SIZE },
	{ set_ssp,                 MGMT_SETTING_SIZE },
	{ set_hs,                  MGMT_SETTING_SIZE },
	{ set_le,                  MGMT_SETTING_SIZE },
	{
set_dev_class, MGMT_SET_DEV_CLASS_SIZE }, { set_local_name, MGMT_SET_LOCAL_NAME_SIZE }, { add_uuid, MGMT_ADD_UUID_SIZE }, { remove_uuid, MGMT_REMOVE_UUID_SIZE }, { load_link_keys, MGMT_LOAD_LINK_KEYS_SIZE, HCI_MGMT_VAR_LEN }, { load_long_term_keys, MGMT_LOAD_LONG_TERM_KEYS_SIZE, HCI_MGMT_VAR_LEN }, { disconnect, MGMT_DISCONNECT_SIZE }, { get_connections, MGMT_GET_CONNECTIONS_SIZE }, { pin_code_reply, MGMT_PIN_CODE_REPLY_SIZE }, { pin_code_neg_reply, MGMT_PIN_CODE_NEG_REPLY_SIZE }, { set_io_capability, MGMT_SET_IO_CAPABILITY_SIZE }, { pair_device, MGMT_PAIR_DEVICE_SIZE }, { cancel_pair_device, MGMT_CANCEL_PAIR_DEVICE_SIZE }, { unpair_device, MGMT_UNPAIR_DEVICE_SIZE }, { user_confirm_reply, MGMT_USER_CONFIRM_REPLY_SIZE }, { user_confirm_neg_reply, MGMT_USER_CONFIRM_NEG_REPLY_SIZE }, { user_passkey_reply, MGMT_USER_PASSKEY_REPLY_SIZE }, { user_passkey_neg_reply, MGMT_USER_PASSKEY_NEG_REPLY_SIZE }, { read_local_oob_data, MGMT_READ_LOCAL_OOB_DATA_SIZE }, { add_remote_oob_data, MGMT_ADD_REMOTE_OOB_DATA_SIZE, HCI_MGMT_VAR_LEN }, { remove_remote_oob_data, MGMT_REMOVE_REMOTE_OOB_DATA_SIZE }, { start_discovery, MGMT_START_DISCOVERY_SIZE }, { stop_discovery, MGMT_STOP_DISCOVERY_SIZE }, { confirm_name, MGMT_CONFIRM_NAME_SIZE }, { block_device, MGMT_BLOCK_DEVICE_SIZE }, { unblock_device, MGMT_UNBLOCK_DEVICE_SIZE }, { set_device_id, MGMT_SET_DEVICE_ID_SIZE }, { set_advertising, MGMT_SETTING_SIZE }, { set_bredr, MGMT_SETTING_SIZE }, { set_static_address, MGMT_SET_STATIC_ADDRESS_SIZE }, { set_scan_params, MGMT_SET_SCAN_PARAMS_SIZE }, { set_secure_conn, MGMT_SETTING_SIZE }, { set_debug_keys, MGMT_SETTING_SIZE }, { set_privacy, MGMT_SET_PRIVACY_SIZE }, { load_irks, MGMT_LOAD_IRKS_SIZE, HCI_MGMT_VAR_LEN }, { get_conn_info, MGMT_GET_CONN_INFO_SIZE }, { get_clock_info, MGMT_GET_CLOCK_INFO_SIZE }, { add_device, MGMT_ADD_DEVICE_SIZE }, { remove_device, MGMT_REMOVE_DEVICE_SIZE }, { load_conn_param, MGMT_LOAD_CONN_PARAM_SIZE, HCI_MGMT_VAR_LEN }, { read_unconf_index_list, 
MGMT_READ_UNCONF_INDEX_LIST_SIZE, HCI_MGMT_NO_HDEV | HCI_MGMT_UNTRUSTED }, { read_config_info, MGMT_READ_CONFIG_INFO_SIZE, HCI_MGMT_UNCONFIGURED | HCI_MGMT_UNTRUSTED }, { set_external_config, MGMT_SET_EXTERNAL_CONFIG_SIZE, HCI_MGMT_UNCONFIGURED }, { set_public_address, MGMT_SET_PUBLIC_ADDRESS_SIZE, HCI_MGMT_UNCONFIGURED }, { start_service_discovery, MGMT_START_SERVICE_DISCOVERY_SIZE, HCI_MGMT_VAR_LEN }, { read_local_oob_ext_data, MGMT_READ_LOCAL_OOB_EXT_DATA_SIZE }, { read_ext_index_list, MGMT_READ_EXT_INDEX_LIST_SIZE, HCI_MGMT_NO_HDEV | HCI_MGMT_UNTRUSTED }, { read_adv_features, MGMT_READ_ADV_FEATURES_SIZE }, { add_advertising, MGMT_ADD_ADVERTISING_SIZE, HCI_MGMT_VAR_LEN }, { remove_advertising, MGMT_REMOVE_ADVERTISING_SIZE }, { get_adv_size_info, MGMT_GET_ADV_SIZE_INFO_SIZE }, { start_limited_discovery, MGMT_START_DISCOVERY_SIZE }, { read_ext_controller_info,MGMT_READ_EXT_INFO_SIZE, HCI_MGMT_UNTRUSTED }, { set_appearance, MGMT_SET_APPEARANCE_SIZE }, { get_phy_configuration, MGMT_GET_PHY_CONFIGURATION_SIZE }, { set_phy_configuration, MGMT_SET_PHY_CONFIGURATION_SIZE }, { set_blocked_keys, MGMT_OP_SET_BLOCKED_KEYS_SIZE, HCI_MGMT_VAR_LEN }, { set_wideband_speech, MGMT_SETTING_SIZE }, { read_controller_cap, MGMT_READ_CONTROLLER_CAP_SIZE, HCI_MGMT_UNTRUSTED }, { read_exp_features_info, MGMT_READ_EXP_FEATURES_INFO_SIZE, HCI_MGMT_UNTRUSTED | HCI_MGMT_HDEV_OPTIONAL }, { set_exp_feature, MGMT_SET_EXP_FEATURE_SIZE, HCI_MGMT_VAR_LEN | HCI_MGMT_HDEV_OPTIONAL }, { read_def_system_config, MGMT_READ_DEF_SYSTEM_CONFIG_SIZE, HCI_MGMT_UNTRUSTED }, { set_def_system_config, MGMT_SET_DEF_SYSTEM_CONFIG_SIZE, HCI_MGMT_VAR_LEN }, { read_def_runtime_config, MGMT_READ_DEF_RUNTIME_CONFIG_SIZE, HCI_MGMT_UNTRUSTED }, { set_def_runtime_config, MGMT_SET_DEF_RUNTIME_CONFIG_SIZE, HCI_MGMT_VAR_LEN }, { get_device_flags, MGMT_GET_DEVICE_FLAGS_SIZE }, { set_device_flags, MGMT_SET_DEVICE_FLAGS_SIZE }, { read_adv_mon_features, MGMT_READ_ADV_MONITOR_FEATURES_SIZE }, { 
add_adv_patterns_monitor,MGMT_ADD_ADV_PATTERNS_MONITOR_SIZE,
						HCI_MGMT_VAR_LEN },
	{ remove_adv_monitor,      MGMT_REMOVE_ADV_MONITOR_SIZE },
	{ add_ext_adv_params,      MGMT_ADD_EXT_ADV_PARAMS_MIN_SIZE,
						HCI_MGMT_VAR_LEN },
	{ add_ext_adv_data,        MGMT_ADD_EXT_ADV_DATA_SIZE,
						HCI_MGMT_VAR_LEN },
	{ add_adv_patterns_monitor_rssi,
				   MGMT_ADD_ADV_PATTERNS_MONITOR_RSSI_SIZE,
						HCI_MGMT_VAR_LEN },
	{ set_mesh,                MGMT_SET_MESH_RECEIVER_SIZE,
						HCI_MGMT_VAR_LEN },
	{ mesh_features,           MGMT_MESH_READ_FEATURES_SIZE },
	{ mesh_send,               MGMT_MESH_SEND_SIZE,
						HCI_MGMT_VAR_LEN },
	{ mesh_send_cancel,        MGMT_MESH_SEND_CANCEL_SIZE },
	{ mgmt_hci_cmd_sync,       MGMT_HCI_CMD_SYNC_SIZE, HCI_MGMT_VAR_LEN },
};

/* Announce a new controller index to management sockets.
 *
 * Nothing is sent for HCI_QUIRK_RAW_DEVICE controllers. Otherwise the
 * (unconfigured) index-added event matching HCI_UNCONFIGURED is sent,
 * followed by the extended index-added event carrying the same type
 * (0x01 unconfigured, 0x00 configured) and the bus.
 */
void mgmt_index_added(struct hci_dev *hdev)
{
	struct mgmt_ev_ext_index ev;

	if (test_bit(HCI_QUIRK_RAW_DEVICE, &hdev->quirks))
		return;

	if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) {
		mgmt_index_event(MGMT_EV_UNCONF_INDEX_ADDED, hdev, NULL, 0,
				 HCI_MGMT_UNCONF_INDEX_EVENTS);
		ev.type = 0x01;
	} else {
		mgmt_index_event(MGMT_EV_INDEX_ADDED, hdev, NULL, 0,
				 HCI_MGMT_INDEX_EVENTS);
		ev.type = 0x00;
	}

	ev.bus = hdev->bus;

	mgmt_index_event(MGMT_EV_EXT_INDEX_ADDED, hdev, &ev, sizeof(ev),
			 HCI_MGMT_EXT_INDEX_EVENTS);
}

/* Announce removal of a controller index to management sockets.
 *
 * Nothing is done for HCI_QUIRK_RAW_DEVICE controllers. All pending
 * commands are first answered with MGMT_STATUS_INVALID_INDEX, then the
 * index-removed events mirroring mgmt_index_added() are sent. When
 * HCI_MGMT is set the delayed works listed at the end are cancelled
 * synchronously.
 */
void mgmt_index_removed(struct hci_dev *hdev)
{
	struct mgmt_ev_ext_index ev;
	struct cmd_lookup match = { NULL, hdev, MGMT_STATUS_INVALID_INDEX };

	if (test_bit(HCI_QUIRK_RAW_DEVICE, &hdev->quirks))
		return;

	mgmt_pending_foreach(0, hdev, cmd_complete_rsp, &match);

	if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) {
		mgmt_index_event(MGMT_EV_UNCONF_INDEX_REMOVED, hdev, NULL, 0,
				 HCI_MGMT_UNCONF_INDEX_EVENTS);
		ev.type = 0x01;
	} else {
		mgmt_index_event(MGMT_EV_INDEX_REMOVED, hdev, NULL, 0,
				 HCI_MGMT_INDEX_EVENTS);
		ev.type = 0x00;
	}

	ev.bus = hdev->bus;

	mgmt_index_event(MGMT_EV_EXT_INDEX_REMOVED, hdev, &ev, sizeof(ev),
			 HCI_MGMT_EXT_INDEX_EVENTS);

	/* Cancel any remaining timed work */
	if (!hci_dev_test_flag(hdev, HCI_MGMT))
		return;
	cancel_delayed_work_sync(&hdev->discov_off);
	cancel_delayed_work_sync(&hdev->service_cache);
	cancel_delayed_work_sync(&hdev->rpa_expired);
}

/* Called once a power-on attempt has finished with result @err.
 *
 * On success LE actions are restarted and passive scanning updated. In
 * every case pending MGMT_OP_SET_POWERED commands are answered through
 * settings_rsp() and new settings are emitted.
 */
void mgmt_power_on(struct hci_dev *hdev, int err)
{
	struct cmd_lookup match = { NULL, hdev };

	bt_dev_dbg(hdev, "err %d", err);

	hci_dev_lock(hdev);

	if (!err) {
		restart_le_actions(hdev);
		hci_update_passive_scan(hdev);
	}

	mgmt_pending_foreach(MGMT_OP_SET_POWERED, hdev, settings_rsp, &match);

	new_settings(hdev, match.sk);

	/* Drop the socket reference left in match.sk, if any */
	if (match.sk)
		sock_put(match.sk);

	hci_dev_unlock(hdev);
}

/* Power-off notification.
 *
 * NOTE(review): the function does not take hci_dev_lock() itself; the
 * leading underscores suggest the caller is expected to hold it - confirm.
 */
void __mgmt_power_off(struct hci_dev *hdev)
{
	struct cmd_lookup match = { NULL, hdev };
	u8 zero_cod[] = { 0, 0, 0 };

	mgmt_pending_foreach(MGMT_OP_SET_POWERED, hdev, settings_rsp, &match);

	/* If the power off is because of hdev unregistration let
	 * use the appropriate INVALID_INDEX status. Otherwise use
	 * NOT_POWERED. We cover both scenarios here since later in
	 * mgmt_index_removed() any hci_conn callbacks will have already
	 * been triggered, potentially causing misleading DISCONNECTED
	 * status responses.
	 */
	if (hci_dev_test_flag(hdev, HCI_UNREGISTER))
		match.mgmt_status = MGMT_STATUS_INVALID_INDEX;
	else
		match.mgmt_status = MGMT_STATUS_NOT_POWERED;

	mgmt_pending_foreach(0, hdev, cmd_complete_rsp, &match);

	/* Report an all-zero class of device if it was non-zero before */
	if (memcmp(hdev->dev_class, zero_cod, sizeof(zero_cod)) != 0) {
		mgmt_limited_event(MGMT_EV_CLASS_OF_DEV_CHANGED, hdev,
				   zero_cod, sizeof(zero_cod),
				   HCI_MGMT_DEV_CLASS_EVENTS, NULL);
		ext_info_changed(hdev, NULL);
	}

	new_settings(hdev, match.sk);

	if (match.sk)
		sock_put(match.sk);
}

/* Answer a pending MGMT_OP_SET_POWERED command after a failed power-on:
 * -ERFKILL maps to MGMT_STATUS_RFKILLED, everything else to
 * MGMT_STATUS_FAILED. Does nothing when no such command is pending.
 */
void mgmt_set_powered_failed(struct hci_dev *hdev, int err)
{
	struct mgmt_pending_cmd *cmd;
	u8 status;

	cmd = pending_find(MGMT_OP_SET_POWERED, hdev);
	if (!cmd)
		return;

	if (err == -ERFKILL)
		status = MGMT_STATUS_RFKILLED;
	else
		status = MGMT_STATUS_FAILED;

	mgmt_cmd_status(cmd->sk, hdev->id, MGMT_OP_SET_POWERED, status);

	mgmt_pending_remove(cmd);
}

/* Emit MGMT_EV_NEW_LINK_KEY for a BR/EDR link key; @persistent is passed
 * on as the store hint.
 */
void mgmt_new_link_key(struct hci_dev *hdev, struct link_key *key,
		       bool persistent)
{
	struct mgmt_ev_new_link_key ev;

	memset(&ev, 0, sizeof(ev));

	ev.store_hint = persistent;
	bacpy(&ev.key.addr.bdaddr, &key->bdaddr);
	ev.key.addr.type =
BDADDR_BREDR; ev.key.type = key->type; memcpy(ev.key.val, key->val, HCI_LINK_KEY_SIZE); ev.key.pin_len = key->pin_len; mgmt_event(MGMT_EV_NEW_LINK_KEY, hdev, &ev, sizeof(ev), NULL); } static u8 mgmt_ltk_type(struct smp_ltk *ltk) { switch (ltk->type) { case SMP_LTK: case SMP_LTK_RESPONDER: if (ltk->authenticated) return MGMT_LTK_AUTHENTICATED; return MGMT_LTK_UNAUTHENTICATED; case SMP_LTK_P256: if (ltk->authenticated) return MGMT_LTK_P256_AUTH; return MGMT_LTK_P256_UNAUTH; case SMP_LTK_P256_DEBUG: return MGMT_LTK_P256_DEBUG; } return MGMT_LTK_UNAUTHENTICATED; } void mgmt_new_ltk(struct hci_dev *hdev, struct smp_ltk *key, bool persistent) { struct mgmt_ev_new_long_term_key ev; memset(&ev, 0, sizeof(ev)); /* Devices using resolvable or non-resolvable random addresses * without providing an identity resolving key don't require * to store long term keys. Their addresses will change the * next time around. * * Only when a remote device provides an identity address * make sure the long term key is stored. If the remote * identity is known, the long term keys are internally * mapped to the identity address. So allow static random * and public addresses here. */ if (key->bdaddr_type == ADDR_LE_DEV_RANDOM && (key->bdaddr.b[5] & 0xc0) != 0xc0) ev.store_hint = 0x00; else ev.store_hint = persistent; bacpy(&ev.key.addr.bdaddr, &key->bdaddr); ev.key.addr.type = link_to_bdaddr(LE_LINK, key->bdaddr_type); ev.key.type = mgmt_ltk_type(key); ev.key.enc_size = key->enc_size; ev.key.ediv = key->ediv; ev.key.rand = key->rand; if (key->type == SMP_LTK) ev.key.initiator = 1; /* Make sure we copy only the significant bytes based on the * encryption key size, and set the rest of the value to zeroes. 
*/ memcpy(ev.key.val, key->val, key->enc_size); memset(ev.key.val + key->enc_size, 0, sizeof(ev.key.val) - key->enc_size); mgmt_event(MGMT_EV_NEW_LONG_TERM_KEY, hdev, &ev, sizeof(ev), NULL); } void mgmt_new_irk(struct hci_dev *hdev, struct smp_irk *irk, bool persistent) { struct mgmt_ev_new_irk ev; memset(&ev, 0, sizeof(ev)); ev.store_hint = persistent; bacpy(&ev.rpa, &irk->rpa); bacpy(&ev.irk.addr.bdaddr, &irk->bdaddr); ev.irk.addr.type = link_to_bdaddr(LE_LINK, irk->addr_type); memcpy(ev.irk.val, irk->val, sizeof(irk->val)); mgmt_event(MGMT_EV_NEW_IRK, hdev, &ev, sizeof(ev), NULL); } void mgmt_new_csrk(struct hci_dev *hdev, struct smp_csrk *csrk, bool persistent) { struct mgmt_ev_new_csrk ev; memset(&ev, 0, sizeof(ev)); /* Devices using resolvable or non-resolvable random addresses * without providing an identity resolving key don't require * to store signature resolving keys. Their addresses will change * the next time around. * * Only when a remote device provides an identity address * make sure the signature resolving key is stored. So allow * static random and public addresses here. 
*/ if (csrk->bdaddr_type == ADDR_LE_DEV_RANDOM && (csrk->bdaddr.b[5] & 0xc0) != 0xc0) ev.store_hint = 0x00; else ev.store_hint = persistent; bacpy(&ev.key.addr.bdaddr, &csrk->bdaddr); ev.key.addr.type = link_to_bdaddr(LE_LINK, csrk->bdaddr_type); ev.key.type = csrk->type; memcpy(ev.key.val, csrk->val, sizeof(csrk->val)); mgmt_event(MGMT_EV_NEW_CSRK, hdev, &ev, sizeof(ev), NULL); } void mgmt_new_conn_param(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 bdaddr_type, u8 store_hint, u16 min_interval, u16 max_interval, u16 latency, u16 timeout) { struct mgmt_ev_new_conn_param ev; if (!hci_is_identity_address(bdaddr, bdaddr_type)) return; memset(&ev, 0, sizeof(ev)); bacpy(&ev.addr.bdaddr, bdaddr); ev.addr.type = link_to_bdaddr(LE_LINK, bdaddr_type); ev.store_hint = store_hint; ev.min_interval = cpu_to_le16(min_interval); ev.max_interval = cpu_to_le16(max_interval); ev.latency = cpu_to_le16(latency); ev.timeout = cpu_to_le16(timeout); mgmt_event(MGMT_EV_NEW_CONN_PARAM, hdev, &ev, sizeof(ev), NULL); } void mgmt_device_connected(struct hci_dev *hdev, struct hci_conn *conn, u8 *name, u8 name_len) { struct sk_buff *skb; struct mgmt_ev_device_connected *ev; u16 eir_len = 0; u32 flags = 0; if (test_and_set_bit(HCI_CONN_MGMT_CONNECTED, &conn->flags)) return; /* allocate buff for LE or BR/EDR adv */ if (conn->le_adv_data_len > 0) skb = mgmt_alloc_skb(hdev, MGMT_EV_DEVICE_CONNECTED, sizeof(*ev) + conn->le_adv_data_len); else skb = mgmt_alloc_skb(hdev, MGMT_EV_DEVICE_CONNECTED, sizeof(*ev) + (name ? eir_precalc_len(name_len) : 0) + eir_precalc_len(sizeof(conn->dev_class))); if (!skb) return; ev = skb_put(skb, sizeof(*ev)); bacpy(&ev->addr.bdaddr, &conn->dst); ev->addr.type = link_to_bdaddr(conn->type, conn->dst_type); if (conn->out) flags |= MGMT_DEV_FOUND_INITIATED_CONN; ev->flags = __cpu_to_le32(flags); /* We must ensure that the EIR Data fields are ordered and * unique. Keep it simple for now and avoid the problem by not * adding any BR/EDR data to the LE adv. 
*/ if (conn->le_adv_data_len > 0) { skb_put_data(skb, conn->le_adv_data, conn->le_adv_data_len); eir_len = conn->le_adv_data_len; } else { if (name) eir_len += eir_skb_put_data(skb, EIR_NAME_COMPLETE, name, name_len); if (memcmp(conn->dev_class, "\0\0\0", sizeof(conn->dev_class))) eir_len += eir_skb_put_data(skb, EIR_CLASS_OF_DEV, conn->dev_class, sizeof(conn->dev_class)); } ev->eir_len = cpu_to_le16(eir_len); mgmt_event_skb(skb, NULL); } static void unpair_device_rsp(struct mgmt_pending_cmd *cmd, void *data) { struct hci_dev *hdev = data; struct mgmt_cp_unpair_device *cp = cmd->param; device_unpaired(hdev, &cp->addr.bdaddr, cp->addr.type, cmd->sk); cmd->cmd_complete(cmd, 0); mgmt_pending_remove(cmd); } bool mgmt_powering_down(struct hci_dev *hdev) { struct mgmt_pending_cmd *cmd; struct mgmt_mode *cp; if (hci_dev_test_flag(hdev, HCI_POWERING_DOWN)) return true; cmd = pending_find(MGMT_OP_SET_POWERED, hdev); if (!cmd) return false; cp = cmd->param; if (!cp->val) return true; return false; } void mgmt_device_disconnected(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, u8 addr_type, u8 reason, bool mgmt_connected) { struct mgmt_ev_device_disconnected ev; struct sock *sk = NULL; if (!mgmt_connected) return; if (link_type != ACL_LINK && link_type != LE_LINK) return; bacpy(&ev.addr.bdaddr, bdaddr); ev.addr.type = link_to_bdaddr(link_type, addr_type); ev.reason = reason; /* Report disconnects due to suspend */ if (hdev->suspended) ev.reason = MGMT_DEV_DISCONN_LOCAL_HOST_SUSPEND; mgmt_event(MGMT_EV_DEVICE_DISCONNECTED, hdev, &ev, sizeof(ev), sk); if (sk) sock_put(sk); } void mgmt_disconnect_failed(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, u8 addr_type, u8 status) { u8 bdaddr_type = link_to_bdaddr(link_type, addr_type); struct mgmt_cp_disconnect *cp; struct mgmt_pending_cmd *cmd; mgmt_pending_foreach(MGMT_OP_UNPAIR_DEVICE, hdev, unpair_device_rsp, hdev); cmd = pending_find(MGMT_OP_DISCONNECT, hdev); if (!cmd) return; cp = cmd->param; if (bacmp(bdaddr, 
&cp->addr.bdaddr)) return; if (cp->addr.type != bdaddr_type) return; cmd->cmd_complete(cmd, mgmt_status(status)); mgmt_pending_remove(cmd); } void mgmt_connect_failed(struct hci_dev *hdev, struct hci_conn *conn, u8 status) { struct mgmt_ev_connect_failed ev; if (test_and_clear_bit(HCI_CONN_MGMT_CONNECTED, &conn->flags)) { mgmt_device_disconnected(hdev, &conn->dst, conn->type, conn->dst_type, status, true); return; } bacpy(&ev.addr.bdaddr, &conn->dst); ev.addr.type = link_to_bdaddr(conn->type, conn->dst_type); ev.status = mgmt_status(status); mgmt_event(MGMT_EV_CONNECT_FAILED, hdev, &ev, sizeof(ev), NULL); } void mgmt_pin_code_request(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 secure) { struct mgmt_ev_pin_code_request ev; bacpy(&ev.addr.bdaddr, bdaddr); ev.addr.type = BDADDR_BREDR; ev.secure = secure; mgmt_event(MGMT_EV_PIN_CODE_REQUEST, hdev, &ev, sizeof(ev), NULL); } void mgmt_pin_code_reply_complete(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 status) { struct mgmt_pending_cmd *cmd; cmd = pending_find(MGMT_OP_PIN_CODE_REPLY, hdev); if (!cmd) return; cmd->cmd_complete(cmd, mgmt_status(status)); mgmt_pending_remove(cmd); } void mgmt_pin_code_neg_reply_complete(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 status) { struct mgmt_pending_cmd *cmd; cmd = pending_find(MGMT_OP_PIN_CODE_NEG_REPLY, hdev); if (!cmd) return; cmd->cmd_complete(cmd, mgmt_status(status)); mgmt_pending_remove(cmd); } int mgmt_user_confirm_request(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, u8 addr_type, u32 value, u8 confirm_hint) { struct mgmt_ev_user_confirm_request ev; bt_dev_dbg(hdev, "bdaddr %pMR", bdaddr); bacpy(&ev.addr.bdaddr, bdaddr); ev.addr.type = link_to_bdaddr(link_type, addr_type); ev.confirm_hint = confirm_hint; ev.value = cpu_to_le32(value); return mgmt_event(MGMT_EV_USER_CONFIRM_REQUEST, hdev, &ev, sizeof(ev), NULL); } int mgmt_user_passkey_request(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, u8 addr_type) { struct mgmt_ev_user_passkey_request ev; bt_dev_dbg(hdev, 
"bdaddr %pMR", bdaddr); bacpy(&ev.addr.bdaddr, bdaddr); ev.addr.type = link_to_bdaddr(link_type, addr_type); return mgmt_event(MGMT_EV_USER_PASSKEY_REQUEST, hdev, &ev, sizeof(ev), NULL); } static int user_pairing_resp_complete(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, u8 addr_type, u8 status, u8 opcode) { struct mgmt_pending_cmd *cmd; cmd = pending_find(opcode, hdev); if (!cmd) return -ENOENT; cmd->cmd_complete(cmd, mgmt_status(status)); mgmt_pending_remove(cmd); return 0; } int mgmt_user_confirm_reply_complete(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, u8 addr_type, u8 status) { return user_pairing_resp_complete(hdev, bdaddr, link_type, addr_type, status, MGMT_OP_USER_CONFIRM_REPLY); } int mgmt_user_confirm_neg_reply_complete(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, u8 addr_type, u8 status) { return user_pairing_resp_complete(hdev, bdaddr, link_type, addr_type, status, MGMT_OP_USER_CONFIRM_NEG_REPLY); } int mgmt_user_passkey_reply_complete(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, u8 addr_type, u8 status) { return user_pairing_resp_complete(hdev, bdaddr, link_type, addr_type, status, MGMT_OP_USER_PASSKEY_REPLY); } int mgmt_user_passkey_neg_reply_complete(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, u8 addr_type, u8 status) { return user_pairing_resp_complete(hdev, bdaddr, link_type, addr_type, status, MGMT_OP_USER_PASSKEY_NEG_REPLY); } int mgmt_user_passkey_notify(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, u8 addr_type, u32 passkey, u8 entered) { struct mgmt_ev_passkey_notify ev; bt_dev_dbg(hdev, "bdaddr %pMR", bdaddr); bacpy(&ev.addr.bdaddr, bdaddr); ev.addr.type = link_to_bdaddr(link_type, addr_type); ev.passkey = __cpu_to_le32(passkey); ev.entered = entered; return mgmt_event(MGMT_EV_PASSKEY_NOTIFY, hdev, &ev, sizeof(ev), NULL); } void mgmt_auth_failed(struct hci_conn *conn, u8 hci_status) { struct mgmt_ev_auth_failed ev; struct mgmt_pending_cmd *cmd; u8 status = mgmt_status(hci_status); 
bacpy(&ev.addr.bdaddr, &conn->dst); ev.addr.type = link_to_bdaddr(conn->type, conn->dst_type); ev.status = status; cmd = find_pairing(conn); mgmt_event(MGMT_EV_AUTH_FAILED, conn->hdev, &ev, sizeof(ev), cmd ? cmd->sk : NULL); if (cmd) { cmd->cmd_complete(cmd, status); mgmt_pending_remove(cmd); } } void mgmt_auth_enable_complete(struct hci_dev *hdev, u8 status) { struct cmd_lookup match = { NULL, hdev }; bool changed; if (status) { u8 mgmt_err = mgmt_status(status); mgmt_pending_foreach(MGMT_OP_SET_LINK_SECURITY, hdev, cmd_status_rsp, &mgmt_err); return; } if (test_bit(HCI_AUTH, &hdev->flags)) changed = !hci_dev_test_and_set_flag(hdev, HCI_LINK_SECURITY); else changed = hci_dev_test_and_clear_flag(hdev, HCI_LINK_SECURITY); mgmt_pending_foreach(MGMT_OP_SET_LINK_SECURITY, hdev, settings_rsp, &match); if (changed) new_settings(hdev, match.sk); if (match.sk) sock_put(match.sk); } static void sk_lookup(struct mgmt_pending_cmd *cmd, void *data) { struct cmd_lookup *match = data; if (match->sk == NULL) { match->sk = cmd->sk; sock_hold(match->sk); } } void mgmt_set_class_of_dev_complete(struct hci_dev *hdev, u8 *dev_class, u8 status) { struct cmd_lookup match = { NULL, hdev, mgmt_status(status) }; mgmt_pending_foreach(MGMT_OP_SET_DEV_CLASS, hdev, sk_lookup, &match); mgmt_pending_foreach(MGMT_OP_ADD_UUID, hdev, sk_lookup, &match); mgmt_pending_foreach(MGMT_OP_REMOVE_UUID, hdev, sk_lookup, &match); if (!status) { mgmt_limited_event(MGMT_EV_CLASS_OF_DEV_CHANGED, hdev, dev_class, 3, HCI_MGMT_DEV_CLASS_EVENTS, NULL); ext_info_changed(hdev, NULL); } if (match.sk) sock_put(match.sk); } void mgmt_set_local_name_complete(struct hci_dev *hdev, u8 *name, u8 status) { struct mgmt_cp_set_local_name ev; struct mgmt_pending_cmd *cmd; if (status) return; memset(&ev, 0, sizeof(ev)); memcpy(ev.name, name, HCI_MAX_NAME_LENGTH); memcpy(ev.short_name, hdev->short_name, HCI_MAX_SHORT_NAME_LENGTH); cmd = pending_find(MGMT_OP_SET_LOCAL_NAME, hdev); if (!cmd) { memcpy(hdev->dev_name, name, 
sizeof(hdev->dev_name)); /* If this is a HCI command related to powering on the * HCI dev don't send any mgmt signals. */ if (hci_dev_test_flag(hdev, HCI_POWERING_DOWN)) return; if (pending_find(MGMT_OP_SET_POWERED, hdev)) return; } mgmt_limited_event(MGMT_EV_LOCAL_NAME_CHANGED, hdev, &ev, sizeof(ev), HCI_MGMT_LOCAL_NAME_EVENTS, cmd ? cmd->sk : NULL); ext_info_changed(hdev, cmd ? cmd->sk : NULL); } static inline bool has_uuid(u8 *uuid, u16 uuid_count, u8 (*uuids)[16]) { int i; for (i = 0; i < uuid_count; i++) { if (!memcmp(uuid, uuids[i], 16)) return true; } return false; } static bool eir_has_uuids(u8 *eir, u16 eir_len, u16 uuid_count, u8 (*uuids)[16]) { u16 parsed = 0; while (parsed < eir_len) { u8 field_len = eir[0]; u8 uuid[16]; int i; if (field_len == 0) break; if (eir_len - parsed < field_len + 1) break; switch (eir[1]) { case EIR_UUID16_ALL: case EIR_UUID16_SOME: for (i = 0; i + 3 <= field_len; i += 2) { memcpy(uuid, bluetooth_base_uuid, 16); uuid[13] = eir[i + 3]; uuid[12] = eir[i + 2]; if (has_uuid(uuid, uuid_count, uuids)) return true; } break; case EIR_UUID32_ALL: case EIR_UUID32_SOME: for (i = 0; i + 5 <= field_len; i += 4) { memcpy(uuid, bluetooth_base_uuid, 16); uuid[15] = eir[i + 5]; uuid[14] = eir[i + 4]; uuid[13] = eir[i + 3]; uuid[12] = eir[i + 2]; if (has_uuid(uuid, uuid_count, uuids)) return true; } break; case EIR_UUID128_ALL: case EIR_UUID128_SOME: for (i = 0; i + 17 <= field_len; i += 16) { memcpy(uuid, eir + i + 2, 16); if (has_uuid(uuid, uuid_count, uuids)) return true; } break; } parsed += field_len + 1; eir += field_len + 1; } return false; } static bool is_filter_match(struct hci_dev *hdev, s8 rssi, u8 *eir, u16 eir_len, u8 *scan_rsp, u8 scan_rsp_len) { /* If a RSSI threshold has been specified, and * HCI_QUIRK_STRICT_DUPLICATE_FILTER is not set, then all results with * a RSSI smaller than the RSSI threshold will be dropped. If the quirk * is set, let it through for further processing, as we might need to * restart the scan. 
* * For BR/EDR devices (pre 1.2) providing no RSSI during inquiry, * the results are also dropped. */ if (hdev->discovery.rssi != HCI_RSSI_INVALID && (rssi == HCI_RSSI_INVALID || (rssi < hdev->discovery.rssi && !test_bit(HCI_QUIRK_STRICT_DUPLICATE_FILTER, &hdev->quirks)))) return false; if (hdev->discovery.uuid_count != 0) { /* If a list of UUIDs is provided in filter, results with no * matching UUID should be dropped. */ if (!eir_has_uuids(eir, eir_len, hdev->discovery.uuid_count, hdev->discovery.uuids) && !eir_has_uuids(scan_rsp, scan_rsp_len, hdev->discovery.uuid_count, hdev->discovery.uuids)) return false; } /* If duplicate filtering does not report RSSI changes, then restart * scanning to ensure updated result with updated RSSI values. */ if (test_bit(HCI_QUIRK_STRICT_DUPLICATE_FILTER, &hdev->quirks)) { /* Validate RSSI value against the RSSI threshold once more. */ if (hdev->discovery.rssi != HCI_RSSI_INVALID && rssi < hdev->discovery.rssi) return false; } return true; } void mgmt_adv_monitor_device_lost(struct hci_dev *hdev, u16 handle, bdaddr_t *bdaddr, u8 addr_type) { struct mgmt_ev_adv_monitor_device_lost ev; ev.monitor_handle = cpu_to_le16(handle); bacpy(&ev.addr.bdaddr, bdaddr); ev.addr.type = addr_type; mgmt_event(MGMT_EV_ADV_MONITOR_DEVICE_LOST, hdev, &ev, sizeof(ev), NULL); } static void mgmt_send_adv_monitor_device_found(struct hci_dev *hdev, struct sk_buff *skb, struct sock *skip_sk, u16 handle) { struct sk_buff *advmon_skb; size_t advmon_skb_len; __le16 *monitor_handle; if (!skb) return; advmon_skb_len = (sizeof(struct mgmt_ev_adv_monitor_device_found) - sizeof(struct mgmt_ev_device_found)) + skb->len; advmon_skb = mgmt_alloc_skb(hdev, MGMT_EV_ADV_MONITOR_DEVICE_FOUND, advmon_skb_len); if (!advmon_skb) return; /* ADV_MONITOR_DEVICE_FOUND is similar to DEVICE_FOUND event except * that it also has 'monitor_handle'. Make a copy of DEVICE_FOUND and * store monitor_handle of the matched monitor. 
*/ monitor_handle = skb_put(advmon_skb, sizeof(*monitor_handle)); *monitor_handle = cpu_to_le16(handle); skb_put_data(advmon_skb, skb->data, skb->len); mgmt_event_skb(advmon_skb, skip_sk); } static void mgmt_adv_monitor_device_found(struct hci_dev *hdev, bdaddr_t *bdaddr, bool report_device, struct sk_buff *skb, struct sock *skip_sk) { struct monitored_device *dev, *tmp; bool matched = false; bool notified = false; /* We have received the Advertisement Report because: * 1. the kernel has initiated active discovery * 2. if not, we have pend_le_reports > 0 in which case we are doing * passive scanning * 3. if none of the above is true, we have one or more active * Advertisement Monitor * * For case 1 and 2, report all advertisements via MGMT_EV_DEVICE_FOUND * and report ONLY one advertisement per device for the matched Monitor * via MGMT_EV_ADV_MONITOR_DEVICE_FOUND event. * * For case 3, since we are not active scanning and all advertisements * received are due to a matched Advertisement Monitor, report all * advertisements ONLY via MGMT_EV_ADV_MONITOR_DEVICE_FOUND event. */ if (report_device && !hdev->advmon_pend_notify) { mgmt_event_skb(skb, skip_sk); return; } hdev->advmon_pend_notify = false; list_for_each_entry_safe(dev, tmp, &hdev->monitored_devices, list) { if (!bacmp(&dev->bdaddr, bdaddr)) { matched = true; if (!dev->notified) { mgmt_send_adv_monitor_device_found(hdev, skb, skip_sk, dev->handle); notified = true; dev->notified = true; } } if (!dev->notified) hdev->advmon_pend_notify = true; } if (!report_device && ((matched && !notified) || !msft_monitor_supported(hdev))) { /* Handle 0 indicates that we are not active scanning and this * is a subsequent advertisement report for an already matched * Advertisement Monitor or the controller offloading support * is not available. 
*/ mgmt_send_adv_monitor_device_found(hdev, skb, skip_sk, 0); } if (report_device) mgmt_event_skb(skb, skip_sk); else kfree_skb(skb); } static void mesh_device_found(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 addr_type, s8 rssi, u32 flags, u8 *eir, u16 eir_len, u8 *scan_rsp, u8 scan_rsp_len, u64 instant) { struct sk_buff *skb; struct mgmt_ev_mesh_device_found *ev; int i, j; if (!hdev->mesh_ad_types[0]) goto accepted; /* Scan for requested AD types */ if (eir_len > 0) { for (i = 0; i + 1 < eir_len; i += eir[i] + 1) { for (j = 0; j < sizeof(hdev->mesh_ad_types); j++) { if (!hdev->mesh_ad_types[j]) break; if (hdev->mesh_ad_types[j] == eir[i + 1]) goto accepted; } } } if (scan_rsp_len > 0) { for (i = 0; i + 1 < scan_rsp_len; i += scan_rsp[i] + 1) { for (j = 0; j < sizeof(hdev->mesh_ad_types); j++) { if (!hdev->mesh_ad_types[j]) break; if (hdev->mesh_ad_types[j] == scan_rsp[i + 1]) goto accepted; } } } return; accepted: skb = mgmt_alloc_skb(hdev, MGMT_EV_MESH_DEVICE_FOUND, sizeof(*ev) + eir_len + scan_rsp_len); if (!skb) return; ev = skb_put(skb, sizeof(*ev)); bacpy(&ev->addr.bdaddr, bdaddr); ev->addr.type = link_to_bdaddr(LE_LINK, addr_type); ev->rssi = rssi; ev->flags = cpu_to_le32(flags); ev->instant = cpu_to_le64(instant); if (eir_len > 0) /* Copy EIR or advertising data into event */ skb_put_data(skb, eir, eir_len); if (scan_rsp_len > 0) /* Append scan response data to event */ skb_put_data(skb, scan_rsp, scan_rsp_len); ev->eir_len = cpu_to_le16(eir_len + scan_rsp_len); mgmt_event_skb(skb, NULL); } void mgmt_device_found(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, u8 addr_type, u8 *dev_class, s8 rssi, u32 flags, u8 *eir, u16 eir_len, u8 *scan_rsp, u8 scan_rsp_len, u64 instant) { struct sk_buff *skb; struct mgmt_ev_device_found *ev; bool report_device = hci_discovery_active(hdev); if (hci_dev_test_flag(hdev, HCI_MESH) && link_type == LE_LINK) mesh_device_found(hdev, bdaddr, addr_type, rssi, flags, eir, eir_len, scan_rsp, scan_rsp_len, instant); /* Don't 
send events for a non-kernel initiated discovery. With * LE one exception is if we have pend_le_reports > 0 in which * case we're doing passive scanning and want these events. */ if (!hci_discovery_active(hdev)) { if (link_type == ACL_LINK) return; if (link_type == LE_LINK && !list_empty(&hdev->pend_le_reports)) report_device = true; else if (!hci_is_adv_monitoring(hdev)) return; } if (hdev->discovery.result_filtering) { /* We are using service discovery */ if (!is_filter_match(hdev, rssi, eir, eir_len, scan_rsp, scan_rsp_len)) return; } if (hdev->discovery.limited) { /* Check for limited discoverable bit */ if (dev_class) { if (!(dev_class[1] & 0x20)) return; } else { u8 *flags = eir_get_data(eir, eir_len, EIR_FLAGS, NULL); if (!flags || !(flags[0] & LE_AD_LIMITED)) return; } } /* Allocate skb. The 5 extra bytes are for the potential CoD field */ skb = mgmt_alloc_skb(hdev, MGMT_EV_DEVICE_FOUND, sizeof(*ev) + eir_len + scan_rsp_len + 5); if (!skb) return; ev = skb_put(skb, sizeof(*ev)); /* In case of device discovery with BR/EDR devices (pre 1.2), the * RSSI value was reported as 0 when not available. This behavior * is kept when using device discovery. This is required for full * backwards compatibility with the API. * * However when using service discovery, the value 127 will be * returned when the RSSI is not available. 
*/ if (rssi == HCI_RSSI_INVALID && !hdev->discovery.report_invalid_rssi && link_type == ACL_LINK) rssi = 0; bacpy(&ev->addr.bdaddr, bdaddr); ev->addr.type = link_to_bdaddr(link_type, addr_type); ev->rssi = rssi; ev->flags = cpu_to_le32(flags); if (eir_len > 0) /* Copy EIR or advertising data into event */ skb_put_data(skb, eir, eir_len); if (dev_class && !eir_get_data(eir, eir_len, EIR_CLASS_OF_DEV, NULL)) { u8 eir_cod[5]; eir_len += eir_append_data(eir_cod, 0, EIR_CLASS_OF_DEV, dev_class, 3); skb_put_data(skb, eir_cod, sizeof(eir_cod)); } if (scan_rsp_len > 0) /* Append scan response data to event */ skb_put_data(skb, scan_rsp, scan_rsp_len); ev->eir_len = cpu_to_le16(eir_len + scan_rsp_len); mgmt_adv_monitor_device_found(hdev, bdaddr, report_device, skb, NULL); } void mgmt_remote_name(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, u8 addr_type, s8 rssi, u8 *name, u8 name_len) { struct sk_buff *skb; struct mgmt_ev_device_found *ev; u16 eir_len = 0; u32 flags = 0; skb = mgmt_alloc_skb(hdev, MGMT_EV_DEVICE_FOUND, sizeof(*ev) + (name ? 
eir_precalc_len(name_len) : 0)); if (!skb) return; ev = skb_put(skb, sizeof(*ev)); bacpy(&ev->addr.bdaddr, bdaddr); ev->addr.type = link_to_bdaddr(link_type, addr_type); ev->rssi = rssi; if (name) eir_len += eir_skb_put_data(skb, EIR_NAME_COMPLETE, name, name_len); else flags = MGMT_DEV_FOUND_NAME_REQUEST_FAILED; ev->eir_len = cpu_to_le16(eir_len); ev->flags = cpu_to_le32(flags); mgmt_event_skb(skb, NULL); } void mgmt_discovering(struct hci_dev *hdev, u8 discovering) { struct mgmt_ev_discovering ev; bt_dev_dbg(hdev, "discovering %u", discovering); memset(&ev, 0, sizeof(ev)); ev.type = hdev->discovery.type; ev.discovering = discovering; mgmt_event(MGMT_EV_DISCOVERING, hdev, &ev, sizeof(ev), NULL); } void mgmt_suspending(struct hci_dev *hdev, u8 state) { struct mgmt_ev_controller_suspend ev; ev.suspend_state = state; mgmt_event(MGMT_EV_CONTROLLER_SUSPEND, hdev, &ev, sizeof(ev), NULL); } void mgmt_resuming(struct hci_dev *hdev, u8 reason, bdaddr_t *bdaddr, u8 addr_type) { struct mgmt_ev_controller_resume ev; ev.wake_reason = reason; if (bdaddr) { bacpy(&ev.addr.bdaddr, bdaddr); ev.addr.type = addr_type; } else { memset(&ev.addr, 0, sizeof(ev.addr)); } mgmt_event(MGMT_EV_CONTROLLER_RESUME, hdev, &ev, sizeof(ev), NULL); } static struct hci_mgmt_chan chan = { .channel = HCI_CHANNEL_CONTROL, .handler_count = ARRAY_SIZE(mgmt_handlers), .handlers = mgmt_handlers, .hdev_init = mgmt_init_hdev, }; int mgmt_init(void) { return hci_mgmt_chan_register(&chan); } void mgmt_exit(void) { hci_mgmt_chan_unregister(&chan); } void mgmt_cleanup(struct sock *sk) { struct mgmt_mesh_tx *mesh_tx; struct hci_dev *hdev; read_lock(&hci_dev_list_lock); list_for_each_entry(hdev, &hci_dev_list, list) { do { mesh_tx = mgmt_mesh_next(hdev, sk); if (mesh_tx) mesh_send_complete(hdev, mesh_tx, true); } while (mesh_tx); } read_unlock(&hci_dev_list_lock); } |
4069 4066 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2007-2012 Nicira, Inc. */ #include <linux/netdevice.h> #include <net/genetlink.h> #include <net/netns/generic.h> #include "datapath.h" #include "vport-internal_dev.h" #include "vport-netdev.h" static void dp_detach_port_notify(struct vport *vport) { struct sk_buff *notify; struct datapath *dp; dp = vport->dp; notify = ovs_vport_cmd_build_info(vport, ovs_dp_get_net(dp), 0, 0, OVS_VPORT_CMD_DEL); ovs_dp_detach_port(vport); if (IS_ERR(notify)) { genl_set_err(&dp_vport_genl_family, ovs_dp_get_net(dp), 0, 0, PTR_ERR(notify)); return; } genlmsg_multicast_netns(&dp_vport_genl_family, ovs_dp_get_net(dp), notify, 0, 0, GFP_KERNEL); } void ovs_dp_notify_wq(struct work_struct *work) { struct ovs_net *ovs_net = container_of(work, struct ovs_net, dp_notify_work); struct datapath *dp; ovs_lock(); list_for_each_entry(dp, &ovs_net->dps, list_node) { int i; for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) { struct vport *vport; struct hlist_node *n; hlist_for_each_entry_safe(vport, n, &dp->ports[i], dp_hash_node) { if (vport->ops->type == OVS_VPORT_TYPE_INTERNAL) continue; if (!(netif_is_ovs_port(vport->dev))) dp_detach_port_notify(vport); } } } ovs_unlock(); } static int dp_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct ovs_net *ovs_net; struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct vport *vport = NULL; if (!ovs_is_internal_dev(dev)) vport = ovs_netdev_get_vport(dev); if (!vport) return NOTIFY_DONE; if (event == NETDEV_UNREGISTER) { /* upper_dev_unlink and decrement promisc immediately */ ovs_netdev_detach_dev(vport); /* schedule vport destroy, dev_put and genl notification */ ovs_net = net_generic(dev_net(dev), 
ovs_net_id); queue_work(system_wq, &ovs_net->dp_notify_work); } return NOTIFY_DONE; } struct notifier_block ovs_dp_device_notifier = { .notifier_call = dp_device_event }; |
4065 3930 115 4028 292 22 3988 259 5 68 68 49 25 45 36 10 18 5 360 504 399 178 308 175 168 4 4 3 8 7 6 470 308 212 159 30 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 | // SPDX-License-Identifier: GPL-2.0-or-later /* * 
Generic parts * Linux ethernet bridge * * Authors: * Lennert Buytenhek <buytenh@gnu.org> */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/init.h> #include <linux/llc.h> #include <net/llc.h> #include <net/stp.h> #include <net/switchdev.h> #include "br_private.h" /* * Handle changes in state of network devices enslaved to a bridge. * * Note: don't care about up/down if bridge itself is down, because * port state is checked when bridge is brought up. */ static int br_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(ptr); struct netdev_notifier_pre_changeaddr_info *prechaddr_info; struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net_bridge_port *p; struct net_bridge *br; bool notified = false; bool changed_addr; int err; if (netif_is_bridge_master(dev)) { err = br_vlan_bridge_event(dev, event, ptr); if (err) return notifier_from_errno(err); if (event == NETDEV_REGISTER) { /* register of bridge completed, add sysfs entries */ err = br_sysfs_addbr(dev); if (err) return notifier_from_errno(err); return NOTIFY_DONE; } } if (is_vlan_dev(dev)) { struct net_device *real_dev = vlan_dev_real_dev(dev); if (netif_is_bridge_master(real_dev)) br_vlan_vlan_upper_event(real_dev, dev, event); } /* not a port of a bridge */ p = br_port_get_rtnl(dev); if (!p) return NOTIFY_DONE; br = p->br; switch (event) { case NETDEV_CHANGEMTU: br_mtu_auto_adjust(br); break; case NETDEV_PRE_CHANGEADDR: if (br->dev->addr_assign_type == NET_ADDR_SET) break; prechaddr_info = ptr; err = dev_pre_changeaddr_notify(br->dev, prechaddr_info->dev_addr, extack); if (err) return notifier_from_errno(err); break; case NETDEV_CHANGEADDR: spin_lock_bh(&br->lock); br_fdb_changeaddr(p, dev->dev_addr); changed_addr = br_stp_recalculate_bridge_id(br); spin_unlock_bh(&br->lock); if (changed_addr) 
call_netdevice_notifiers(NETDEV_CHANGEADDR, br->dev); break; case NETDEV_CHANGE: br_port_carrier_check(p, ¬ified); break; case NETDEV_FEAT_CHANGE: netdev_update_features(br->dev); break; case NETDEV_DOWN: spin_lock_bh(&br->lock); if (br->dev->flags & IFF_UP) { br_stp_disable_port(p); notified = true; } spin_unlock_bh(&br->lock); break; case NETDEV_UP: if (netif_running(br->dev) && netif_oper_up(dev)) { spin_lock_bh(&br->lock); br_stp_enable_port(p); notified = true; spin_unlock_bh(&br->lock); } break; case NETDEV_UNREGISTER: br_del_if(br, dev); break; case NETDEV_CHANGENAME: err = br_sysfs_renameif(p); if (err) return notifier_from_errno(err); break; case NETDEV_PRE_TYPE_CHANGE: /* Forbid underlying device to change its type. */ return NOTIFY_BAD; case NETDEV_RESEND_IGMP: /* Propagate to master device */ call_netdevice_notifiers(event, br->dev); break; } if (event != NETDEV_UNREGISTER) br_vlan_port_event(p, event); /* Events that may cause spanning tree to refresh */ if (!notified && (event == NETDEV_CHANGEADDR || event == NETDEV_UP || event == NETDEV_CHANGE || event == NETDEV_DOWN)) br_ifinfo_notify(RTM_NEWLINK, NULL, p); return NOTIFY_DONE; } static struct notifier_block br_device_notifier = { .notifier_call = br_device_event }; /* called with RTNL or RCU */ static int br_switchdev_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = switchdev_notifier_info_to_dev(ptr); struct net_bridge_port *p; struct net_bridge *br; struct switchdev_notifier_fdb_info *fdb_info; int err = NOTIFY_DONE; p = br_port_get_rtnl_rcu(dev); if (!p) goto out; br = p->br; switch (event) { case SWITCHDEV_FDB_ADD_TO_BRIDGE: fdb_info = ptr; err = br_fdb_external_learn_add(br, p, fdb_info->addr, fdb_info->vid, fdb_info->locked, false); if (err) { err = notifier_from_errno(err); break; } br_fdb_offloaded_set(br, p, fdb_info->addr, fdb_info->vid, fdb_info->offloaded); break; case SWITCHDEV_FDB_DEL_TO_BRIDGE: fdb_info = ptr; err = 
br_fdb_external_learn_del(br, p, fdb_info->addr, fdb_info->vid, false); if (err) err = notifier_from_errno(err); break; case SWITCHDEV_FDB_OFFLOADED: fdb_info = ptr; br_fdb_offloaded_set(br, p, fdb_info->addr, fdb_info->vid, fdb_info->offloaded); break; case SWITCHDEV_FDB_FLUSH_TO_BRIDGE: fdb_info = ptr; /* Don't delete static entries */ br_fdb_delete_by_port(br, p, fdb_info->vid, 0); break; } out: return err; } static struct notifier_block br_switchdev_notifier = { .notifier_call = br_switchdev_event, }; /* called under rtnl_mutex */ static int br_switchdev_blocking_event(struct notifier_block *nb, unsigned long event, void *ptr) { struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(ptr); struct net_device *dev = switchdev_notifier_info_to_dev(ptr); struct switchdev_notifier_brport_info *brport_info; const struct switchdev_brport *b; struct net_bridge_port *p; int err = NOTIFY_DONE; p = br_port_get_rtnl(dev); if (!p) goto out; switch (event) { case SWITCHDEV_BRPORT_OFFLOADED: brport_info = ptr; b = &brport_info->brport; err = br_switchdev_port_offload(p, b->dev, b->ctx, b->atomic_nb, b->blocking_nb, b->tx_fwd_offload, extack); err = notifier_from_errno(err); break; case SWITCHDEV_BRPORT_UNOFFLOADED: brport_info = ptr; b = &brport_info->brport; br_switchdev_port_unoffload(p, b->ctx, b->atomic_nb, b->blocking_nb); break; case SWITCHDEV_BRPORT_REPLAY: brport_info = ptr; b = &brport_info->brport; err = br_switchdev_port_replay(p, b->dev, b->ctx, b->atomic_nb, b->blocking_nb, extack); err = notifier_from_errno(err); break; } out: return err; } static struct notifier_block br_switchdev_blocking_notifier = { .notifier_call = br_switchdev_blocking_event, }; /* br_boolopt_toggle - change user-controlled boolean option * * @br: bridge device * @opt: id of the option to change * @on: new option value * @extack: extack for error messages * * Changes the value of the respective boolean option to @on taking care of * any internal option value mapping and 
configuration. */ int br_boolopt_toggle(struct net_bridge *br, enum br_boolopt_id opt, bool on, struct netlink_ext_ack *extack) { int err = 0; switch (opt) { case BR_BOOLOPT_NO_LL_LEARN: br_opt_toggle(br, BROPT_NO_LL_LEARN, on); break; case BR_BOOLOPT_MCAST_VLAN_SNOOPING: err = br_multicast_toggle_vlan_snooping(br, on, extack); break; case BR_BOOLOPT_MST_ENABLE: err = br_mst_set_enabled(br, on, extack); break; default: /* shouldn't be called with unsupported options */ WARN_ON(1); break; } return err; } int br_boolopt_get(const struct net_bridge *br, enum br_boolopt_id opt) { switch (opt) { case BR_BOOLOPT_NO_LL_LEARN: return br_opt_get(br, BROPT_NO_LL_LEARN); case BR_BOOLOPT_MCAST_VLAN_SNOOPING: return br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED); case BR_BOOLOPT_MST_ENABLE: return br_opt_get(br, BROPT_MST_ENABLED); default: /* shouldn't be called with unsupported options */ WARN_ON(1); break; } return 0; } int br_boolopt_multi_toggle(struct net_bridge *br, struct br_boolopt_multi *bm, struct netlink_ext_ack *extack) { unsigned long bitmap = bm->optmask; int err = 0; int opt_id; for_each_set_bit(opt_id, &bitmap, BR_BOOLOPT_MAX) { bool on = !!(bm->optval & BIT(opt_id)); err = br_boolopt_toggle(br, opt_id, on, extack); if (err) { br_debug(br, "boolopt multi-toggle error: option: %d current: %d new: %d error: %d\n", opt_id, br_boolopt_get(br, opt_id), on, err); break; } } return err; } void br_boolopt_multi_get(const struct net_bridge *br, struct br_boolopt_multi *bm) { u32 optval = 0; int opt_id; for (opt_id = 0; opt_id < BR_BOOLOPT_MAX; opt_id++) optval |= (br_boolopt_get(br, opt_id) << opt_id); bm->optval = optval; bm->optmask = GENMASK((BR_BOOLOPT_MAX - 1), 0); } /* private bridge options, controlled by the kernel */ void br_opt_toggle(struct net_bridge *br, enum net_bridge_opts opt, bool on) { bool cur = !!br_opt_get(br, opt); br_debug(br, "toggle option: %d state: %d -> %d\n", opt, cur, on); if (cur == on) return; if (on) set_bit(opt, &br->options); else 
clear_bit(opt, &br->options); } static void __net_exit br_net_exit_batch_rtnl(struct list_head *net_list, struct list_head *dev_to_kill) { struct net_device *dev; struct net *net; ASSERT_RTNL(); list_for_each_entry(net, net_list, exit_list) for_each_netdev(net, dev) if (netif_is_bridge_master(dev)) br_dev_delete(dev, dev_to_kill); } static struct pernet_operations br_net_ops = { .exit_batch_rtnl = br_net_exit_batch_rtnl, }; static const struct stp_proto br_stp_proto = { .rcv = br_stp_rcv, }; static int __init br_init(void) { int err; BUILD_BUG_ON(sizeof(struct br_input_skb_cb) > sizeof_field(struct sk_buff, cb)); err = stp_proto_register(&br_stp_proto); if (err < 0) { pr_err("bridge: can't register sap for STP\n"); return err; } err = br_fdb_init(); if (err) goto err_out; err = register_pernet_subsys(&br_net_ops); if (err) goto err_out1; err = br_nf_core_init(); if (err) goto err_out2; err = register_netdevice_notifier(&br_device_notifier); if (err) goto err_out3; err = register_switchdev_notifier(&br_switchdev_notifier); if (err) goto err_out4; err = register_switchdev_blocking_notifier(&br_switchdev_blocking_notifier); if (err) goto err_out5; err = br_netlink_init(); if (err) goto err_out6; brioctl_set(br_ioctl_stub); #if IS_ENABLED(CONFIG_ATM_LANE) br_fdb_test_addr_hook = br_fdb_test_addr; #endif #if IS_MODULE(CONFIG_BRIDGE_NETFILTER) pr_info("bridge: filtering via arp/ip/ip6tables is no longer available " "by default. 
Update your scripts to load br_netfilter if you " "need this.\n"); #endif return 0; err_out6: unregister_switchdev_blocking_notifier(&br_switchdev_blocking_notifier); err_out5: unregister_switchdev_notifier(&br_switchdev_notifier); err_out4: unregister_netdevice_notifier(&br_device_notifier); err_out3: br_nf_core_fini(); err_out2: unregister_pernet_subsys(&br_net_ops); err_out1: br_fdb_fini(); err_out: stp_proto_unregister(&br_stp_proto); return err; } static void __exit br_deinit(void) { stp_proto_unregister(&br_stp_proto); br_netlink_fini(); unregister_switchdev_blocking_notifier(&br_switchdev_blocking_notifier); unregister_switchdev_notifier(&br_switchdev_notifier); unregister_netdevice_notifier(&br_device_notifier); brioctl_set(NULL); unregister_pernet_subsys(&br_net_ops); rcu_barrier(); /* Wait for completion of call_rcu()'s */ br_nf_core_fini(); #if IS_ENABLED(CONFIG_ATM_LANE) br_fdb_test_addr_hook = NULL; #endif br_fdb_fini(); } module_init(br_init) module_exit(br_deinit) MODULE_LICENSE("GPL"); MODULE_VERSION(BR_VERSION); MODULE_ALIAS_RTNL_LINK("bridge"); MODULE_DESCRIPTION("Ethernet bridge driver"); |
178 178 178 178 178 178 108 108 108 108 99 27 79 9 75 4 80 178 178 1 1 107 108 4437 4436 4439 38 167 4431 211 40 13 40 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 
497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 | // SPDX-License-Identifier: GPL-2.0 /* * Workingset detection * * Copyright (C) 2013 Red Hat, Inc., Johannes Weiner */ #include <linux/memcontrol.h> #include <linux/mm_inline.h> #include <linux/writeback.h> #include <linux/shmem_fs.h> #include <linux/pagemap.h> #include <linux/atomic.h> #include <linux/module.h> #include <linux/swap.h> #include <linux/dax.h> #include <linux/fs.h> #include <linux/mm.h> #include "internal.h" /* * Double CLOCK lists * * Per node, two clock lists are maintained for file pages: the * inactive and the active list. Freshly faulted pages start out at * the head of the inactive list and page reclaim scans pages from the * tail. 
Pages that are accessed multiple times on the inactive list
 * are promoted to the active list, to protect them from reclaim,
 * whereas active pages are demoted to the inactive list when the
 * active list grows too big.
 *
 *   fault ------------------------+
 *                                 |
 *              +--------------+   |            +-------------+
 *   reclaim <- |   inactive   | <-+-- demotion |    active   | <--+
 *              +--------------+                +-------------+    |
 *                     |                                           |
 *                     +-------------- promotion ------------------+
 *
 *
 *		Access frequency and refault distance
 *
 * A workload is thrashing when its pages are frequently used but they
 * are evicted from the inactive list every time before another access
 * would have promoted them to the active list.
 *
 * In cases where the average access distance between thrashing pages
 * is bigger than the size of memory there is nothing that can be
 * done - the thrashing set could never fit into memory under any
 * circumstance.
 *
 * However, the average access distance could be bigger than the
 * inactive list, yet smaller than the size of memory.  In this case,
 * the set could fit into memory if it weren't for the currently
 * active pages - which may be used more, hopefully less frequently:
 *
 *      +-memory available to cache-+
 *      |                           |
 *      +-inactive------+-active----+
 *  a b | c d e f g h i | J K L M N |
 *      +---------------+-----------+
 *
 * It is prohibitively expensive to accurately track access frequency
 * of pages.  But a reasonable approximation can be made to measure
 * thrashing on the inactive list, after which refaulting pages can be
 * activated optimistically to compete with the existing active pages.
 *
 * Approximating inactive page access frequency - Observations:
 *
 * 1. When a page is accessed for the first time, it is added to the
 *    head of the inactive list, slides every existing inactive page
 *    towards the tail by one slot, and pushes the current tail page
 *    out of memory.
 *
 * 2. When a page is accessed for the second time, it is promoted to
 *    the active list, shrinking the inactive list by one slot.
This
 *    also slides all inactive pages that were faulted into the cache
 *    more recently than the activated page towards the tail of the
 *    inactive list.
 *
 * Thus:
 *
 * 1. The sum of evictions and activations between any two points in
 *    time indicates the minimum number of inactive pages accessed in
 *    between.
 *
 * 2. Moving one inactive page N page slots towards the tail of the
 *    list requires at least N inactive page accesses.
 *
 * Combining these:
 *
 * 1. When a page is finally evicted from memory, the number of
 *    inactive pages accessed while the page was in cache is at least
 *    the number of page slots on the inactive list.
 *
 * 2. In addition, measuring the sum of evictions and activations (E)
 *    at the time of a page's eviction, and comparing it to another
 *    reading (R) at the time the page faults back into memory tells
 *    the minimum number of accesses while the page was not cached.
 *    This is called the refault distance.
 *
 * Because the first access of the page was the fault and the second
 * access the refault, we combine the in-cache distance with the
 * out-of-cache distance to get the complete minimum access distance
 * of this page:
 *
 *      NR_inactive + (R - E)
 *
 * And knowing the minimum access distance of a page, we can easily
 * tell if the page would be able to stay in cache assuming all page
 * slots in the cache were available:
 *
 *   NR_inactive + (R - E) <= NR_inactive + NR_active
 *
 * If we have swap we should also consider NR_inactive_anon and
 * NR_active_anon, so for page cache and anonymous respectively:
 *
 *   NR_inactive_file + (R - E) <= NR_inactive_file + NR_active_file
 *   + NR_inactive_anon + NR_active_anon
 *
 *   NR_inactive_anon + (R - E) <= NR_inactive_anon + NR_active_anon
 *   + NR_inactive_file + NR_active_file
 *
 * Which can be further simplified to:
 *
 *   (R - E) <= NR_active_file + NR_inactive_anon + NR_active_anon
 *
 *   (R - E) <= NR_active_anon + NR_inactive_file + NR_active_file
 *
 * Put into words, the refault distance (out-of-cache) can be seen as
 * a deficit
in inactive list space (in-cache).  If the inactive list
 * had (R - E) more page slots, the page would not have been evicted
 * in between accesses, but activated instead.  And on a full system,
 * the only thing eating into inactive list space is active pages.
 *
 *
 *		Refaulting inactive pages
 *
 * All that is known about the active list is that the pages have been
 * accessed more than once in the past.  This means that at any given
 * time there is actually a good chance that pages on the active list
 * are no longer in active use.
 *
 * So when a refault distance of (R - E) is observed and there are at
 * least (R - E) pages in the userspace workingset, the refaulting page
 * is activated optimistically in the hope that (R - E) pages are actually
 * used less frequently than the refaulting page - or even not used at
 * all anymore.
 *
 * That means if inactive cache is refaulting with a suitable refault
 * distance, we assume the cache workingset is transitioning and put
 * pressure on the current workingset.
 *
 * If this is wrong and demotion kicks in, the pages which are truly
 * used more frequently will be reactivated while the less frequently
 * used ones will be evicted from memory.
 *
 * But if this is right, the stale pages will be pushed out of memory
 * and the used pages get to stay in cache.
 *
 *		Refaulting active pages
 *
 * If on the other hand the refaulting pages have recently been
 * deactivated, it means that the active list is no longer protecting
 * actively used cache from reclaim. The cache is NOT transitioning to
 * a different workingset; the existing workingset is thrashing in the
 * space allocated to the page cache.
 *
 *
 *		Implementation
 *
 * For each node's LRU lists, a counter for inactive evictions and
 * activations is maintained (node->nonresident_age).
 *
 * On eviction, a snapshot of this counter (along with some bits to
 * identify the node) is stored in the now empty page cache
 * slot of the evicted page.  This is called a shadow entry.
*
 * On cache misses for which there are shadow entries, an eligible
 * refault distance will immediately activate the refaulting page.
 */

/*
 * Shadow entry layout, from the least significant bit upwards (see
 * pack_shadow()): WORKINGSET_SHIFT bits for the workingset flag,
 * NODES_SHIFT bits for the node id, MEM_CGROUP_ID_SHIFT bits for the
 * memcg id, and the remaining bits (EVICTION_MASK) for the eviction
 * timestamp.
 */
#define WORKINGSET_SHIFT 1
#define EVICTION_SHIFT	((BITS_PER_LONG - BITS_PER_XA_VALUE) +	\
			 WORKINGSET_SHIFT + NODES_SHIFT + \
			 MEM_CGROUP_ID_SHIFT)
#define EVICTION_MASK	(~0UL >> EVICTION_SHIFT)

/*
 * Eviction timestamps need to be able to cover the full range of
 * actionable refaults. However, bits are tight in the xarray
 * entry, and after storing the identifier for the lruvec there might
 * not be enough left to represent every single actionable refault. In
 * that case, we have to sacrifice granularity for distance, and group
 * evictions into coarser buckets by shaving off lower timestamp bits.
 */
static unsigned int bucket_order __read_mostly;

/*
 * Build a shadow entry: truncate @eviction to EVICTION_MASK, then append
 * @memcgid, the node id of @pgdat and the @workingset flag below it, and
 * wrap the result as an xarray value entry.
 */
static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction,
			 bool workingset)
{
	eviction &= EVICTION_MASK;
	eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
	eviction = (eviction << NODES_SHIFT) | pgdat->node_id;
	eviction = (eviction << WORKINGSET_SHIFT) | workingset;

	return xa_mk_value(eviction);
}

/*
 * Inverse of pack_shadow(): split @shadow into the memcg id, the node
 * (returned as NODE_DATA() of the stored node id), the eviction timestamp
 * and the workingset flag. The fields are peeled off in the reverse order
 * in which pack_shadow() appended them.
 */
static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
			  unsigned long *evictionp, bool *workingsetp)
{
	unsigned long entry = xa_to_value(shadow);
	int memcgid, nid;
	bool workingset;

	workingset = entry & ((1UL << WORKINGSET_SHIFT) - 1);
	entry >>= WORKINGSET_SHIFT;
	nid = entry & ((1UL << NODES_SHIFT) - 1);
	entry >>= NODES_SHIFT;
	memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1);
	entry >>= MEM_CGROUP_ID_SHIFT;

	*memcgidp = memcgid;
	*pgdat = NODE_DATA(nid);
	*evictionp = entry;
	*workingsetp = workingset;
}

#ifdef CONFIG_LRU_GEN

/*
 * Multi-gen LRU counterpart of workingset_eviction(): account the eviction
 * of @folio in lrugen->evicted[] and return its shadow entry.
 *
 * The token stored in the timestamp field is the min_seq of the folio's
 * type shifted left by LRU_REFS_WIDTH, with max(refs - 1, 0) in the low
 * bits.
 */
static void *lru_gen_eviction(struct folio *folio)
{
	int hist;
	unsigned long token;
	unsigned long min_seq;
	struct lruvec *lruvec;
	struct lru_gen_folio *lrugen;
	int type = folio_is_file_lru(folio);
	int delta = folio_nr_pages(folio);
	int refs = folio_lru_refs(folio);
	bool workingset = folio_test_workingset(folio);
	int tier = lru_tier_from_refs(refs, workingset);
	struct mem_cgroup *memcg = folio_memcg(folio);
	struct pglist_data *pgdat = folio_pgdat(folio);

	/* The token must fit into the timestamp bits of a shadow entry */
	BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_REFS_WIDTH > BITS_PER_LONG - EVICTION_SHIFT);

	lruvec = mem_cgroup_lruvec(memcg, pgdat);
	lrugen = &lruvec->lrugen;
	min_seq = READ_ONCE(lrugen->min_seq[type]);
	token = (min_seq << LRU_REFS_WIDTH) | max(refs - 1, 0);

	hist = lru_hist_from_seq(min_seq);
	atomic_long_add(delta, &lrugen->evicted[hist][type][tier]);

	return pack_shadow(mem_cgroup_id(memcg), pgdat, token, workingset);
}

/*
 * Tests if the shadow entry is for a folio that was recently evicted.
 * Fills in @lruvec, @token, @workingset with the values unpacked from shadow.
 *
 * "Recent" means that the sequence number stored in the token is less than
 * MAX_NR_GENS away from the lruvec's current max_seq (truncated to the
 * width that fits into a token).
 */
static bool lru_gen_test_recent(void *shadow, struct lruvec **lruvec,
				unsigned long *token, bool *workingset)
{
	int memcg_id;
	unsigned long max_seq;
	struct mem_cgroup *memcg;
	struct pglist_data *pgdat;

	unpack_shadow(shadow, &memcg_id, &pgdat, token, workingset);

	memcg = mem_cgroup_from_id(memcg_id);
	*lruvec = mem_cgroup_lruvec(memcg, pgdat);

	max_seq = READ_ONCE((*lruvec)->lrugen.max_seq);
	max_seq &= EVICTION_MASK >> LRU_REFS_WIDTH;

	return abs_diff(max_seq, *token >> LRU_REFS_WIDTH) < MAX_NR_GENS;
}

/*
 * Multi-gen LRU counterpart of workingset_refault().
 *
 * Nothing is accounted when the lruvec recorded in @shadow differs from
 * the lruvec @folio belongs to now. Otherwise the refault is counted, and
 * if the eviction was recent the per-tier refault statistics are updated
 * and the folio's workingset flag or reference count is restored from the
 * shadow entry. The whole evaluation runs inside an RCU read section.
 */
static void lru_gen_refault(struct folio *folio, void *shadow)
{
	bool recent;
	int hist, tier, refs;
	bool workingset;
	unsigned long token;
	struct lruvec *lruvec;
	struct lru_gen_folio *lrugen;
	int type = folio_is_file_lru(folio);
	int delta = folio_nr_pages(folio);

	rcu_read_lock();

	recent = lru_gen_test_recent(shadow, &lruvec, &token, &workingset);
	if (lruvec != folio_lruvec(folio))
		goto unlock;

	mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type, delta);

	if (!recent)
		goto unlock;

	lrugen = &lruvec->lrugen;

	hist = lru_hist_from_seq(READ_ONCE(lrugen->min_seq[type]));
	/* The token stores refs - 1, see lru_gen_eviction() */
	refs = (token & (BIT(LRU_REFS_WIDTH) - 1)) + 1;
	tier = lru_tier_from_refs(refs, workingset);

	atomic_long_add(delta, &lrugen->refaulted[hist][type][tier]);

	/* see folio_add_lru() where folio_set_active() will be called */
	if (lru_gen_in_fault())
		mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta);

	if (workingset) {
		folio_set_workingset(folio);
		mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta);
	} else
		set_mask_bits(&folio->flags, LRU_REFS_MASK, (refs - 1UL) << LRU_REFS_PGOFF);
unlock:
	rcu_read_unlock();
}

#else /* !CONFIG_LRU_GEN */

/* Stubs for kernels built without the multi-gen LRU */

static void *lru_gen_eviction(struct folio *folio)
{
	return NULL;
}

static bool lru_gen_test_recent(void *shadow, struct lruvec **lruvec,
				unsigned long *token, bool *workingset)
{
	return false;
}

static void lru_gen_refault(struct folio *folio, void *shadow)
{
}

#endif /* CONFIG_LRU_GEN */

/**
 * workingset_age_nonresident - age non-resident entries as LRU ages
 * @lruvec: the lruvec that was aged
 * @nr_pages: the number of pages to count
 *
 * As in-memory pages are aged, non-resident pages need to be aged as
 * well, in order for the refault distances later on to be comparable
 * to the in-memory dimensions. This function allows reclaim and LRU
 * operations to drive the non-resident aging along in parallel.
 */
void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages)
{
	/*
	 * Reclaiming a cgroup means reclaiming all its children in a
	 * round-robin fashion. That means that each cgroup has an LRU
	 * order that is composed of the LRU orders of its child
	 * cgroups; and every page has an LRU position not just in the
	 * cgroup that owns it, but in all of that group's ancestors.
	 *
	 * So when the physical inactive list of a leaf cgroup ages,
	 * the virtual inactive lists of all its parents, including
	 * the root cgroup's, age as well.
	 */
	do {
		atomic_long_add(nr_pages, &lruvec->nonresident_age);
	} while ((lruvec = parent_lruvec(lruvec)));
}

/**
 * workingset_eviction - note the eviction of a folio from memory
 * @folio: the folio being evicted
 * @target_memcg: the cgroup that is causing the reclaim
 *
 * With the multi-gen LRU enabled this defers to lru_gen_eviction().
 * Otherwise the current nonresident_age of the reclaiming lruvec, reduced
 * by bucket_order bits, is used as the eviction timestamp, and the
 * non-resident age is advanced by the size of the folio.
 *
 * Return: a shadow entry to be stored in @folio->mapping->i_pages in place
 * of the evicted @folio so that a later refault can be detected.
 */
void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg)
{
	struct pglist_data *pgdat = folio_pgdat(folio);
	unsigned long eviction;
	struct lruvec *lruvec;
	int memcgid;

	/* Folio is fully exclusive and pins folio's memory cgroup pointer */
	VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
	VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);

	if (lru_gen_enabled())
		return lru_gen_eviction(folio);

	lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
	/* XXX: target_memcg can be NULL, go through lruvec */
	memcgid = mem_cgroup_id(lruvec_memcg(lruvec));
	eviction = atomic_long_read(&lruvec->nonresident_age);
	eviction >>= bucket_order;
	workingset_age_nonresident(lruvec, folio_nr_pages(folio));
	return pack_shadow(memcgid, pgdat, eviction,
				folio_test_workingset(folio));
}

/**
 * workingset_test_recent - tests if the shadow entry is for a folio that was
 * recently evicted. Also fills in @workingset with the value unpacked from
 * shadow.
 * @shadow: the shadow entry to be tested.
 * @file: whether the corresponding folio is from the file lru.
 * @workingset: where the workingset value unpacked from shadow should
 * be stored.
 * @flush: whether to flush cgroup rstat.
 *
 * Return: true if the shadow is for a recently evicted folio; false otherwise.
 */
bool workingset_test_recent(void *shadow, bool file, bool *workingset,
				bool flush)
{
	struct mem_cgroup *eviction_memcg;
	struct lruvec *eviction_lruvec;
	unsigned long refault_distance;
	unsigned long workingset_size;
	unsigned long refault;
	int memcgid;
	struct pglist_data *pgdat;
	unsigned long eviction;

	if (lru_gen_enabled()) {
		bool recent;

		rcu_read_lock();
		recent = lru_gen_test_recent(shadow, &eviction_lruvec, &eviction, workingset);
		rcu_read_unlock();
		return recent;
	}

	rcu_read_lock();
	unpack_shadow(shadow, &memcgid, &pgdat, &eviction, workingset);
	/* Undo the bucketing applied at eviction time */
	eviction <<= bucket_order;

	/*
	 * Look up the memcg associated with the stored ID. It might
	 * have been deleted since the folio's eviction.
	 *
	 * Note that in rare events the ID could have been recycled
	 * for a new cgroup that refaults a shared folio. This is
	 * impossible to tell from the available data. However, this
	 * should be a rare and limited disturbance, and activations
	 * are always speculative anyway. Ultimately, it's the aging
	 * algorithm's job to shake out the minimum access frequency
	 * for the active cache.
	 *
	 * XXX: On !CONFIG_MEMCG, this will always return NULL; it
	 * would be better if the root_mem_cgroup existed in all
	 * configurations instead.
	 */
	eviction_memcg = mem_cgroup_from_id(memcgid);
	if (!mem_cgroup_tryget(eviction_memcg))
		eviction_memcg = NULL;
	rcu_read_unlock();

	if (!mem_cgroup_disabled() && !eviction_memcg)
		return false;
	/*
	 * Flush stats (and potentially sleep) outside the RCU read section.
	 *
	 * Note that workingset_test_recent() itself might be called in an RCU
	 * read section (e.g. in cachestat) - these callers need to skip
	 * flushing stats (via the flush argument).
	 *
	 * XXX: With per-memcg flushing and thresholding, is ratelimiting
	 * still needed here?
	 */
	if (flush)
		mem_cgroup_flush_stats_ratelimited(eviction_memcg);

	eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat);
	refault = atomic_long_read(&eviction_lruvec->nonresident_age);

	/*
	 * Calculate the refault distance
	 *
	 * The unsigned subtraction here gives an accurate distance
	 * across nonresident_age overflows in most cases. There is a
	 * special case: usually, shadow entries have a short lifetime
	 * and are either refaulted or reclaimed along with the inode
	 * before they get too old.  But it is not impossible for the
	 * nonresident_age to lap a shadow entry in the field, which
	 * can then result in a false small refault distance, leading
	 * to a false activation should this old entry actually
	 * refault again.  However, earlier kernels used to deactivate
	 * unconditionally with *every* reclaim invocation for the
	 * longest time, so the occasional inappropriate activation
	 * leading to pressure on the active list is not a problem.
	 */
	refault_distance = (refault - eviction) & EVICTION_MASK;

	/*
	 * Compare the distance to the existing workingset size. We
	 * don't activate pages that couldn't stay resident even if
	 * all the memory was available to the workingset. Whether
	 * workingset competition needs to consider anon or not depends
	 * on having free swap space.
	 */
	workingset_size = lruvec_page_state(eviction_lruvec, NR_ACTIVE_FILE);
	if (!file) {
		workingset_size += lruvec_page_state(eviction_lruvec,
						     NR_INACTIVE_FILE);
	}
	if (mem_cgroup_get_nr_swap_pages(eviction_memcg) > 0) {
		workingset_size += lruvec_page_state(eviction_lruvec,
						     NR_ACTIVE_ANON);
		if (file) {
			workingset_size += lruvec_page_state(eviction_lruvec,
						     NR_INACTIVE_ANON);
		}
	}

	/* Drop the reference taken by mem_cgroup_tryget() above */
	mem_cgroup_put(eviction_memcg);
	return refault_distance <= workingset_size;
}

/**
 * workingset_refault - Evaluate the refault of a previously evicted folio.
 * @folio: The freshly allocated replacement folio.
 * @shadow: Shadow entry of the evicted folio.
 *
 * Calculates and evaluates the refault distance of the previously
 * evicted folio in the context of the node and the memcg whose memory
 * pressure caused the eviction.
 */
void workingset_refault(struct folio *folio, void *shadow)
{
	bool file = folio_is_file_lru(folio);
	struct pglist_data *pgdat;
	struct mem_cgroup *memcg;
	struct lruvec *lruvec;
	bool workingset;
	long nr;

	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);

	if (lru_gen_enabled()) {
		lru_gen_refault(folio, shadow);
		return;
	}

	/*
	 * The activation decision for this folio is made at the level
	 * where the eviction occurred, as that is where the LRU order
	 * during folio reclaim is being determined.
	 *
	 * However, the cgroup that will own the folio is the one that
	 * is actually experiencing the refault event. Make sure the folio is
	 * locked to guarantee folio_memcg() stability throughout.
	 */
	nr = folio_nr_pages(folio);
	memcg = folio_memcg(folio);
	pgdat = folio_pgdat(folio);
	lruvec = mem_cgroup_lruvec(memcg, pgdat);

	mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr);

	if (!workingset_test_recent(shadow, file, &workingset, true))
		return;

	folio_set_active(folio);
	workingset_age_nonresident(lruvec, nr);
	mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + file, nr);

	/* Folio was active prior to eviction */
	if (workingset) {
		folio_set_workingset(folio);
		/*
		 * XXX: Move to folio_add_lru() when it supports new vs
		 * putback
		 */
		lru_note_cost_refault(folio);
		mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file, nr);
	}
}

/**
 * workingset_activation - note a page activation
 * @folio: Folio that is being activated.
 *
 * Ages the non-resident entries of the folio's lruvec by the size of the
 * folio.
 */
void workingset_activation(struct folio *folio)
{
	/*
	 * Filter non-memcg pages here, e.g. unmap can call
	 * mark_page_accessed() on VDSO pages.
	 */
	if (mem_cgroup_disabled() || folio_memcg_charged(folio))
		workingset_age_nonresident(folio_lruvec(folio), folio_nr_pages(folio));
}

/*
 * Shadow entries reflect the share of the working set that does not
 * fit into memory, so their number depends on the access pattern of
 * the workload.  In most cases, they will refault or get reclaimed
 * along with the inode, but a (malicious) workload that streams
 * through files with a total size several times that of available
 * memory, while preventing the inodes from being reclaimed, can
 * create excessive amounts of shadow nodes.  To keep a lid on this,
 * track shadow nodes and reclaim them when they grow way past the
 * point where they would still be useful.
 */

struct list_lru shadow_nodes;

/*
 * workingset_update_node - keep the shadow_nodes list in sync with @node
 *
 * Puts @node on the shadow_nodes list_lru when it is non-empty and holds
 * nothing but value (shadow) entries, and takes it off the list otherwise.
 * WORKINGSET_NODES is adjusted along with every list change. The i_pages
 * lock of the owning mapping must be held (asserted via lockdep).
 */
void workingset_update_node(struct xa_node *node)
{
	struct address_space *mapping;
	struct page *page = virt_to_page(node);

	/*
	 * Track non-empty nodes that contain only shadow entries;
	 * unlink those that contain pages or are being freed.
	 *
	 * Avoid acquiring the list_lru lock when the nodes are
	 * already where they should be. The list_empty() test is safe
	 * as node->private_list is protected by the i_pages lock.
	 */
	mapping = container_of(node->array, struct address_space, i_pages);
	lockdep_assert_held(&mapping->i_pages.xa_lock);

	if (node->count && node->count == node->nr_values) {
		if (list_empty(&node->private_list)) {
			list_lru_add_obj(&shadow_nodes, &node->private_list);
			__inc_node_page_state(page, WORKINGSET_NODES);
		}
	} else {
		if (!list_empty(&node->private_list)) {
			list_lru_del_obj(&shadow_nodes, &node->private_list);
			__dec_node_page_state(page, WORKINGSET_NODES);
		}
	}
}

/*
 * Shrinker count callback: report how many shadow nodes exceed the limit
 * derived from the amount of memory in scope.
 *
 * Returns SHRINK_EMPTY when no shadow nodes are tracked, 0 when their
 * number is within the limit, and the excess otherwise.
 */
static unsigned long count_shadow_nodes(struct shrinker *shrinker,
					struct shrink_control *sc)
{
	unsigned long max_nodes;
	unsigned long nodes;
	unsigned long pages;

	nodes = list_lru_shrink_count(&shadow_nodes, sc);
	if (!nodes)
		return SHRINK_EMPTY;

	/*
	 * Approximate a reasonable limit for the nodes
	 * containing shadow entries. We don't need to keep more
	 * shadow entries than possible pages on the active list,
	 * since refault distances bigger than that are dismissed.
	 *
	 * The size of the active list converges toward 100% of
	 * overall page cache as memory grows, with only a tiny
	 * inactive list. Assume the total cache size for that.
	 *
	 * Nodes might be sparsely populated, with only one shadow
	 * entry in the extreme case. Obviously, we cannot keep one
	 * node for every eligible shadow entry, so compromise on a
	 * worst-case density of 1/8th. Below that, not all eligible
	 * refaults can be detected anymore.
	 *
	 * On 64-bit with 7 xa_nodes per page and 64 slots
	 * each, this will reclaim shadow entries when they consume
	 * ~1.8% of available memory:
	 *
	 * PAGE_SIZE / xa_nodes / node_entries * 8 / PAGE_SIZE
	 */
#ifdef CONFIG_MEMCG
	if (sc->memcg) {
		struct lruvec *lruvec;
		int i;

		mem_cgroup_flush_stats_ratelimited(sc->memcg);
		lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid));
		/* All LRU pages of this memcg on this node ... */
		for (pages = 0, i = 0; i < NR_LRU_LISTS; i++)
			pages += lruvec_page_state_local(lruvec,
							 NR_LRU_BASE + i);
		/* ... plus its slab memory, converted from bytes to pages */
		pages += lruvec_page_state_local(
			lruvec, NR_SLAB_RECLAIMABLE_B) >> PAGE_SHIFT;
		pages += lruvec_page_state_local(
			lruvec, NR_SLAB_UNRECLAIMABLE_B) >> PAGE_SHIFT;
	} else
#endif
		pages = node_present_pages(sc->nid);

	/* One node per 2^(XA_CHUNK_SHIFT - 3) pages, i.e. 1/8th density */
	max_nodes = pages >> (XA_CHUNK_SHIFT - 3);

	if (nodes <= max_nodes)
		return 0;
	return nodes - max_nodes;
}

/*
 * list_lru walk callback: try to reclaim one shadow node.
 *
 * Entered with lru->lock held. Returns LRU_RETRY, with lru->lock dropped,
 * when the i_pages lock or the inode's i_lock cannot be taken without
 * blocking; returns LRU_REMOVED_RETRY once the node has been taken off the
 * list, whether or not it could actually be deleted.
 */
static enum lru_status shadow_lru_isolate(struct list_head *item,
					  struct list_lru_one *lru,
					  void *arg) __must_hold(lru->lock)
{
	struct xa_node *node = container_of(item, struct xa_node, private_list);
	struct address_space *mapping;
	int ret;

	/*
	 * Page cache insertions and deletions synchronously maintain
	 * the shadow node LRU under the i_pages lock and the
	 * &lru->lock. Because the page cache tree is emptied before
	 * the inode can be destroyed, holding the &lru->lock pins any
	 * address_space that has nodes on the LRU.
	 *
	 * We can then safely transition to the i_pages lock to
	 * pin only the address_space of the particular node we want
	 * to reclaim, take the node off-LRU, and drop the &lru->lock.
	 */

	mapping = container_of(node->array, struct address_space, i_pages);

	/* Coming from the list, invert the lock order */
	if (!xa_trylock(&mapping->i_pages)) {
		spin_unlock_irq(&lru->lock);
		ret = LRU_RETRY;
		goto out;
	}

	/* For page cache we need to hold i_lock */
	if (mapping->host != NULL) {
		if (!spin_trylock(&mapping->host->i_lock)) {
			xa_unlock(&mapping->i_pages);
			spin_unlock_irq(&lru->lock);
			ret = LRU_RETRY;
			goto out;
		}
	}

	list_lru_isolate(lru, item);
	__dec_node_page_state(virt_to_page(node), WORKINGSET_NODES);

	/* IRQs stay disabled until the i_pages lock is released below */
	spin_unlock(&lru->lock);

	/*
	 * The nodes should only contain one or more shadow entries,
	 * no pages, so we expect to be able to remove them all and
	 * delete and free the empty node afterwards.
	 */
	if (WARN_ON_ONCE(!node->nr_values))
		goto out_invalid;
	if (WARN_ON_ONCE(node->count != node->nr_values))
		goto out_invalid;
	xa_delete_node(node, workingset_update_node);
	__inc_lruvec_kmem_state(node, WORKINGSET_NODERECLAIM);

out_invalid:
	xa_unlock_irq(&mapping->i_pages);
	if (mapping->host != NULL) {
		if (mapping_shrinkable(mapping))
			inode_add_lru(mapping->host);
		spin_unlock(&mapping->host->i_lock);
	}
	ret = LRU_REMOVED_RETRY;
out:
	cond_resched();
	return ret;
}

/*
 * Shrinker scan callback: walk the shadow_nodes list and reclaim nodes via
 * shadow_lru_isolate().
 */
static unsigned long scan_shadow_nodes(struct shrinker *shrinker,
				       struct shrink_control *sc)
{
	/* list_lru lock nests inside the IRQ-safe i_pages lock */
	return list_lru_shrink_walk_irq(&shadow_nodes, sc, shadow_lru_isolate,
					NULL);
}

/*
 * Our list_lru->lock is IRQ-safe as it nests inside the IRQ-safe
 * i_pages lock.
*/
static struct lock_class_key shadow_nodes_key;

/*
 * workingset_init - size the eviction buckets and register the shadow-node
 * shrinker.
 *
 * Returns 0 on success, -ENOMEM if the shrinker cannot be allocated, or the
 * error from list_lru_init_memcg_key() (the shrinker is freed in that case).
 */
static int __init workingset_init(void)
{
	struct shrinker *workingset_shadow_shrinker;
	unsigned int timestamp_bits;
	unsigned int max_order;
	int ret = -ENOMEM;

	BUILD_BUG_ON(BITS_PER_LONG < EVICTION_SHIFT);
	/*
	 * Calculate the eviction bucket size to cover the longest
	 * actionable refault distance, which is currently half of
	 * memory (totalram_pages/2). However, memory hotplug may add
	 * some more pages at runtime, so keep working with up to
	 * double the initial memory by using totalram_pages as-is.
	 */
	timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT;
	max_order = fls_long(totalram_pages() - 1);
	if (max_order > timestamp_bits)
		bucket_order = max_order - timestamp_bits;
	/* All three values are unsigned, so print them all with %u. */
	pr_info("workingset: timestamp_bits=%u max_order=%u bucket_order=%u\n",
	       timestamp_bits, max_order, bucket_order);

	workingset_shadow_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE |
						    SHRINKER_MEMCG_AWARE,
						    "mm-shadow");
	if (!workingset_shadow_shrinker)
		goto err;

	ret = list_lru_init_memcg_key(&shadow_nodes, workingset_shadow_shrinker,
				      &shadow_nodes_key);
	if (ret)
		goto err_list_lru;

	workingset_shadow_shrinker->count_objects = count_shadow_nodes;
	workingset_shadow_shrinker->scan_objects = scan_shadow_nodes;
	/* ->count reports only fully expendable nodes */
	workingset_shadow_shrinker->seeks = 0;

	shrinker_register(workingset_shadow_shrinker);
	return 0;

err_list_lru:
	shrinker_free(workingset_shadow_shrinker);
err:
	return ret;
}
module_init(workingset_init);
30 30 12 10 5 10 6 9 5 5 2 3 12 12 3 9 8 12 12 5 12 10 10 10 8 2 10 8 1 8 10 10 6 8 10 8 8 10 10 10 10 10 9 9 9 5 10 10 10 10 10 10 10 5 10 10 10 10 4 4 4 10 1 10 1 8 6 30 30 30 16 16 10 10 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 
480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) 2005, 2006 Andrea Bittau <a.bittau@cs.ucl.ac.uk> * * Changes to meet Linux coding standards, and DCCP infrastructure fixes. * * Copyright (c) 2006 Arnaldo Carvalho de Melo <acme@conectiva.com.br> */ /* * This implementation should follow RFC 4341 */ #include <linux/slab.h> #include "../feat.h" #include "ccid2.h" #ifdef CONFIG_IP_DCCP_CCID2_DEBUG static bool ccid2_debug; #define ccid2_pr_debug(format, a...) DCCP_PR_DEBUG(ccid2_debug, format, ##a) #else #define ccid2_pr_debug(format, a...) 
#endif static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hc) { struct ccid2_seq *seqp; int i; /* check if we have space to preserve the pointer to the buffer */ if (hc->tx_seqbufc >= (sizeof(hc->tx_seqbuf) / sizeof(struct ccid2_seq *))) return -ENOMEM; /* allocate buffer and initialize linked list */ seqp = kmalloc_array(CCID2_SEQBUF_LEN, sizeof(struct ccid2_seq), gfp_any()); if (seqp == NULL) return -ENOMEM; for (i = 0; i < (CCID2_SEQBUF_LEN - 1); i++) { seqp[i].ccid2s_next = &seqp[i + 1]; seqp[i + 1].ccid2s_prev = &seqp[i]; } seqp[CCID2_SEQBUF_LEN - 1].ccid2s_next = seqp; seqp->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1]; /* This is the first allocation. Initiate the head and tail. */ if (hc->tx_seqbufc == 0) hc->tx_seqh = hc->tx_seqt = seqp; else { /* link the existing list with the one we just created */ hc->tx_seqh->ccid2s_next = seqp; seqp->ccid2s_prev = hc->tx_seqh; hc->tx_seqt->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1]; seqp[CCID2_SEQBUF_LEN - 1].ccid2s_next = hc->tx_seqt; } /* store the original pointer to the buffer so we can free it */ hc->tx_seqbuf[hc->tx_seqbufc] = seqp; hc->tx_seqbufc++; return 0; } static int ccid2_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) { if (ccid2_cwnd_network_limited(ccid2_hc_tx_sk(sk))) return CCID_PACKET_WILL_DEQUEUE_LATER; return CCID_PACKET_SEND_AT_ONCE; } static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val) { u32 max_ratio = DIV_ROUND_UP(ccid2_hc_tx_sk(sk)->tx_cwnd, 2); /* * Ensure that Ack Ratio does not exceed ceil(cwnd/2), which is (2) from * RFC 4341, 6.1.2. We ignore the statement that Ack Ratio 2 is always * acceptable since this causes starvation/deadlock whenever cwnd < 2. * The same problem arises when Ack Ratio is 0 (ie. Ack Ratio disabled). 
*/ if (val == 0 || val > max_ratio) { DCCP_WARN("Limiting Ack Ratio (%u) to %u\n", val, max_ratio); val = max_ratio; } dccp_feat_signal_nn_change(sk, DCCPF_ACK_RATIO, min_t(u32, val, DCCPF_ACK_RATIO_MAX)); } static void ccid2_check_l_ack_ratio(struct sock *sk) { struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); /* * After a loss, idle period, application limited period, or RTO we * need to check that the ack ratio is still less than the congestion * window. Otherwise, we will send an entire congestion window of * packets and got no response because we haven't sent ack ratio * packets yet. * If the ack ratio does need to be reduced, we reduce it to half of * the congestion window (or 1 if that's zero) instead of to the * congestion window. This prevents problems if one ack is lost. */ if (dccp_feat_nn_get(sk, DCCPF_ACK_RATIO) > hc->tx_cwnd) ccid2_change_l_ack_ratio(sk, hc->tx_cwnd/2 ? : 1U); } static void ccid2_change_l_seq_window(struct sock *sk, u64 val) { dccp_feat_signal_nn_change(sk, DCCPF_SEQUENCE_WINDOW, clamp_val(val, DCCPF_SEQ_WMIN, DCCPF_SEQ_WMAX)); } static void dccp_tasklet_schedule(struct sock *sk) { struct tasklet_struct *t = &dccp_sk(sk)->dccps_xmitlet; if (!test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) { sock_hold(sk); __tasklet_schedule(t); } } static void ccid2_hc_tx_rto_expire(struct timer_list *t) { struct ccid2_hc_tx_sock *hc = from_timer(hc, t, tx_rtotimer); struct sock *sk = hc->sk; const bool sender_was_blocked = ccid2_cwnd_network_limited(hc); bh_lock_sock(sk); if (sock_owned_by_user(sk)) { sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + HZ / 5); goto out; } ccid2_pr_debug("RTO_EXPIRE\n"); if (sk->sk_state == DCCP_CLOSED) goto out; /* back-off timer */ hc->tx_rto <<= 1; if (hc->tx_rto > DCCP_RTO_MAX) hc->tx_rto = DCCP_RTO_MAX; /* adjust pipe, cwnd etc */ hc->tx_ssthresh = hc->tx_cwnd / 2; if (hc->tx_ssthresh < 2) hc->tx_ssthresh = 2; hc->tx_cwnd = 1; hc->tx_pipe = 0; /* clear state about stuff we sent */ hc->tx_seqt = hc->tx_seqh; 
hc->tx_packets_acked = 0; /* clear ack ratio state. */ hc->tx_rpseq = 0; hc->tx_rpdupack = -1; ccid2_change_l_ack_ratio(sk, 1); /* if we were blocked before, we may now send cwnd=1 packet */ if (sender_was_blocked) dccp_tasklet_schedule(sk); /* restart backed-off timer */ sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto); out: bh_unlock_sock(sk); sock_put(sk); } /* * Congestion window validation (RFC 2861). */ static bool ccid2_do_cwv = true; module_param(ccid2_do_cwv, bool, 0644); MODULE_PARM_DESC(ccid2_do_cwv, "Perform RFC2861 Congestion Window Validation"); /** * ccid2_update_used_window - Track how much of cwnd is actually used * @hc: socket to update window * @new_wnd: new window values to add into the filter * * This is done in addition to CWV. The sender needs to have an idea of how many * packets may be in flight, to set the local Sequence Window value accordingly * (RFC 4340, 7.5.2). The CWV mechanism is exploited to keep track of the * maximum-used window. We use an EWMA low-pass filter to filter out noise. 
*/ static void ccid2_update_used_window(struct ccid2_hc_tx_sock *hc, u32 new_wnd) { hc->tx_expected_wnd = (3 * hc->tx_expected_wnd + new_wnd) / 4; } /* This borrows the code of tcp_cwnd_application_limited() */ static void ccid2_cwnd_application_limited(struct sock *sk, const u32 now) { struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); /* don't reduce cwnd below the initial window (IW) */ u32 init_win = rfc3390_bytes_to_packets(dccp_sk(sk)->dccps_mss_cache), win_used = max(hc->tx_cwnd_used, init_win); if (win_used < hc->tx_cwnd) { hc->tx_ssthresh = max(hc->tx_ssthresh, (hc->tx_cwnd >> 1) + (hc->tx_cwnd >> 2)); hc->tx_cwnd = (hc->tx_cwnd + win_used) >> 1; } hc->tx_cwnd_used = 0; hc->tx_cwnd_stamp = now; ccid2_check_l_ack_ratio(sk); } /* This borrows the code of tcp_cwnd_restart() */ static void ccid2_cwnd_restart(struct sock *sk, const u32 now) { struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); u32 cwnd = hc->tx_cwnd, restart_cwnd, iwnd = rfc3390_bytes_to_packets(dccp_sk(sk)->dccps_mss_cache); s32 delta = now - hc->tx_lsndtime; hc->tx_ssthresh = max(hc->tx_ssthresh, (cwnd >> 1) + (cwnd >> 2)); /* don't reduce cwnd below the initial window (IW) */ restart_cwnd = min(cwnd, iwnd); while ((delta -= hc->tx_rto) >= 0 && cwnd > restart_cwnd) cwnd >>= 1; hc->tx_cwnd = max(cwnd, restart_cwnd); hc->tx_cwnd_stamp = now; hc->tx_cwnd_used = 0; ccid2_check_l_ack_ratio(sk); } static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len) { struct dccp_sock *dp = dccp_sk(sk); struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); const u32 now = ccid2_jiffies32; struct ccid2_seq *next; /* slow-start after idle periods (RFC 2581, RFC 2861) */ if (ccid2_do_cwv && !hc->tx_pipe && (s32)(now - hc->tx_lsndtime) >= hc->tx_rto) ccid2_cwnd_restart(sk, now); hc->tx_lsndtime = now; hc->tx_pipe += 1; /* see whether cwnd was fully used (RFC 2861), update expected window */ if (ccid2_cwnd_network_limited(hc)) { ccid2_update_used_window(hc, hc->tx_cwnd); hc->tx_cwnd_used = 0; 
hc->tx_cwnd_stamp = now; } else { if (hc->tx_pipe > hc->tx_cwnd_used) hc->tx_cwnd_used = hc->tx_pipe; ccid2_update_used_window(hc, hc->tx_cwnd_used); if (ccid2_do_cwv && (s32)(now - hc->tx_cwnd_stamp) >= hc->tx_rto) ccid2_cwnd_application_limited(sk, now); } hc->tx_seqh->ccid2s_seq = dp->dccps_gss; hc->tx_seqh->ccid2s_acked = 0; hc->tx_seqh->ccid2s_sent = now; next = hc->tx_seqh->ccid2s_next; /* check if we need to alloc more space */ if (next == hc->tx_seqt) { if (ccid2_hc_tx_alloc_seq(hc)) { DCCP_CRIT("packet history - out of memory!"); /* FIXME: find a more graceful way to bail out */ return; } next = hc->tx_seqh->ccid2s_next; BUG_ON(next == hc->tx_seqt); } hc->tx_seqh = next; ccid2_pr_debug("cwnd=%d pipe=%d\n", hc->tx_cwnd, hc->tx_pipe); /* * FIXME: The code below is broken and the variables have been removed * from the socket struct. The `ackloss' variable was always set to 0, * and with arsent there are several problems: * (i) it doesn't just count the number of Acks, but all sent packets; * (ii) it is expressed in # of packets, not # of windows, so the * comparison below uses the wrong formula: Appendix A of RFC 4341 * comes up with the number K = cwnd / (R^2 - R) of consecutive windows * of data with no lost or marked Ack packets. If arsent were the # of * consecutive Acks received without loss, then Ack Ratio needs to be * decreased by 1 when * arsent >= K * cwnd / R = cwnd^2 / (R^3 - R^2) * where cwnd / R is the number of Acks received per window of data * (cf. RFC 4341, App. A). The problems are that * - arsent counts other packets as well; * - the comparison uses a formula different from RFC 4341; * - computing a cubic/quadratic equation each time is too complicated. * Hence a different algorithm is needed. */ #if 0 /* Ack Ratio. Need to maintain a concept of how many windows we sent */ hc->tx_arsent++; /* We had an ack loss in this window... 
*/ if (hc->tx_ackloss) { if (hc->tx_arsent >= hc->tx_cwnd) { hc->tx_arsent = 0; hc->tx_ackloss = 0; } } else { /* No acks lost up to now... */ /* decrease ack ratio if enough packets were sent */ if (dp->dccps_l_ack_ratio > 1) { /* XXX don't calculate denominator each time */ int denom = dp->dccps_l_ack_ratio * dp->dccps_l_ack_ratio - dp->dccps_l_ack_ratio; denom = hc->tx_cwnd * hc->tx_cwnd / denom; if (hc->tx_arsent >= denom) { ccid2_change_l_ack_ratio(sk, dp->dccps_l_ack_ratio - 1); hc->tx_arsent = 0; } } else { /* we can't increase ack ratio further [1] */ hc->tx_arsent = 0; /* or maybe set it to cwnd*/ } } #endif sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto); #ifdef CONFIG_IP_DCCP_CCID2_DEBUG do { struct ccid2_seq *seqp = hc->tx_seqt; while (seqp != hc->tx_seqh) { ccid2_pr_debug("out seq=%llu acked=%d time=%u\n", (unsigned long long)seqp->ccid2s_seq, seqp->ccid2s_acked, seqp->ccid2s_sent); seqp = seqp->ccid2s_next; } } while (0); ccid2_pr_debug("=========\n"); #endif } /** * ccid2_rtt_estimator - Sample RTT and compute RTO using RFC2988 algorithm * @sk: socket to perform estimator on * @mrtt: measured RTT * * This code is almost identical with TCP's tcp_rtt_estimator(), since * - it has a higher sampling frequency (recommended by RFC 1323), * - the RTO does not collapse into RTT due to RTTVAR going towards zero, * - it is simple (cf. more complex proposals such as Eifel timer or research * which suggests that the gain should be set according to window size), * - in tests it was found to work well with CCID2 [gerrit]. */ static void ccid2_rtt_estimator(struct sock *sk, const long mrtt) { struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); long m = mrtt ? 
: 1; if (hc->tx_srtt == 0) { /* First measurement m */ hc->tx_srtt = m << 3; hc->tx_mdev = m << 1; hc->tx_mdev_max = max(hc->tx_mdev, tcp_rto_min(sk)); hc->tx_rttvar = hc->tx_mdev_max; hc->tx_rtt_seq = dccp_sk(sk)->dccps_gss; } else { /* Update scaled SRTT as SRTT += 1/8 * (m - SRTT) */ m -= (hc->tx_srtt >> 3); hc->tx_srtt += m; /* Similarly, update scaled mdev with regard to |m| */ if (m < 0) { m = -m; m -= (hc->tx_mdev >> 2); /* * This neutralises RTO increase when RTT < SRTT - mdev * (see P. Sarolahti, A. Kuznetsov,"Congestion Control * in Linux TCP", USENIX 2002, pp. 49-62). */ if (m > 0) m >>= 3; } else { m -= (hc->tx_mdev >> 2); } hc->tx_mdev += m; if (hc->tx_mdev > hc->tx_mdev_max) { hc->tx_mdev_max = hc->tx_mdev; if (hc->tx_mdev_max > hc->tx_rttvar) hc->tx_rttvar = hc->tx_mdev_max; } /* * Decay RTTVAR at most once per flight, exploiting that * 1) pipe <= cwnd <= Sequence_Window = W (RFC 4340, 7.5.2) * 2) AWL = GSS-W+1 <= GAR <= GSS (RFC 4340, 7.5.1) * GAR is a useful bound for FlightSize = pipe. * AWL is probably too low here, as it over-estimates pipe. */ if (after48(dccp_sk(sk)->dccps_gar, hc->tx_rtt_seq)) { if (hc->tx_mdev_max < hc->tx_rttvar) hc->tx_rttvar -= (hc->tx_rttvar - hc->tx_mdev_max) >> 2; hc->tx_rtt_seq = dccp_sk(sk)->dccps_gss; hc->tx_mdev_max = tcp_rto_min(sk); } } /* * Set RTO from SRTT and RTTVAR * As in TCP, 4 * RTTVAR >= TCP_RTO_MIN, giving a minimum RTO of 200 ms. * This agrees with RFC 4341, 5: * "Because DCCP does not retransmit data, DCCP does not require * TCP's recommended minimum timeout of one second". 
*/ hc->tx_rto = (hc->tx_srtt >> 3) + hc->tx_rttvar; if (hc->tx_rto > DCCP_RTO_MAX) hc->tx_rto = DCCP_RTO_MAX; } static void ccid2_new_ack(struct sock *sk, struct ccid2_seq *seqp, unsigned int *maxincr) { struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); struct dccp_sock *dp = dccp_sk(sk); int r_seq_used = hc->tx_cwnd / dp->dccps_l_ack_ratio; if (hc->tx_cwnd < dp->dccps_l_seq_win && r_seq_used < dp->dccps_r_seq_win) { if (hc->tx_cwnd < hc->tx_ssthresh) { if (*maxincr > 0 && ++hc->tx_packets_acked >= 2) { hc->tx_cwnd += 1; *maxincr -= 1; hc->tx_packets_acked = 0; } } else if (++hc->tx_packets_acked >= hc->tx_cwnd) { hc->tx_cwnd += 1; hc->tx_packets_acked = 0; } } /* * Adjust the local sequence window and the ack ratio to allow about * 5 times the number of packets in the network (RFC 4340 7.5.2) */ if (r_seq_used * CCID2_WIN_CHANGE_FACTOR >= dp->dccps_r_seq_win) ccid2_change_l_ack_ratio(sk, dp->dccps_l_ack_ratio * 2); else if (r_seq_used * CCID2_WIN_CHANGE_FACTOR < dp->dccps_r_seq_win/2) ccid2_change_l_ack_ratio(sk, dp->dccps_l_ack_ratio / 2 ? : 1U); if (hc->tx_cwnd * CCID2_WIN_CHANGE_FACTOR >= dp->dccps_l_seq_win) ccid2_change_l_seq_window(sk, dp->dccps_l_seq_win * 2); else if (hc->tx_cwnd * CCID2_WIN_CHANGE_FACTOR < dp->dccps_l_seq_win/2) ccid2_change_l_seq_window(sk, dp->dccps_l_seq_win / 2); /* * FIXME: RTT is sampled several times per acknowledgment (for each * entry in the Ack Vector), instead of once per Ack (as in TCP SACK). * This causes the RTT to be over-estimated, since the older entries * in the Ack Vector have earlier sending times. * The cleanest solution is to not use the ccid2s_sent field at all * and instead use DCCP timestamps: requires changes in other places. 
*/ ccid2_rtt_estimator(sk, ccid2_jiffies32 - seqp->ccid2s_sent); } static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp) { struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); if ((s32)(seqp->ccid2s_sent - hc->tx_last_cong) < 0) { ccid2_pr_debug("Multiple losses in an RTT---treating as one\n"); return; } hc->tx_last_cong = ccid2_jiffies32; hc->tx_cwnd = hc->tx_cwnd / 2 ? : 1U; hc->tx_ssthresh = max(hc->tx_cwnd, 2U); ccid2_check_l_ack_ratio(sk); } static int ccid2_hc_tx_parse_options(struct sock *sk, u8 packet_type, u8 option, u8 *optval, u8 optlen) { struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); switch (option) { case DCCPO_ACK_VECTOR_0: case DCCPO_ACK_VECTOR_1: return dccp_ackvec_parsed_add(&hc->tx_av_chunks, optval, optlen, option - DCCPO_ACK_VECTOR_0); } return 0; } static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) { struct dccp_sock *dp = dccp_sk(sk); struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); const bool sender_was_blocked = ccid2_cwnd_network_limited(hc); struct dccp_ackvec_parsed *avp; u64 ackno, seqno; struct ccid2_seq *seqp; int done = 0; unsigned int maxincr = 0; /* check reverse path congestion */ seqno = DCCP_SKB_CB(skb)->dccpd_seq; /* XXX this whole "algorithm" is broken. Need to fix it to keep track * of the seqnos of the dupacks so that rpseq and rpdupack are correct * -sorbo. 
*/ /* need to bootstrap */ if (hc->tx_rpdupack == -1) { hc->tx_rpdupack = 0; hc->tx_rpseq = seqno; } else { /* check if packet is consecutive */ if (dccp_delta_seqno(hc->tx_rpseq, seqno) == 1) hc->tx_rpseq = seqno; /* it's a later packet */ else if (after48(seqno, hc->tx_rpseq)) { hc->tx_rpdupack++; /* check if we got enough dupacks */ if (hc->tx_rpdupack >= NUMDUPACK) { hc->tx_rpdupack = -1; /* XXX lame */ hc->tx_rpseq = 0; #ifdef __CCID2_COPES_GRACEFULLY_WITH_ACK_CONGESTION_CONTROL__ /* * FIXME: Ack Congestion Control is broken; in * the current state instabilities occurred with * Ack Ratios greater than 1; causing hang-ups * and long RTO timeouts. This needs to be fixed * before opening up dynamic changes. -- gerrit */ ccid2_change_l_ack_ratio(sk, 2 * dp->dccps_l_ack_ratio); #endif } } } /* check forward path congestion */ if (dccp_packet_without_ack(skb)) return; /* still didn't send out new data packets */ if (hc->tx_seqh == hc->tx_seqt) goto done; ackno = DCCP_SKB_CB(skb)->dccpd_ack_seq; if (after48(ackno, hc->tx_high_ack)) hc->tx_high_ack = ackno; seqp = hc->tx_seqt; while (before48(seqp->ccid2s_seq, ackno)) { seqp = seqp->ccid2s_next; if (seqp == hc->tx_seqh) { seqp = hc->tx_seqh->ccid2s_prev; break; } } /* * In slow-start, cwnd can increase up to a maximum of Ack Ratio/2 * packets per acknowledgement. Rounding up avoids that cwnd is not * advanced when Ack Ratio is 1 and gives a slight edge otherwise. */ if (hc->tx_cwnd < hc->tx_ssthresh) maxincr = DIV_ROUND_UP(dp->dccps_l_ack_ratio, 2); /* go through all ack vectors */ list_for_each_entry(avp, &hc->tx_av_chunks, node) { /* go through this ack vector */ for (; avp->len--; avp->vec++) { u64 ackno_end_rl = SUB48(ackno, dccp_ackvec_runlen(avp->vec)); ccid2_pr_debug("ackvec %llu |%u,%u|\n", (unsigned long long)ackno, dccp_ackvec_state(avp->vec) >> 6, dccp_ackvec_runlen(avp->vec)); /* if the seqno we are analyzing is larger than the * current ackno, then move towards the tail of our * seqnos. 
*/ while (after48(seqp->ccid2s_seq, ackno)) { if (seqp == hc->tx_seqt) { done = 1; break; } seqp = seqp->ccid2s_prev; } if (done) break; /* check all seqnos in the range of the vector * run length */ while (between48(seqp->ccid2s_seq,ackno_end_rl,ackno)) { const u8 state = dccp_ackvec_state(avp->vec); /* new packet received or marked */ if (state != DCCPAV_NOT_RECEIVED && !seqp->ccid2s_acked) { if (state == DCCPAV_ECN_MARKED) ccid2_congestion_event(sk, seqp); else ccid2_new_ack(sk, seqp, &maxincr); seqp->ccid2s_acked = 1; ccid2_pr_debug("Got ack for %llu\n", (unsigned long long)seqp->ccid2s_seq); hc->tx_pipe--; } if (seqp == hc->tx_seqt) { done = 1; break; } seqp = seqp->ccid2s_prev; } if (done) break; ackno = SUB48(ackno_end_rl, 1); } if (done) break; } /* The state about what is acked should be correct now * Check for NUMDUPACK */ seqp = hc->tx_seqt; while (before48(seqp->ccid2s_seq, hc->tx_high_ack)) { seqp = seqp->ccid2s_next; if (seqp == hc->tx_seqh) { seqp = hc->tx_seqh->ccid2s_prev; break; } } done = 0; while (1) { if (seqp->ccid2s_acked) { done++; if (done == NUMDUPACK) break; } if (seqp == hc->tx_seqt) break; seqp = seqp->ccid2s_prev; } /* If there are at least 3 acknowledgements, anything unacknowledged * below the last sequence number is considered lost */ if (done == NUMDUPACK) { struct ccid2_seq *last_acked = seqp; /* check for lost packets */ while (1) { if (!seqp->ccid2s_acked) { ccid2_pr_debug("Packet lost: %llu\n", (unsigned long long)seqp->ccid2s_seq); /* XXX need to traverse from tail -> head in * order to detect multiple congestion events in * one ack vector. 
*/ ccid2_congestion_event(sk, seqp); hc->tx_pipe--; } if (seqp == hc->tx_seqt) break; seqp = seqp->ccid2s_prev; } hc->tx_seqt = last_acked; } /* trim acked packets in tail */ while (hc->tx_seqt != hc->tx_seqh) { if (!hc->tx_seqt->ccid2s_acked) break; hc->tx_seqt = hc->tx_seqt->ccid2s_next; } /* restart RTO timer if not all outstanding data has been acked */ if (hc->tx_pipe == 0) sk_stop_timer(sk, &hc->tx_rtotimer); else sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto); done: /* check if incoming Acks allow pending packets to be sent */ if (sender_was_blocked && !ccid2_cwnd_network_limited(hc)) dccp_tasklet_schedule(sk); dccp_ackvec_parsed_cleanup(&hc->tx_av_chunks); } static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk) { struct ccid2_hc_tx_sock *hc = ccid_priv(ccid); struct dccp_sock *dp = dccp_sk(sk); u32 max_ratio; /* RFC 4341, 5: initialise ssthresh to arbitrarily high (max) value */ hc->tx_ssthresh = ~0U; /* Use larger initial windows (RFC 4341, section 5). */ hc->tx_cwnd = rfc3390_bytes_to_packets(dp->dccps_mss_cache); hc->tx_expected_wnd = hc->tx_cwnd; /* Make sure that Ack Ratio is enabled and within bounds. */ max_ratio = DIV_ROUND_UP(hc->tx_cwnd, 2); if (dp->dccps_l_ack_ratio == 0 || dp->dccps_l_ack_ratio > max_ratio) dp->dccps_l_ack_ratio = max_ratio; /* XXX init ~ to window size... 
*/ if (ccid2_hc_tx_alloc_seq(hc)) return -ENOMEM; hc->tx_rto = DCCP_TIMEOUT_INIT; hc->tx_rpdupack = -1; hc->tx_last_cong = hc->tx_lsndtime = hc->tx_cwnd_stamp = ccid2_jiffies32; hc->tx_cwnd_used = 0; hc->sk = sk; timer_setup(&hc->tx_rtotimer, ccid2_hc_tx_rto_expire, 0); INIT_LIST_HEAD(&hc->tx_av_chunks); return 0; } static void ccid2_hc_tx_exit(struct sock *sk) { struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); int i; sk_stop_timer(sk, &hc->tx_rtotimer); for (i = 0; i < hc->tx_seqbufc; i++) kfree(hc->tx_seqbuf[i]); hc->tx_seqbufc = 0; dccp_ackvec_parsed_cleanup(&hc->tx_av_chunks); } static void ccid2_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) { struct ccid2_hc_rx_sock *hc = ccid2_hc_rx_sk(sk); if (!dccp_data_packet(skb)) return; if (++hc->rx_num_data_pkts >= dccp_sk(sk)->dccps_r_ack_ratio) { dccp_send_ack(sk); hc->rx_num_data_pkts = 0; } } struct ccid_operations ccid2_ops = { .ccid_id = DCCPC_CCID2, .ccid_name = "TCP-like", .ccid_hc_tx_obj_size = sizeof(struct ccid2_hc_tx_sock), .ccid_hc_tx_init = ccid2_hc_tx_init, .ccid_hc_tx_exit = ccid2_hc_tx_exit, .ccid_hc_tx_send_packet = ccid2_hc_tx_send_packet, .ccid_hc_tx_packet_sent = ccid2_hc_tx_packet_sent, .ccid_hc_tx_parse_options = ccid2_hc_tx_parse_options, .ccid_hc_tx_packet_recv = ccid2_hc_tx_packet_recv, .ccid_hc_rx_obj_size = sizeof(struct ccid2_hc_rx_sock), .ccid_hc_rx_packet_recv = ccid2_hc_rx_packet_recv, }; #ifdef CONFIG_IP_DCCP_CCID2_DEBUG module_param(ccid2_debug, bool, 0644); MODULE_PARM_DESC(ccid2_debug, "Enable CCID-2 debug messages"); #endif |
174 37 165 164 125 122 121 125 165 165 165 165 165 165 174 69 165 165 129 154 185 152 129 129 129 118 154 154 125 185 16 154 184 178 178 127 185 95 96 12 12 129 128 129 129 95 96 129 67 67 67 5 5 5 37 37 37 35 97 97 97 97 88 97 97 92 97 92 92 92 92 92 92 92 97 97 97 4070 3951 92 97 97 171 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 
455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 
955 956 957 958 959 960 961 962 963 964 965 966 967 968 |
// SPDX-License-Identifier: GPL-2.0-only
// Copyright (c) 2020 Facebook Inc.

#include <linux/ethtool_netlink.h>
#include <linux/netdevice.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/workqueue.h>
#include <net/udp_tunnel.h>
#include <net/vxlan.h>

/**
 * enum udp_tunnel_nic_table_entry_flags - per-entry state bits
 * @UDP_TUNNEL_NIC_ENTRY_ADD:	entry is queued to be added to the device
 * @UDP_TUNNEL_NIC_ENTRY_DEL:	entry is queued to be removed from the device
 * @UDP_TUNNEL_NIC_ENTRY_OP_FAIL: the last device operation on the entry
 *	returned an error
 * @UDP_TUNNEL_NIC_ENTRY_FROZEN: the use count of the entry must not be
 *	adjusted; set on all used entries for the duration of a replay
 */
enum udp_tunnel_nic_table_entry_flags {
	UDP_TUNNEL_NIC_ENTRY_ADD	= BIT(0),
	UDP_TUNNEL_NIC_ENTRY_DEL	= BIT(1),
	UDP_TUNNEL_NIC_ENTRY_OP_FAIL	= BIT(2),
	UDP_TUNNEL_NIC_ENTRY_FROZEN	= BIT(3),
};

/**
 * struct udp_tunnel_nic_table_entry - one slot of a device port table
 * @port:	UDP port in network byte order
 * @type:	tunnel type, matched against the table's tunnel_types mask
 * @flags:	bitmask of enum udp_tunnel_nic_table_entry_flags
 * @use_cnt:	number of users of this port, zero when the slot is unused
 * @hw_priv:	per-entry value stored through the set_port_priv op and
 *		handed back to the driver in struct udp_tunnel_info
 */
struct udp_tunnel_nic_table_entry {
	__be16 port;
	u8 type;
	u8 flags;
	u16 use_cnt;
#define UDP_TUNNEL_NIC_USE_CNT_MAX	U16_MAX
	u8 hw_priv;
};

/**
 * struct udp_tunnel_nic - UDP tunnel port offload state
 * @work:	async work for talking to hardware from process context
 * @dev:	netdev pointer
 * @need_sync:	at least one port state changed
 * @need_replay: space was freed, we need a replay of all ports
 * @work_pending: @work is currently scheduled
 * @n_tables:	number of tables under @entries
 * @missed:	bitmap of tables which overflowed
 * @entries:	table of tables of ports currently offloaded
 */
struct udp_tunnel_nic {
	struct work_struct work;

	struct net_device *dev;

	u8 need_sync:1;
	u8 need_replay:1;
	u8 work_pending:1;

	unsigned int n_tables;
	unsigned long missed;
	struct udp_tunnel_nic_table_entry *entries[] __counted_by(n_tables);
};

/* We ensure all work structs are done using driver state, but not the code.
 * We need a workqueue we can flush before module gets removed.
*/ static struct workqueue_struct *udp_tunnel_nic_workqueue; static const char *udp_tunnel_nic_tunnel_type_name(unsigned int type) { switch (type) { case UDP_TUNNEL_TYPE_VXLAN: return "vxlan"; case UDP_TUNNEL_TYPE_GENEVE: return "geneve"; case UDP_TUNNEL_TYPE_VXLAN_GPE: return "vxlan-gpe"; default: return "unknown"; } } static bool udp_tunnel_nic_entry_is_free(struct udp_tunnel_nic_table_entry *entry) { return entry->use_cnt == 0 && !entry->flags; } static bool udp_tunnel_nic_entry_is_present(struct udp_tunnel_nic_table_entry *entry) { return entry->use_cnt && !(entry->flags & ~UDP_TUNNEL_NIC_ENTRY_FROZEN); } static bool udp_tunnel_nic_entry_is_frozen(struct udp_tunnel_nic_table_entry *entry) { return entry->flags & UDP_TUNNEL_NIC_ENTRY_FROZEN; } static void udp_tunnel_nic_entry_freeze_used(struct udp_tunnel_nic_table_entry *entry) { if (!udp_tunnel_nic_entry_is_free(entry)) entry->flags |= UDP_TUNNEL_NIC_ENTRY_FROZEN; } static void udp_tunnel_nic_entry_unfreeze(struct udp_tunnel_nic_table_entry *entry) { entry->flags &= ~UDP_TUNNEL_NIC_ENTRY_FROZEN; } static bool udp_tunnel_nic_entry_is_queued(struct udp_tunnel_nic_table_entry *entry) { return entry->flags & (UDP_TUNNEL_NIC_ENTRY_ADD | UDP_TUNNEL_NIC_ENTRY_DEL); } static void udp_tunnel_nic_entry_queue(struct udp_tunnel_nic *utn, struct udp_tunnel_nic_table_entry *entry, unsigned int flag) { entry->flags |= flag; utn->need_sync = 1; } static void udp_tunnel_nic_ti_from_entry(struct udp_tunnel_nic_table_entry *entry, struct udp_tunnel_info *ti) { memset(ti, 0, sizeof(*ti)); ti->port = entry->port; ti->type = entry->type; ti->hw_priv = entry->hw_priv; } static bool udp_tunnel_nic_is_empty(struct net_device *dev, struct udp_tunnel_nic *utn) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; unsigned int i, j; for (i = 0; i < utn->n_tables; i++) for (j = 0; j < info->tables[i].n_entries; j++) if (!udp_tunnel_nic_entry_is_free(&utn->entries[i][j])) return false; return true; } static bool 
udp_tunnel_nic_should_replay(struct net_device *dev, struct udp_tunnel_nic *utn) { const struct udp_tunnel_nic_table_info *table; unsigned int i, j; if (!utn->missed) return false; for (i = 0; i < utn->n_tables; i++) { table = &dev->udp_tunnel_nic_info->tables[i]; if (!test_bit(i, &utn->missed)) continue; for (j = 0; j < table->n_entries; j++) if (udp_tunnel_nic_entry_is_free(&utn->entries[i][j])) return true; } return false; } static void __udp_tunnel_nic_get_port(struct net_device *dev, unsigned int table, unsigned int idx, struct udp_tunnel_info *ti) { struct udp_tunnel_nic_table_entry *entry; struct udp_tunnel_nic *utn; utn = dev->udp_tunnel_nic; entry = &utn->entries[table][idx]; if (entry->use_cnt) udp_tunnel_nic_ti_from_entry(entry, ti); } static void __udp_tunnel_nic_set_port_priv(struct net_device *dev, unsigned int table, unsigned int idx, u8 priv) { dev->udp_tunnel_nic->entries[table][idx].hw_priv = priv; } static void udp_tunnel_nic_entry_update_done(struct udp_tunnel_nic_table_entry *entry, int err) { bool dodgy = entry->flags & UDP_TUNNEL_NIC_ENTRY_OP_FAIL; WARN_ON_ONCE(entry->flags & UDP_TUNNEL_NIC_ENTRY_ADD && entry->flags & UDP_TUNNEL_NIC_ENTRY_DEL); if (entry->flags & UDP_TUNNEL_NIC_ENTRY_ADD && (!err || (err == -EEXIST && dodgy))) entry->flags &= ~UDP_TUNNEL_NIC_ENTRY_ADD; if (entry->flags & UDP_TUNNEL_NIC_ENTRY_DEL && (!err || (err == -ENOENT && dodgy))) entry->flags &= ~UDP_TUNNEL_NIC_ENTRY_DEL; if (!err) entry->flags &= ~UDP_TUNNEL_NIC_ENTRY_OP_FAIL; else entry->flags |= UDP_TUNNEL_NIC_ENTRY_OP_FAIL; } static void udp_tunnel_nic_device_sync_one(struct net_device *dev, struct udp_tunnel_nic *utn, unsigned int table, unsigned int idx) { struct udp_tunnel_nic_table_entry *entry; struct udp_tunnel_info ti; int err; entry = &utn->entries[table][idx]; if (!udp_tunnel_nic_entry_is_queued(entry)) return; udp_tunnel_nic_ti_from_entry(entry, &ti); if (entry->flags & UDP_TUNNEL_NIC_ENTRY_ADD) err = dev->udp_tunnel_nic_info->set_port(dev, table, idx, 
&ti); else err = dev->udp_tunnel_nic_info->unset_port(dev, table, idx, &ti); udp_tunnel_nic_entry_update_done(entry, err); if (err) netdev_warn(dev, "UDP tunnel port sync failed port %d type %s: %d\n", be16_to_cpu(entry->port), udp_tunnel_nic_tunnel_type_name(entry->type), err); } static void udp_tunnel_nic_device_sync_by_port(struct net_device *dev, struct udp_tunnel_nic *utn) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; unsigned int i, j; for (i = 0; i < utn->n_tables; i++) for (j = 0; j < info->tables[i].n_entries; j++) udp_tunnel_nic_device_sync_one(dev, utn, i, j); } static void udp_tunnel_nic_device_sync_by_table(struct net_device *dev, struct udp_tunnel_nic *utn) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; unsigned int i, j; int err; for (i = 0; i < utn->n_tables; i++) { /* Find something that needs sync in this table */ for (j = 0; j < info->tables[i].n_entries; j++) if (udp_tunnel_nic_entry_is_queued(&utn->entries[i][j])) break; if (j == info->tables[i].n_entries) continue; err = info->sync_table(dev, i); if (err) netdev_warn(dev, "UDP tunnel port sync failed for table %d: %d\n", i, err); for (j = 0; j < info->tables[i].n_entries; j++) { struct udp_tunnel_nic_table_entry *entry; entry = &utn->entries[i][j]; if (udp_tunnel_nic_entry_is_queued(entry)) udp_tunnel_nic_entry_update_done(entry, err); } } } static void __udp_tunnel_nic_device_sync(struct net_device *dev, struct udp_tunnel_nic *utn) { if (!utn->need_sync) return; if (dev->udp_tunnel_nic_info->sync_table) udp_tunnel_nic_device_sync_by_table(dev, utn); else udp_tunnel_nic_device_sync_by_port(dev, utn); utn->need_sync = 0; /* Can't replay directly here, in case we come from the tunnel driver's * notification - trying to replay may deadlock inside tunnel driver. 
*/ utn->need_replay = udp_tunnel_nic_should_replay(dev, utn); } static void udp_tunnel_nic_device_sync(struct net_device *dev, struct udp_tunnel_nic *utn) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; bool may_sleep; if (!utn->need_sync) return; /* Drivers which sleep in the callback need to update from * the workqueue, if we come from the tunnel driver's notification. */ may_sleep = info->flags & UDP_TUNNEL_NIC_INFO_MAY_SLEEP; if (!may_sleep) __udp_tunnel_nic_device_sync(dev, utn); if (may_sleep || utn->need_replay) { queue_work(udp_tunnel_nic_workqueue, &utn->work); utn->work_pending = 1; } } static bool udp_tunnel_nic_table_is_capable(const struct udp_tunnel_nic_table_info *table, struct udp_tunnel_info *ti) { return table->tunnel_types & ti->type; } static bool udp_tunnel_nic_is_capable(struct net_device *dev, struct udp_tunnel_nic *utn, struct udp_tunnel_info *ti) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; unsigned int i; /* Special case IPv4-only NICs */ if (info->flags & UDP_TUNNEL_NIC_INFO_IPV4_ONLY && ti->sa_family != AF_INET) return false; for (i = 0; i < utn->n_tables; i++) if (udp_tunnel_nic_table_is_capable(&info->tables[i], ti)) return true; return false; } static int udp_tunnel_nic_has_collision(struct net_device *dev, struct udp_tunnel_nic *utn, struct udp_tunnel_info *ti) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; struct udp_tunnel_nic_table_entry *entry; unsigned int i, j; for (i = 0; i < utn->n_tables; i++) for (j = 0; j < info->tables[i].n_entries; j++) { entry = &utn->entries[i][j]; if (!udp_tunnel_nic_entry_is_free(entry) && entry->port == ti->port && entry->type != ti->type) { __set_bit(i, &utn->missed); return true; } } return false; } static void udp_tunnel_nic_entry_adj(struct udp_tunnel_nic *utn, unsigned int table, unsigned int idx, int use_cnt_adj) { struct udp_tunnel_nic_table_entry *entry = &utn->entries[table][idx]; bool dodgy = entry->flags & 
UDP_TUNNEL_NIC_ENTRY_OP_FAIL; unsigned int from, to; WARN_ON(entry->use_cnt + (u32)use_cnt_adj > U16_MAX); /* If not going from used to unused or vice versa - all done. * For dodgy entries make sure we try to sync again (queue the entry). */ entry->use_cnt += use_cnt_adj; if (!dodgy && !entry->use_cnt == !(entry->use_cnt - use_cnt_adj)) return; /* Cancel the op before it was sent to the device, if possible, * otherwise we'd need to take special care to issue commands * in the same order the ports arrived. */ if (use_cnt_adj < 0) { from = UDP_TUNNEL_NIC_ENTRY_ADD; to = UDP_TUNNEL_NIC_ENTRY_DEL; } else { from = UDP_TUNNEL_NIC_ENTRY_DEL; to = UDP_TUNNEL_NIC_ENTRY_ADD; } if (entry->flags & from) { entry->flags &= ~from; if (!dodgy) return; } udp_tunnel_nic_entry_queue(utn, entry, to); } static bool udp_tunnel_nic_entry_try_adj(struct udp_tunnel_nic *utn, unsigned int table, unsigned int idx, struct udp_tunnel_info *ti, int use_cnt_adj) { struct udp_tunnel_nic_table_entry *entry = &utn->entries[table][idx]; if (udp_tunnel_nic_entry_is_free(entry) || entry->port != ti->port || entry->type != ti->type) return false; if (udp_tunnel_nic_entry_is_frozen(entry)) return true; udp_tunnel_nic_entry_adj(utn, table, idx, use_cnt_adj); return true; } /* Try to find existing matching entry and adjust its use count, instead of * adding a new one. Returns true if entry was found. In case of delete the * entry may have gotten removed in the process, in which case it will be * queued for removal. 
*/ static bool udp_tunnel_nic_try_existing(struct net_device *dev, struct udp_tunnel_nic *utn, struct udp_tunnel_info *ti, int use_cnt_adj) { const struct udp_tunnel_nic_table_info *table; unsigned int i, j; for (i = 0; i < utn->n_tables; i++) { table = &dev->udp_tunnel_nic_info->tables[i]; if (!udp_tunnel_nic_table_is_capable(table, ti)) continue; for (j = 0; j < table->n_entries; j++) if (udp_tunnel_nic_entry_try_adj(utn, i, j, ti, use_cnt_adj)) return true; } return false; } static bool udp_tunnel_nic_add_existing(struct net_device *dev, struct udp_tunnel_nic *utn, struct udp_tunnel_info *ti) { return udp_tunnel_nic_try_existing(dev, utn, ti, +1); } static bool udp_tunnel_nic_del_existing(struct net_device *dev, struct udp_tunnel_nic *utn, struct udp_tunnel_info *ti) { return udp_tunnel_nic_try_existing(dev, utn, ti, -1); } static bool udp_tunnel_nic_add_new(struct net_device *dev, struct udp_tunnel_nic *utn, struct udp_tunnel_info *ti) { const struct udp_tunnel_nic_table_info *table; unsigned int i, j; for (i = 0; i < utn->n_tables; i++) { table = &dev->udp_tunnel_nic_info->tables[i]; if (!udp_tunnel_nic_table_is_capable(table, ti)) continue; for (j = 0; j < table->n_entries; j++) { struct udp_tunnel_nic_table_entry *entry; entry = &utn->entries[i][j]; if (!udp_tunnel_nic_entry_is_free(entry)) continue; entry->port = ti->port; entry->type = ti->type; entry->use_cnt = 1; udp_tunnel_nic_entry_queue(utn, entry, UDP_TUNNEL_NIC_ENTRY_ADD); return true; } /* The different table may still fit this port in, but there * are no devices currently which have multiple tables accepting * the same tunnel type, and false positives are okay. 
*/ __set_bit(i, &utn->missed); } return false; } static void __udp_tunnel_nic_add_port(struct net_device *dev, struct udp_tunnel_info *ti) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; struct udp_tunnel_nic *utn; utn = dev->udp_tunnel_nic; if (!utn) return; if (!netif_running(dev) && info->flags & UDP_TUNNEL_NIC_INFO_OPEN_ONLY) return; if (info->flags & UDP_TUNNEL_NIC_INFO_STATIC_IANA_VXLAN && ti->port == htons(IANA_VXLAN_UDP_PORT)) { if (ti->type != UDP_TUNNEL_TYPE_VXLAN) netdev_warn(dev, "device assumes port 4789 will be used by vxlan tunnels\n"); return; } if (!udp_tunnel_nic_is_capable(dev, utn, ti)) return; /* It may happen that a tunnel of one type is removed and different * tunnel type tries to reuse its port before the device was informed. * Rely on utn->missed to re-add this port later. */ if (udp_tunnel_nic_has_collision(dev, utn, ti)) return; if (!udp_tunnel_nic_add_existing(dev, utn, ti)) udp_tunnel_nic_add_new(dev, utn, ti); udp_tunnel_nic_device_sync(dev, utn); } static void __udp_tunnel_nic_del_port(struct net_device *dev, struct udp_tunnel_info *ti) { struct udp_tunnel_nic *utn; utn = dev->udp_tunnel_nic; if (!utn) return; if (!udp_tunnel_nic_is_capable(dev, utn, ti)) return; udp_tunnel_nic_del_existing(dev, utn, ti); udp_tunnel_nic_device_sync(dev, utn); } static void __udp_tunnel_nic_reset_ntf(struct net_device *dev) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; struct udp_tunnel_nic *utn; unsigned int i, j; ASSERT_RTNL(); utn = dev->udp_tunnel_nic; if (!utn) return; utn->need_sync = false; for (i = 0; i < utn->n_tables; i++) for (j = 0; j < info->tables[i].n_entries; j++) { struct udp_tunnel_nic_table_entry *entry; entry = &utn->entries[i][j]; entry->flags &= ~(UDP_TUNNEL_NIC_ENTRY_DEL | UDP_TUNNEL_NIC_ENTRY_OP_FAIL); /* We don't release rtnl across ops */ WARN_ON(entry->flags & UDP_TUNNEL_NIC_ENTRY_FROZEN); if (!entry->use_cnt) continue; udp_tunnel_nic_entry_queue(utn, entry, 
UDP_TUNNEL_NIC_ENTRY_ADD); } __udp_tunnel_nic_device_sync(dev, utn); } static size_t __udp_tunnel_nic_dump_size(struct net_device *dev, unsigned int table) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; struct udp_tunnel_nic *utn; unsigned int j; size_t size; utn = dev->udp_tunnel_nic; if (!utn) return 0; size = 0; for (j = 0; j < info->tables[table].n_entries; j++) { if (!udp_tunnel_nic_entry_is_present(&utn->entries[table][j])) continue; size += nla_total_size(0) + /* _TABLE_ENTRY */ nla_total_size(sizeof(__be16)) + /* _ENTRY_PORT */ nla_total_size(sizeof(u32)); /* _ENTRY_TYPE */ } return size; } static int __udp_tunnel_nic_dump_write(struct net_device *dev, unsigned int table, struct sk_buff *skb) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; struct udp_tunnel_nic *utn; struct nlattr *nest; unsigned int j; utn = dev->udp_tunnel_nic; if (!utn) return 0; for (j = 0; j < info->tables[table].n_entries; j++) { if (!udp_tunnel_nic_entry_is_present(&utn->entries[table][j])) continue; nest = nla_nest_start(skb, ETHTOOL_A_TUNNEL_UDP_TABLE_ENTRY); if (!nest) return -EMSGSIZE; if (nla_put_be16(skb, ETHTOOL_A_TUNNEL_UDP_ENTRY_PORT, utn->entries[table][j].port) || nla_put_u32(skb, ETHTOOL_A_TUNNEL_UDP_ENTRY_TYPE, ilog2(utn->entries[table][j].type))) goto err_cancel; nla_nest_end(skb, nest); } return 0; err_cancel: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static const struct udp_tunnel_nic_ops __udp_tunnel_nic_ops = { .get_port = __udp_tunnel_nic_get_port, .set_port_priv = __udp_tunnel_nic_set_port_priv, .add_port = __udp_tunnel_nic_add_port, .del_port = __udp_tunnel_nic_del_port, .reset_ntf = __udp_tunnel_nic_reset_ntf, .dump_size = __udp_tunnel_nic_dump_size, .dump_write = __udp_tunnel_nic_dump_write, }; static void udp_tunnel_nic_flush(struct net_device *dev, struct udp_tunnel_nic *utn) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; unsigned int i, j; for (i = 0; i < utn->n_tables; i++) for (j = 0; j 
< info->tables[i].n_entries; j++) { int adj_cnt = -utn->entries[i][j].use_cnt; if (adj_cnt) udp_tunnel_nic_entry_adj(utn, i, j, adj_cnt); } __udp_tunnel_nic_device_sync(dev, utn); for (i = 0; i < utn->n_tables; i++) memset(utn->entries[i], 0, array_size(info->tables[i].n_entries, sizeof(**utn->entries))); WARN_ON(utn->need_sync); utn->need_replay = 0; } static void udp_tunnel_nic_replay(struct net_device *dev, struct udp_tunnel_nic *utn) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; struct udp_tunnel_nic_shared_node *node; unsigned int i, j; /* Freeze all the ports we are already tracking so that the replay * does not double up the refcount. */ for (i = 0; i < utn->n_tables; i++) for (j = 0; j < info->tables[i].n_entries; j++) udp_tunnel_nic_entry_freeze_used(&utn->entries[i][j]); utn->missed = 0; utn->need_replay = 0; if (!info->shared) { udp_tunnel_get_rx_info(dev); } else { list_for_each_entry(node, &info->shared->devices, list) udp_tunnel_get_rx_info(node->dev); } for (i = 0; i < utn->n_tables; i++) for (j = 0; j < info->tables[i].n_entries; j++) udp_tunnel_nic_entry_unfreeze(&utn->entries[i][j]); } static void udp_tunnel_nic_device_sync_work(struct work_struct *work) { struct udp_tunnel_nic *utn = container_of(work, struct udp_tunnel_nic, work); rtnl_lock(); utn->work_pending = 0; __udp_tunnel_nic_device_sync(utn->dev, utn); if (utn->need_replay) udp_tunnel_nic_replay(utn->dev, utn); rtnl_unlock(); } static struct udp_tunnel_nic * udp_tunnel_nic_alloc(const struct udp_tunnel_nic_info *info, unsigned int n_tables) { struct udp_tunnel_nic *utn; unsigned int i; utn = kzalloc(struct_size(utn, entries, n_tables), GFP_KERNEL); if (!utn) return NULL; utn->n_tables = n_tables; INIT_WORK(&utn->work, udp_tunnel_nic_device_sync_work); for (i = 0; i < n_tables; i++) { utn->entries[i] = kcalloc(info->tables[i].n_entries, sizeof(*utn->entries[i]), GFP_KERNEL); if (!utn->entries[i]) goto err_free_prev_entries; } return utn; err_free_prev_entries: while 
(i--) kfree(utn->entries[i]); kfree(utn); return NULL; } static void udp_tunnel_nic_free(struct udp_tunnel_nic *utn) { unsigned int i; for (i = 0; i < utn->n_tables; i++) kfree(utn->entries[i]); kfree(utn); } static int udp_tunnel_nic_register(struct net_device *dev) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; struct udp_tunnel_nic_shared_node *node = NULL; struct udp_tunnel_nic *utn; unsigned int n_tables, i; BUILD_BUG_ON(sizeof(utn->missed) * BITS_PER_BYTE < UDP_TUNNEL_NIC_MAX_TABLES); /* Expect use count of at most 2 (IPv4, IPv6) per device */ BUILD_BUG_ON(UDP_TUNNEL_NIC_USE_CNT_MAX < UDP_TUNNEL_NIC_MAX_SHARING_DEVICES * 2); /* Check that the driver info is sane */ if (WARN_ON(!info->set_port != !info->unset_port) || WARN_ON(!info->set_port == !info->sync_table) || WARN_ON(!info->tables[0].n_entries)) return -EINVAL; if (WARN_ON(info->shared && info->flags & UDP_TUNNEL_NIC_INFO_OPEN_ONLY)) return -EINVAL; n_tables = 1; for (i = 1; i < UDP_TUNNEL_NIC_MAX_TABLES; i++) { if (!info->tables[i].n_entries) continue; n_tables++; if (WARN_ON(!info->tables[i - 1].n_entries)) return -EINVAL; } /* Create UDP tunnel state structures */ if (info->shared) { node = kzalloc(sizeof(*node), GFP_KERNEL); if (!node) return -ENOMEM; node->dev = dev; } if (info->shared && info->shared->udp_tunnel_nic_info) { utn = info->shared->udp_tunnel_nic_info; } else { utn = udp_tunnel_nic_alloc(info, n_tables); if (!utn) { kfree(node); return -ENOMEM; } } if (info->shared) { if (!info->shared->udp_tunnel_nic_info) { INIT_LIST_HEAD(&info->shared->devices); info->shared->udp_tunnel_nic_info = utn; } list_add_tail(&node->list, &info->shared->devices); } utn->dev = dev; dev_hold(dev); dev->udp_tunnel_nic = utn; if (!(info->flags & UDP_TUNNEL_NIC_INFO_OPEN_ONLY)) udp_tunnel_get_rx_info(dev); return 0; } static void udp_tunnel_nic_unregister(struct net_device *dev, struct udp_tunnel_nic *utn) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; /* For a shared 
table remove this dev from the list of sharing devices * and if there are other devices just detach. */ if (info->shared) { struct udp_tunnel_nic_shared_node *node, *first; list_for_each_entry(node, &info->shared->devices, list) if (node->dev == dev) break; if (list_entry_is_head(node, &info->shared->devices, list)) return; list_del(&node->list); kfree(node); first = list_first_entry_or_null(&info->shared->devices, typeof(*first), list); if (first) { udp_tunnel_drop_rx_info(dev); utn->dev = first->dev; goto release_dev; } info->shared->udp_tunnel_nic_info = NULL; } /* Flush before we check work, so we don't waste time adding entries * from the work which we will boot immediately. */ udp_tunnel_nic_flush(dev, utn); /* Wait for the work to be done using the state, netdev core will * retry unregister until we give up our reference on this device. */ if (utn->work_pending) return; udp_tunnel_nic_free(utn); release_dev: dev->udp_tunnel_nic = NULL; dev_put(dev); } static int udp_tunnel_nic_netdevice_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); const struct udp_tunnel_nic_info *info; struct udp_tunnel_nic *utn; info = dev->udp_tunnel_nic_info; if (!info) return NOTIFY_DONE; if (event == NETDEV_REGISTER) { int err; err = udp_tunnel_nic_register(dev); if (err) netdev_WARN(dev, "failed to register for UDP tunnel offloads: %d", err); return notifier_from_errno(err); } /* All other events will need the udp_tunnel_nic state */ utn = dev->udp_tunnel_nic; if (!utn) return NOTIFY_DONE; if (event == NETDEV_UNREGISTER) { udp_tunnel_nic_unregister(dev, utn); return NOTIFY_OK; } /* All other events only matter if NIC has to be programmed open */ if (!(info->flags & UDP_TUNNEL_NIC_INFO_OPEN_ONLY)) return NOTIFY_DONE; if (event == NETDEV_UP) { WARN_ON(!udp_tunnel_nic_is_empty(dev, utn)); udp_tunnel_get_rx_info(dev); return NOTIFY_OK; } if (event == NETDEV_GOING_DOWN) { udp_tunnel_nic_flush(dev, utn); 
return NOTIFY_OK; } return NOTIFY_DONE; } static struct notifier_block udp_tunnel_nic_notifier_block __read_mostly = { .notifier_call = udp_tunnel_nic_netdevice_event, }; static int __init udp_tunnel_nic_init_module(void) { int err; udp_tunnel_nic_workqueue = alloc_ordered_workqueue("udp_tunnel_nic", 0); if (!udp_tunnel_nic_workqueue) return -ENOMEM; rtnl_lock(); udp_tunnel_nic_ops = &__udp_tunnel_nic_ops; rtnl_unlock(); err = register_netdevice_notifier(&udp_tunnel_nic_notifier_block); if (err) goto err_unset_ops; return 0; err_unset_ops: rtnl_lock(); udp_tunnel_nic_ops = NULL; rtnl_unlock(); destroy_workqueue(udp_tunnel_nic_workqueue); return err; } late_initcall(udp_tunnel_nic_init_module); static void __exit udp_tunnel_nic_cleanup_module(void) { unregister_netdevice_notifier(&udp_tunnel_nic_notifier_block); rtnl_lock(); udp_tunnel_nic_ops = NULL; rtnl_unlock(); destroy_workqueue(udp_tunnel_nic_workqueue); } module_exit(udp_tunnel_nic_cleanup_module); MODULE_LICENSE("GPL"); |
11 12 106 106 106 106 12 18 1 3 1 2 2 3 3 1 1 1 3 3 36 36 1 1 5 4 1 5 4 4 6 6 6 6 24 1 4 7 8 7 6 4 7 4 1 6 8 2 2 4 4 4 13 1 12 12 62 62 3 18 18 16 2 21 128 128 2 6 8 69 68 69 76 85 1 16 16 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 
480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 
980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 
1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 |
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * CALIPSO - Common Architecture Label IPv6 Security Option
 *
 * This is an implementation of the CALIPSO protocol as specified in
 * RFC 5570.
 *
 * Authors: Paul Moore <paul.moore@hp.com>
 *          Huw Davies <huw@codeweavers.com>
 */

/* (c) Copyright Hewlett-Packard Development Company, L.P., 2006, 2008
 * (c) Copyright Huw Davies <huw@codeweavers.com>, 2015
 */

#include <linux/init.h>
#include <linux/types.h>
#include <linux/rcupdate.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/string.h>
#include <linux/jhash.h>
#include <linux/audit.h>
#include <linux/slab.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/tcp.h>
#include <net/netlabel.h>
#include <net/calipso.h>
#include <linux/atomic.h>
#include <linux/bug.h>
#include <linux/unaligned.h>
#include <linux/crc-ccitt.h>

/* Maximum size of the calipso option including
 * the two-byte TLV header.
 */
#define CALIPSO_OPT_LEN_MAX (2 + 252)

/* Size of the minimum calipso option including
 * the two-byte TLV header.
 */
#define CALIPSO_HDR_LEN (2 + 8)

/* Maximum size of the calipso option including
 * the two-byte TLV header and up to 3 bytes of
 * leading pad and 7 bytes of trailing pad.
 */
#define CALIPSO_OPT_LEN_MAX_WITH_PAD (3 + CALIPSO_OPT_LEN_MAX + 7)

/* Maximum size of u32 aligned buffer required to hold calipso
 * option.  Max of 3 initial pad bytes starting from buffer + 3.
 * i.e. the worst case is when the previous tlv finishes on 4n + 3.
*/
#define CALIPSO_MAX_BUFFER (6 + CALIPSO_OPT_LEN_MAX)

/* List of available DOI definitions.  The list is modified with
 * calipso_doi_list_lock held and searched with the RCU list iterator.
 */
static DEFINE_SPINLOCK(calipso_doi_list_lock);
static LIST_HEAD(calipso_doi_list);

/* Label mapping cache */
/* calipso_cache_enabled gates both lookups and insertions;
 * calipso_cache_bucketsize caps the number of entries kept per bucket.
 */
int calipso_cache_enabled = 1;
int calipso_cache_bucketsize = 10;
/* The bucket count is a power of two so that a bucket can be selected with
 * "hash & (CALIPSO_CACHE_BUCKETS - 1)".
 */
#define CALIPSO_CACHE_BUCKETBITS 7
#define CALIPSO_CACHE_BUCKETS BIT(CALIPSO_CACHE_BUCKETBITS)
/* Activity-counter gap beyond which a cache hit is moved ahead of its
 * predecessor in the bucket list (see calipso_cache_check()).
 */
#define CALIPSO_CACHE_REORDERLIMIT 10
/* One hash bucket of the label mapping cache */
struct calipso_map_cache_bkt {
	spinlock_t lock;	/* protects @size and @list */
	u32 size;		/* number of entries currently on @list */
	struct list_head list;	/* calipso_map_cache_entry.list members */
};

/* One cached "option bytes -> LSM data" mapping */
struct calipso_map_cache_entry {
	u32 hash;		/* hash value stored by calipso_cache_add() */
	unsigned char *key;	/* private copy of the option payload */
	size_t key_len;		/* length of @key in bytes */

	struct netlbl_lsm_cache *lsm_data;	/* refcounted LSM cache data */

	u32 activity;		/* hit counter used for bucket reordering */
	struct list_head list;	/* linkage in the owning bucket */
};

/* Array of CALIPSO_CACHE_BUCKETS buckets, allocated by calipso_cache_init() */
static struct calipso_map_cache_bkt *calipso_cache;

static void calipso_cache_invalidate(void);
static void calipso_doi_putdef(struct calipso_doi *doi_def);

/* Label Mapping Cache Functions
 */

/**
 * calipso_cache_entry_free - Frees a cache entry
 * @entry: the entry to free
 *
 * Description:
 * This function frees the memory associated with a cache entry including the
 * LSM cache data if there are no longer any users, i.e. reference count == 0.
 * The caller must already have unlinked @entry from its bucket list.
 *
 */
static void calipso_cache_entry_free(struct calipso_map_cache_entry *entry)
{
	/* lsm_data is still NULL if the entry was only partially set up */
	if (entry->lsm_data)
		netlbl_secattr_cache_free(entry->lsm_data);
	kfree(entry->key);
	kfree(entry);
}

/**
 * calipso_map_cache_hash - Hashing function for the CALIPSO cache
 * @key: the hash key
 * @key_len: the length of the key in bytes
 *
 * Description:
 * The CALIPSO tag hashing function. Returns a 32-bit hash value computed
 * with jhash() and an initial value of zero.
 *
 */
static u32 calipso_map_cache_hash(const unsigned char *key, u32 key_len)
{
	return jhash(key, key_len, 0);
}

/**
 * calipso_cache_init - Initialize the CALIPSO cache
 *
 * Description:
 * Initializes the CALIPSO label mapping cache, this function should be called
 * before any of the other functions defined in this file. Returns zero on
 * success, negative values on error.
* */ static int __init calipso_cache_init(void) { u32 iter; calipso_cache = kcalloc(CALIPSO_CACHE_BUCKETS, sizeof(struct calipso_map_cache_bkt), GFP_KERNEL); if (!calipso_cache) return -ENOMEM; for (iter = 0; iter < CALIPSO_CACHE_BUCKETS; iter++) { spin_lock_init(&calipso_cache[iter].lock); calipso_cache[iter].size = 0; INIT_LIST_HEAD(&calipso_cache[iter].list); } return 0; } /** * calipso_cache_invalidate - Invalidates the current CALIPSO cache * * Description: * Invalidates and frees any entries in the CALIPSO cache. Returns zero on * success and negative values on failure. * */ static void calipso_cache_invalidate(void) { struct calipso_map_cache_entry *entry, *tmp_entry; u32 iter; for (iter = 0; iter < CALIPSO_CACHE_BUCKETS; iter++) { spin_lock_bh(&calipso_cache[iter].lock); list_for_each_entry_safe(entry, tmp_entry, &calipso_cache[iter].list, list) { list_del(&entry->list); calipso_cache_entry_free(entry); } calipso_cache[iter].size = 0; spin_unlock_bh(&calipso_cache[iter].lock); } } /** * calipso_cache_check - Check the CALIPSO cache for a label mapping * @key: the buffer to check * @key_len: buffer length in bytes * @secattr: the security attribute struct to use * * Description: * This function checks the cache to see if a label mapping already exists for * the given key. If there is a match then the cache is adjusted and the * @secattr struct is populated with the correct LSM security attributes. The * cache is adjusted in the following manner if the entry is not already the * first in the cache bucket: * * 1. The cache entry's activity counter is incremented * 2. The previous (higher ranking) entry's activity counter is decremented * 3. If the difference between the two activity counters is geater than * CALIPSO_CACHE_REORDERLIMIT the two entries are swapped * * Returns zero on success, -ENOENT for a cache miss, and other negative values * on error. 
*
 */
static int calipso_cache_check(const unsigned char *key,
			       u32 key_len,
			       struct netlbl_lsm_secattr *secattr)
{
	u32 bkt;
	struct calipso_map_cache_entry *entry;
	struct calipso_map_cache_entry *prev_entry = NULL;
	u32 hash;

	if (!calipso_cache_enabled)
		return -ENOENT;

	hash = calipso_map_cache_hash(key, key_len);
	/* the bucket count is a power of two, so masking selects a bucket */
	bkt = hash & (CALIPSO_CACHE_BUCKETS - 1);
	spin_lock_bh(&calipso_cache[bkt].lock);
	list_for_each_entry(entry, &calipso_cache[bkt].list, list) {
		/* cheap hash/length comparison first, full compare last */
		if (entry->hash == hash &&
		    entry->key_len == key_len &&
		    memcmp(entry->key, key, key_len) == 0) {
			entry->activity += 1;
			/* the reference taken here is handed to @secattr */
			refcount_inc(&entry->lsm_data->refcount);
			secattr->cache = entry->lsm_data;
			secattr->flags |= NETLBL_SECATTR_CACHE;
			secattr->type = NETLBL_NLTYPE_CALIPSO;
			/* already first in the bucket, nothing to reorder */
			if (!prev_entry) {
				spin_unlock_bh(&calipso_cache[bkt].lock);
				return 0;
			}

			/* age the predecessor without letting it wrap */
			if (prev_entry->activity > 0)
				prev_entry->activity -= 1;
			if (entry->activity > prev_entry->activity &&
			    entry->activity - prev_entry->activity >
			    CALIPSO_CACHE_REORDERLIMIT) {
				/* unlink the hit and re-insert it directly
				 * in front of its predecessor
				 */
				__list_del(entry->list.prev, entry->list.next);
				__list_add(&entry->list,
					   prev_entry->list.prev,
					   &prev_entry->list);
			}

			spin_unlock_bh(&calipso_cache[bkt].lock);
			return 0;
		}
		prev_entry = entry;
	}
	spin_unlock_bh(&calipso_cache[bkt].lock);

	return -ENOENT;
}

/**
 * calipso_cache_add - Add an entry to the CALIPSO cache
 * @calipso_ptr: the CALIPSO option
 * @secattr: the packet's security attributes
 *
 * Description:
 * Add a new entry into the CALIPSO label mapping cache. Add the new entry to
 * head of the cache bucket's list, if the cache bucket is out of room remove
 * the last entry in the list first. It is important to note that there is
 * currently no checking for duplicate keys. Returns zero on success,
 * negative values on failure. The key stored starts at calipso_ptr + 2,
 * i.e. the type and length bytes are not stored, this corresponds to
 * calipso_ptr[1] bytes of data.
*
 */
static int calipso_cache_add(const unsigned char *calipso_ptr,
			     const struct netlbl_lsm_secattr *secattr)
{
	int ret_val = -EPERM;
	u32 bkt;
	struct calipso_map_cache_entry *entry = NULL;
	struct calipso_map_cache_entry *old_entry = NULL;
	u32 calipso_ptr_len;
	int bkt_size;

	/* Snapshot the bucket size once: it is a non-static tunable, and the
	 * eviction branch below must never run against an empty bucket, which
	 * could happen if the value dropped to zero after this check.
	 */
	bkt_size = READ_ONCE(calipso_cache_bucketsize);
	if (!READ_ONCE(calipso_cache_enabled) || bkt_size <= 0)
		return 0;

	/* length of the option payload, i.e. without the two TLV header bytes */
	calipso_ptr_len = calipso_ptr[1];

	entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
	if (!entry)
		return -ENOMEM;
	entry->key = kmemdup(calipso_ptr + 2, calipso_ptr_len, GFP_ATOMIC);
	if (!entry->key) {
		ret_val = -ENOMEM;
		goto cache_add_failure;
	}
	entry->key_len = calipso_ptr_len;
	/* Hash exactly the bytes that are stored as the key.  The lookup in
	 * calipso_cache_check() hashes the key it is given (the option
	 * payload), so hashing from the TLV header here would make every
	 * stored entry unreachable.
	 */
	entry->hash = calipso_map_cache_hash(calipso_ptr + 2, calipso_ptr_len);
	/* the cache entry holds its own reference on the LSM data */
	refcount_inc(&secattr->cache->refcount);
	entry->lsm_data = secattr->cache;

	bkt = entry->hash & (CALIPSO_CACHE_BUCKETS - 1);
	spin_lock_bh(&calipso_cache[bkt].lock);
	if (calipso_cache[bkt].size < bkt_size) {
		list_add(&entry->list, &calipso_cache[bkt].list);
		calipso_cache[bkt].size += 1;
	} else {
		/* bucket full: evict the entry at the tail of the list */
		old_entry = list_entry(calipso_cache[bkt].list.prev,
				       struct calipso_map_cache_entry, list);
		list_del(&old_entry->list);
		list_add(&entry->list, &calipso_cache[bkt].list);
		calipso_cache_entry_free(old_entry);
	}
	spin_unlock_bh(&calipso_cache[bkt].lock);

	return 0;

cache_add_failure:
	if (entry)
		calipso_cache_entry_free(entry);
	return ret_val;
}

/* DOI List Functions
 */

/**
 * calipso_doi_search - Searches for a DOI definition
 * @doi: the DOI to search for
 *
 * Description:
 * Search the DOI definition list for a DOI definition with a DOI value that
 * matches @doi. The caller is responsible for calling rcu_read_[un]lock().
 * Returns a pointer to the DOI definition on success and NULL on failure.
*/ static struct calipso_doi *calipso_doi_search(u32 doi) { struct calipso_doi *iter; list_for_each_entry_rcu(iter, &calipso_doi_list, list) if (iter->doi == doi && refcount_read(&iter->refcount)) return iter; return NULL; } /** * calipso_doi_add - Add a new DOI to the CALIPSO protocol engine * @doi_def: the DOI structure * @audit_info: NetLabel audit information * * Description: * The caller defines a new DOI for use by the CALIPSO engine and calls this * function to add it to the list of acceptable domains. The caller must * ensure that the mapping table specified in @doi_def->map meets all of the * requirements of the mapping type (see calipso.h for details). Returns * zero on success and non-zero on failure. * */ static int calipso_doi_add(struct calipso_doi *doi_def, struct netlbl_audit *audit_info) { int ret_val = -EINVAL; u32 doi; u32 doi_type; struct audit_buffer *audit_buf; doi = doi_def->doi; doi_type = doi_def->type; if (doi_def->doi == CALIPSO_DOI_UNKNOWN) goto doi_add_return; refcount_set(&doi_def->refcount, 1); spin_lock(&calipso_doi_list_lock); if (calipso_doi_search(doi_def->doi)) { spin_unlock(&calipso_doi_list_lock); ret_val = -EEXIST; goto doi_add_return; } list_add_tail_rcu(&doi_def->list, &calipso_doi_list); spin_unlock(&calipso_doi_list_lock); ret_val = 0; doi_add_return: audit_buf = netlbl_audit_start(AUDIT_MAC_CALIPSO_ADD, audit_info); if (audit_buf) { const char *type_str; switch (doi_type) { case CALIPSO_MAP_PASS: type_str = "pass"; break; default: type_str = "(unknown)"; } audit_log_format(audit_buf, " calipso_doi=%u calipso_type=%s res=%u", doi, type_str, ret_val == 0 ? 1 : 0); audit_log_end(audit_buf); } return ret_val; } /** * calipso_doi_free - Frees a DOI definition * @doi_def: the DOI definition * * Description: * This function frees all of the memory associated with a DOI definition. 
* */ static void calipso_doi_free(struct calipso_doi *doi_def) { kfree(doi_def); } /** * calipso_doi_free_rcu - Frees a DOI definition via the RCU pointer * @entry: the entry's RCU field * * Description: * This function is designed to be used as a callback to the call_rcu() * function so that the memory allocated to the DOI definition can be released * safely. * */ static void calipso_doi_free_rcu(struct rcu_head *entry) { struct calipso_doi *doi_def; doi_def = container_of(entry, struct calipso_doi, rcu); calipso_doi_free(doi_def); } /** * calipso_doi_remove - Remove an existing DOI from the CALIPSO protocol engine * @doi: the DOI value * @audit_info: NetLabel audit information * * Description: * Removes a DOI definition from the CALIPSO engine. The NetLabel routines will * be called to release their own LSM domain mappings as well as our own * domain list. Returns zero on success and negative values on failure. * */ static int calipso_doi_remove(u32 doi, struct netlbl_audit *audit_info) { int ret_val; struct calipso_doi *doi_def; struct audit_buffer *audit_buf; spin_lock(&calipso_doi_list_lock); doi_def = calipso_doi_search(doi); if (!doi_def) { spin_unlock(&calipso_doi_list_lock); ret_val = -ENOENT; goto doi_remove_return; } list_del_rcu(&doi_def->list); spin_unlock(&calipso_doi_list_lock); calipso_doi_putdef(doi_def); ret_val = 0; doi_remove_return: audit_buf = netlbl_audit_start(AUDIT_MAC_CALIPSO_DEL, audit_info); if (audit_buf) { audit_log_format(audit_buf, " calipso_doi=%u res=%u", doi, ret_val == 0 ? 1 : 0); audit_log_end(audit_buf); } return ret_val; } /** * calipso_doi_getdef - Returns a reference to a valid DOI definition * @doi: the DOI value * * Description: * Searches for a valid DOI definition and if one is found it is returned to * the caller. Otherwise NULL is returned. The caller must ensure that * calipso_doi_putdef() is called when the caller is done. 
* */ static struct calipso_doi *calipso_doi_getdef(u32 doi) { struct calipso_doi *doi_def; rcu_read_lock(); doi_def = calipso_doi_search(doi); if (!doi_def) goto doi_getdef_return; if (!refcount_inc_not_zero(&doi_def->refcount)) doi_def = NULL; doi_getdef_return: rcu_read_unlock(); return doi_def; } /** * calipso_doi_putdef - Releases a reference for the given DOI definition * @doi_def: the DOI definition * * Description: * Releases a DOI definition reference obtained from calipso_doi_getdef(). * */ static void calipso_doi_putdef(struct calipso_doi *doi_def) { if (!doi_def) return; if (!refcount_dec_and_test(&doi_def->refcount)) return; calipso_cache_invalidate(); call_rcu(&doi_def->rcu, calipso_doi_free_rcu); } /** * calipso_doi_walk - Iterate through the DOI definitions * @skip_cnt: skip past this number of DOI definitions, updated * @callback: callback for each DOI definition * @cb_arg: argument for the callback function * * Description: * Iterate over the DOI definition list, skipping the first @skip_cnt entries. * For each entry call @callback, if @callback returns a negative value stop * 'walking' through the list and return. Updates the value in @skip_cnt upon * return. Returns zero on success, negative values on failure. * */ static int calipso_doi_walk(u32 *skip_cnt, int (*callback)(struct calipso_doi *doi_def, void *arg), void *cb_arg) { int ret_val = -ENOENT; u32 doi_cnt = 0; struct calipso_doi *iter_doi; rcu_read_lock(); list_for_each_entry_rcu(iter_doi, &calipso_doi_list, list) if (refcount_read(&iter_doi->refcount) > 0) { if (doi_cnt++ < *skip_cnt) continue; ret_val = callback(iter_doi, cb_arg); if (ret_val < 0) { doi_cnt--; goto doi_walk_return; } } doi_walk_return: rcu_read_unlock(); *skip_cnt = doi_cnt; return ret_val; } /** * calipso_validate - Validate a CALIPSO option * @skb: the packet * @option: the start of the option * * Description: * This routine is called to validate a CALIPSO option. 
* If the option is valid then %true is returned, otherwise
 * %false is returned.
 *
 * The caller should have already checked that the length of the
 * option (including the TLV header) is >= 10 and that the catmap
 * length is consistent with the option length.
 *
 * We leave checks on the level and categories to the socket layer.
 */
bool calipso_validate(const struct sk_buff *skb, const unsigned char *option)
{
	static const u8 zero[2];
	const u16 len = option[1] + 2;
	struct calipso_doi *doi_def;
	bool doi_known;
	u16 crc;

	/* Recompute the checksum the way the sender did: over the whole
	 * option, TLV header included, with the two CRC bytes at offset 8
	 * taken as zero.
	 */
	crc = crc_ccitt(0xffff, option, 8);
	crc = crc_ccitt(crc, zero, sizeof(zero));
	if (len > 10)
		crc = crc_ccitt(crc, option + 10, len - 10);
	crc = ~crc;

	/* the checksum is stored low byte first */
	if ((crc & 0xff) != option[8])
		return false;
	if (((crc >> 8) & 0xff) != option[9])
		return false;

	/* the option is only acceptable if its DOI is configured */
	rcu_read_lock();
	doi_def = calipso_doi_search(get_unaligned_be32(option + 2));
	doi_known = doi_def != NULL;
	rcu_read_unlock();

	return doi_known;
}

/**
 * calipso_map_cat_hton - Perform a category mapping from host to network
 * @doi_def: the DOI definition
 * @secattr: the security attributes
 * @net_cat: the zero'd out category bitmap in network/CALIPSO format
 * @net_cat_len: the length of the CALIPSO bitmap in bytes
 *
 * Description:
 * Perform a label mapping to translate a local MLS category bitmap to the
 * correct CALIPSO bitmap using the given DOI definition. Returns the minimum
 * size in bytes of the network bitmap on success, negative values otherwise.
*
 */
static int calipso_map_cat_hton(const struct calipso_doi *doi_def,
				const struct netlbl_lsm_secattr *secattr,
				unsigned char *net_cat,
				u32 net_cat_len)
{
	const u32 bit_limit = net_cat_len * 8;
	u32 highest = 0;
	int bit;

	bit = netlbl_catmap_walk(secattr->attr.mls.cat, 0);
	while (bit >= 0) {
		if (bit >= bit_limit)
			return -ENOSPC;
		netlbl_bitmap_setbit(net_cat, bit, 1);
		if (bit > highest)
			highest = bit;
		bit = netlbl_catmap_walk(secattr->attr.mls.cat, bit + 1);
	}

	/* round up to whole 32-bit words, expressed in bytes */
	return (highest / 32 + 1) * 4;
}

/**
 * calipso_map_cat_ntoh - Perform a category mapping from network to host
 * @doi_def: the DOI definition
 * @net_cat: the category bitmap in network/CALIPSO format
 * @net_cat_len: the length of the CALIPSO bitmap in bytes
 * @secattr: the security attributes
 *
 * Description:
 * Copies every bit set in the CALIPSO bitmap @net_cat into the local MLS
 * category map of @secattr. Returns zero once all set bits have been
 * transferred, or the error reported by netlbl_catmap_setbit() for the first
 * bit that could not be set.
 *
 */
static int calipso_map_cat_ntoh(const struct calipso_doi *doi_def,
				const unsigned char *net_cat,
				u32 net_cat_len,
				struct netlbl_lsm_secattr *secattr)
{
	const u32 bit_count = net_cat_len * 8;
	int bit;
	int err;

	for (bit = netlbl_bitmap_walk(net_cat, bit_count, 0, 1);
	     bit >= 0;
	     bit = netlbl_bitmap_walk(net_cat, bit_count, bit + 1, 1)) {
		err = netlbl_catmap_setbit(&secattr->attr.mls.cat,
					   bit,
					   GFP_ATOMIC);
		if (err != 0)
			return err;
	}

	return 0;
}

/**
 * calipso_pad_write - Writes pad bytes in TLV format
 * @buf: the buffer
 * @offset: offset from start of buffer to write padding
 * @count: number of pad bytes to write
 *
 * Description:
 * Write @count bytes of TLV padding into @buf starting at offset @offset.
 * @count should be less than 8 - see RFC 4942.
* */ static int calipso_pad_write(unsigned char *buf, unsigned int offset, unsigned int count) { if (WARN_ON_ONCE(count >= 8)) return -EINVAL; switch (count) { case 0: break; case 1: buf[offset] = IPV6_TLV_PAD1; break; default: buf[offset] = IPV6_TLV_PADN; buf[offset + 1] = count - 2; if (count > 2) memset(buf + offset + 2, 0, count - 2); break; } return 0; } /** * calipso_genopt - Generate a CALIPSO option * @buf: the option buffer * @start: offset from which to write * @buf_len: the size of opt_buf * @doi_def: the CALIPSO DOI to use * @secattr: the security attributes * * Description: * Generate a CALIPSO option using the DOI definition and security attributes * passed to the function. This also generates upto three bytes of leading * padding that ensures that the option is 4n + 2 aligned. It returns the * number of bytes written (including any initial padding). */ static int calipso_genopt(unsigned char *buf, u32 start, u32 buf_len, const struct calipso_doi *doi_def, const struct netlbl_lsm_secattr *secattr) { int ret_val; u32 len, pad; u16 crc; static const unsigned char padding[4] = {2, 1, 0, 3}; unsigned char *calipso; /* CALIPSO has 4n + 2 alignment */ pad = padding[start & 3]; if (buf_len <= start + pad + CALIPSO_HDR_LEN) return -ENOSPC; if ((secattr->flags & NETLBL_SECATTR_MLS_LVL) == 0) return -EPERM; len = CALIPSO_HDR_LEN; if (secattr->flags & NETLBL_SECATTR_MLS_CAT) { ret_val = calipso_map_cat_hton(doi_def, secattr, buf + start + pad + len, buf_len - start - pad - len); if (ret_val < 0) return ret_val; len += ret_val; } calipso_pad_write(buf, start, pad); calipso = buf + start + pad; calipso[0] = IPV6_TLV_CALIPSO; calipso[1] = len - 2; *(__be32 *)(calipso + 2) = htonl(doi_def->doi); calipso[6] = (len - CALIPSO_HDR_LEN) / 4; calipso[7] = secattr->attr.mls.lvl; crc = ~crc_ccitt(0xffff, calipso, len); calipso[8] = crc & 0xff; calipso[9] = (crc >> 8) & 0xff; return pad + len; } /* Hop-by-hop hdr helper functions */ /** * calipso_opt_update - Replaces 
socket's hop options with a new set * @sk: the socket * @hop: new hop options * * Description: * Replaces @sk's hop options with @hop. @hop may be NULL to leave * the socket with no hop options. * */ static int calipso_opt_update(struct sock *sk, struct ipv6_opt_hdr *hop) { struct ipv6_txoptions *old = txopt_get(inet6_sk(sk)), *txopts; txopts = ipv6_renew_options(sk, old, IPV6_HOPOPTS, hop); txopt_put(old); if (IS_ERR(txopts)) return PTR_ERR(txopts); txopts = ipv6_update_options(sk, txopts); if (txopts) { atomic_sub(txopts->tot_len, &sk->sk_omem_alloc); txopt_put(txopts); } return 0; } /** * calipso_tlv_len - Returns the length of the TLV * @opt: the option header * @offset: offset of the TLV within the header * * Description: * Returns the length of the TLV option at offset @offset within * the option header @opt. Checks that the entire TLV fits inside * the option header, returns a negative value if this is not the case. */ static int calipso_tlv_len(struct ipv6_opt_hdr *opt, unsigned int offset) { unsigned char *tlv = (unsigned char *)opt; unsigned int opt_len = ipv6_optlen(opt), tlv_len; if (offset < sizeof(*opt) || offset >= opt_len) return -EINVAL; if (tlv[offset] == IPV6_TLV_PAD1) return 1; if (offset + 1 >= opt_len) return -EINVAL; tlv_len = tlv[offset + 1] + 2; if (offset + tlv_len > opt_len) return -EINVAL; return tlv_len; } /** * calipso_opt_find - Finds the CALIPSO option in an IPv6 hop options header * @hop: the hop options header * @start: on return holds the offset of any leading padding * @end: on return holds the offset of the first non-pad TLV after CALIPSO * * Description: * Finds the space occupied by a CALIPSO option (including any leading and * trailing padding). * * If a CALIPSO option exists set @start and @end to the * offsets within @hop of the start of padding before the first * CALIPSO option and the end of padding after the first CALIPSO * option. In this case the function returns 0. 
*
 * In the absence of a CALIPSO option, @start and @end will be
 * set to the start and end of any trailing padding in the header.
 * This is useful when appending a new option, as the caller may want
 * to overwrite some of this padding. In this case the function will
 * return -ENOENT.
 *
 * A negative value from calipso_tlv_len() is returned as-is if a malformed
 * TLV is met during the scan; @start and @end are not written in that case.
 */
static int calipso_opt_find(struct ipv6_opt_hdr *hop, unsigned int *start,
			    unsigned int *end)
{
	int ret_val = -ENOENT, tlv_len;
	/* offset_s: last non-pad, non-CALIPSO TLV seen before CALIPSO.
	 * offset_e: the CALIPSO TLV, or the last pad TLV following it.
	 * Zero means "not seen"; real TLV offsets are >= sizeof(*hop).
	 */
	unsigned int opt_len, offset, offset_s = 0, offset_e = 0;
	unsigned char *opt = (unsigned char *)hop;

	opt_len = ipv6_optlen(hop);
	offset = sizeof(*hop);

	while (offset < opt_len) {
		tlv_len = calipso_tlv_len(hop, offset);
		if (tlv_len < 0)
			return tlv_len;

		switch (opt[offset]) {
		case IPV6_TLV_PAD1:
		case IPV6_TLV_PADN:
			/* padding only extends the region once CALIPSO has
			 * been found
			 */
			if (offset_e)
				offset_e = offset;
			break;
		case IPV6_TLV_CALIPSO:
			ret_val = 0;
			offset_e = offset;
			break;
		default:
			if (offset_e == 0)
				offset_s = offset;
			else
				/* first real option after CALIPSO: done */
				goto out;
		}
		offset += tlv_len;
	}

out:
	/* the region starts right after the last preceding real option */
	if (offset_s)
		*start = offset_s + calipso_tlv_len(hop, offset_s);
	else
		*start = sizeof(*hop);
	/* and ends right after CALIPSO and the padding that follows it */
	if (offset_e)
		*end = offset_e + calipso_tlv_len(hop, offset_e);
	else
		*end = opt_len;

	return ret_val;
}

/**
 * calipso_opt_insert - Inserts a CALIPSO option into an IPv6 hop opt hdr
 * @hop: the original hop options header
 * @doi_def: the CALIPSO DOI to use
 * @secattr: the specific security attributes of the socket
 *
 * Description:
 * Creates a new hop options header based on @hop with a
 * CALIPSO option added to it. If @hop already contains a CALIPSO
 * option this is overwritten, otherwise the new option is appended
 * after any existing options. If @hop is NULL then the new header
 * will contain just the CALIPSO option and any needed padding.
*
 */
static struct ipv6_opt_hdr *
calipso_opt_insert(struct ipv6_opt_hdr *hop,
		   const struct calipso_doi *doi_def,
		   const struct netlbl_lsm_secattr *secattr)
{
	unsigned int start, end, buf_len, pad, hop_len;
	struct ipv6_opt_hdr *new;
	int ret_val;

	if (hop) {
		hop_len = ipv6_optlen(hop);
		/* -ENOENT just means there is no CALIPSO option to replace */
		ret_val = calipso_opt_find(hop, &start, &end);
		if (ret_val && ret_val != -ENOENT)
			return ERR_PTR(ret_val);
	} else {
		/* no existing header: write right after a fresh 2-byte hdr */
		hop_len = 0;
		start = sizeof(*hop);
		end = 0;
	}

	/* old header minus the replaced region, plus a worst-case option */
	buf_len = hop_len + start - end + CALIPSO_OPT_LEN_MAX_WITH_PAD;
	new = kzalloc(buf_len, GFP_ATOMIC);
	if (!new)
		return ERR_PTR(-ENOMEM);

	/* keep whatever precedes the region that is being rewritten */
	if (start > sizeof(*hop))
		memcpy(new, hop, start);
	ret_val = calipso_genopt((unsigned char *)new, start, buf_len, doi_def,
				 secattr);
	if (ret_val < 0) {
		kfree(new);
		return ERR_PTR(ret_val);
	}

	/* from here on buf_len tracks the bytes actually used in @new */
	buf_len = start + ret_val;
	/* At this point buf_len aligns to 4n, so (buf_len & 4) pads to 8n */
	pad = ((buf_len & 4) + (end & 7)) & 7;
	calipso_pad_write((unsigned char *)new, buf_len, pad);
	buf_len += pad;

	/* append the options that followed the replaced region */
	if (end != hop_len) {
		memcpy((char *)new + buf_len, (char *)hop + end, hop_len - end);
		buf_len += hop_len - end;
	}
	new->nexthdr = 0;
	/* hdrlen is in 8-byte units, not counting the first 8 bytes */
	new->hdrlen = buf_len / 8 - 1;

	return new;
}

/**
 * calipso_opt_del - Removes the CALIPSO option from an option header
 * @hop: the original header
 * @new: the new header
 *
 * Description:
 * Creates a new header based on @hop without any CALIPSO option. If @hop
 * doesn't contain a CALIPSO option it returns -ENOENT. If @hop contains
 * no other non-padding options, it returns zero with @new set to NULL.
 * Otherwise it returns zero, creates a new header without the CALIPSO
 * option (and removing as much padding as possible) and returns with
 * @new set to that header.
* */ static int calipso_opt_del(struct ipv6_opt_hdr *hop, struct ipv6_opt_hdr **new) { int ret_val; unsigned int start, end, delta, pad, hop_len; ret_val = calipso_opt_find(hop, &start, &end); if (ret_val) return ret_val; hop_len = ipv6_optlen(hop); if (start == sizeof(*hop) && end == hop_len) { /* There's no other option in the header so return NULL */ *new = NULL; return 0; } delta = (end - start) & ~7; *new = kzalloc(hop_len - delta, GFP_ATOMIC); if (!*new) return -ENOMEM; memcpy(*new, hop, start); (*new)->hdrlen -= delta / 8; pad = (end - start) & 7; calipso_pad_write((unsigned char *)*new, start, pad); if (end != hop_len) memcpy((char *)*new + start + pad, (char *)hop + end, hop_len - end); return 0; } /** * calipso_opt_getattr - Get the security attributes from a memory block * @calipso: the CALIPSO option * @secattr: the security attributes * * Description: * Inspect @calipso and return the security attributes in @secattr. * Returns zero on success and negative values on failure. * */ static int calipso_opt_getattr(const unsigned char *calipso, struct netlbl_lsm_secattr *secattr) { int ret_val = -ENOMSG; u32 doi, len = calipso[1], cat_len = calipso[6] * 4; struct calipso_doi *doi_def; if (cat_len + 8 > len) return -EINVAL; if (calipso_cache_check(calipso + 2, calipso[1], secattr) == 0) return 0; doi = get_unaligned_be32(calipso + 2); rcu_read_lock(); doi_def = calipso_doi_search(doi); if (!doi_def) goto getattr_return; secattr->attr.mls.lvl = calipso[7]; secattr->flags |= NETLBL_SECATTR_MLS_LVL; if (cat_len) { ret_val = calipso_map_cat_ntoh(doi_def, calipso + 10, cat_len, secattr); if (ret_val != 0) { netlbl_catmap_free(secattr->attr.mls.cat); goto getattr_return; } if (secattr->attr.mls.cat) secattr->flags |= NETLBL_SECATTR_MLS_CAT; } secattr->type = NETLBL_NLTYPE_CALIPSO; getattr_return: rcu_read_unlock(); return ret_val; } /* sock functions. 
*/

/**
 * calipso_sock_getattr - Get the security attributes from a sock
 * @sk: the sock
 * @secattr: the security attributes
 *
 * Description:
 * Scans the hop-by-hop options attached to @sk for a CALIPSO option and, if
 * one is present, decodes it into @secattr. This function requires that @sk
 * be locked, or privately held, but it does not do any locking itself.
 * Returns zero on success, -EAFNOSUPPORT if @sk has no IPv6 state, -ENOMSG if
 * no CALIPSO option is attached and other negative values on failure.
 *
 */
static int calipso_sock_getattr(struct sock *sk,
				struct netlbl_lsm_secattr *secattr)
{
	struct ipv6_pinfo *pinfo = inet6_sk(sk);
	struct ipv6_txoptions *txopts;
	struct ipv6_opt_hdr *hop;
	unsigned char *opt;
	int opt_len, len, offset;
	int ret_val = -ENOMSG;

	if (!pinfo)
		return -EAFNOSUPPORT;

	txopts = txopt_get(pinfo);
	if (txopts && txopts->hopopt) {
		hop = txopts->hopopt;
		opt = (unsigned char *)hop;
		opt_len = ipv6_optlen(hop);

		for (offset = sizeof(*hop); offset < opt_len; offset += len) {
			len = calipso_tlv_len(hop, offset);
			if (len < 0) {
				ret_val = len;
				break;
			}
			if (opt[offset] != IPV6_TLV_CALIPSO)
				continue;

			/* only the first CALIPSO option is considered */
			if (len < CALIPSO_HDR_LEN)
				ret_val = -EINVAL;
			else
				ret_val = calipso_opt_getattr(&opt[offset],
							      secattr);
			break;
		}
	}

	txopt_put(txopts);
	return ret_val;
}

/**
 * calipso_sock_setattr - Add a CALIPSO option to a socket
 * @sk: the socket
 * @doi_def: the CALIPSO DOI to use
 * @secattr: the specific security attributes of the socket
 *
 * Description:
 * Set the CALIPSO option on the given socket using the DOI definition and
 * security attributes passed to the function. This function requires
 * exclusive access to @sk, which means it either needs to be in the
 * process of being created or locked. Returns zero on success and negative
 * values on failure.
* */ static int calipso_sock_setattr(struct sock *sk, const struct calipso_doi *doi_def, const struct netlbl_lsm_secattr *secattr) { int ret_val; struct ipv6_opt_hdr *old, *new; struct ipv6_pinfo *pinfo = inet6_sk(sk); struct ipv6_txoptions *txopts; if (!pinfo) return -EAFNOSUPPORT; txopts = txopt_get(pinfo); old = NULL; if (txopts) old = txopts->hopopt; new = calipso_opt_insert(old, doi_def, secattr); txopt_put(txopts); if (IS_ERR(new)) return PTR_ERR(new); ret_val = calipso_opt_update(sk, new); kfree(new); return ret_val; } /** * calipso_sock_delattr - Delete the CALIPSO option from a socket * @sk: the socket * * Description: * Removes the CALIPSO option from a socket, if present. * */ static void calipso_sock_delattr(struct sock *sk) { struct ipv6_opt_hdr *new_hop; struct ipv6_pinfo *pinfo = inet6_sk(sk); struct ipv6_txoptions *txopts; if (!pinfo) return; txopts = txopt_get(pinfo); if (!txopts || !txopts->hopopt) goto done; if (calipso_opt_del(txopts->hopopt, &new_hop)) goto done; calipso_opt_update(sk, new_hop); kfree(new_hop); done: txopt_put(txopts); } /* request sock functions. */ /** * calipso_req_setattr - Add a CALIPSO option to a connection request socket * @req: the connection request socket * @doi_def: the CALIPSO DOI to use * @secattr: the specific security attributes of the socket * * Description: * Set the CALIPSO option on the given socket using the DOI definition and * security attributes passed to the function. Returns zero on success and * negative values on failure. 
* */ static int calipso_req_setattr(struct request_sock *req, const struct calipso_doi *doi_def, const struct netlbl_lsm_secattr *secattr) { struct ipv6_txoptions *txopts; struct inet_request_sock *req_inet = inet_rsk(req); struct ipv6_opt_hdr *old, *new; struct sock *sk = sk_to_full_sk(req_to_sk(req)); if (req_inet->ipv6_opt && req_inet->ipv6_opt->hopopt) old = req_inet->ipv6_opt->hopopt; else old = NULL; new = calipso_opt_insert(old, doi_def, secattr); if (IS_ERR(new)) return PTR_ERR(new); txopts = ipv6_renew_options(sk, req_inet->ipv6_opt, IPV6_HOPOPTS, new); kfree(new); if (IS_ERR(txopts)) return PTR_ERR(txopts); txopts = xchg(&req_inet->ipv6_opt, txopts); if (txopts) { atomic_sub(txopts->tot_len, &sk->sk_omem_alloc); txopt_put(txopts); } return 0; } /** * calipso_req_delattr - Delete the CALIPSO option from a request socket * @req: the request socket * * Description: * Removes the CALIPSO option from a request socket, if present. * */ static void calipso_req_delattr(struct request_sock *req) { struct inet_request_sock *req_inet = inet_rsk(req); struct ipv6_opt_hdr *new; struct ipv6_txoptions *txopts; struct sock *sk = sk_to_full_sk(req_to_sk(req)); if (!req_inet->ipv6_opt || !req_inet->ipv6_opt->hopopt) return; if (calipso_opt_del(req_inet->ipv6_opt->hopopt, &new)) return; /* Nothing to do */ txopts = ipv6_renew_options(sk, req_inet->ipv6_opt, IPV6_HOPOPTS, new); if (!IS_ERR(txopts)) { txopts = xchg(&req_inet->ipv6_opt, txopts); if (txopts) { atomic_sub(txopts->tot_len, &sk->sk_omem_alloc); txopt_put(txopts); } } kfree(new); } /* skbuff functions. */ /** * calipso_skbuff_optptr - Find the CALIPSO option in the packet * @skb: the packet * * Description: * Parse the packet's IP header looking for a CALIPSO option. Returns a pointer * to the start of the CALIPSO option on success, NULL if one if not found. 
* */
static unsigned char *calipso_skbuff_optptr(const struct sk_buff *skb)
{
	const struct ipv6hdr *ip6_hdr = ipv6_hdr(skb);
	int offset;

	/* Only packets whose IPv6 header is followed by a hop-by-hop header
	 * are searched.
	 */
	if (ip6_hdr->nexthdr != NEXTHDR_HOP)
		return NULL;

	offset = ipv6_find_tlv(skb, sizeof(*ip6_hdr), IPV6_TLV_CALIPSO);
	if (offset >= 0)
		return (unsigned char *)ip6_hdr + offset;

	return NULL;
}

/**
 * calipso_skbuff_setattr - Set the CALIPSO option on a packet
 * @skb: the packet
 * @doi_def: the CALIPSO DOI to use
 * @secattr: the security attributes
 *
 * Description:
 * Set the CALIPSO option on the given packet based on the security attributes.
 * Returns zero on success and negative values on failure (the error is the
 * one reported by calipso_opt_find(), calipso_genopt() or skb_cow()).
 *
 */
static int calipso_skbuff_setattr(struct sk_buff *skb,
				  const struct calipso_doi *doi_def,
				  const struct netlbl_lsm_secattr *secattr)
{
	int ret_val;
	struct ipv6hdr *ip6_hdr;
	struct ipv6_opt_hdr *hop;
	unsigned char buf[CALIPSO_MAX_BUFFER];
	int len_delta, new_end, pad, payload;
	unsigned int start, end;

	ip6_hdr = ipv6_hdr(skb);
	if (ip6_hdr->nexthdr == NEXTHDR_HOP) {
		hop = (struct ipv6_opt_hdr *)(ip6_hdr + 1);
		/* -ENOENT is tolerated here; any other error aborts.
		 * NOTE(review): presumably start/end are still filled in on
		 * -ENOENT - confirm against calipso_opt_find().
		 */
		ret_val = calipso_opt_find(hop, &start, &end);
		if (ret_val && ret_val != -ENOENT)
			return ret_val;
	} else {
		/* No hop-by-hop header in the packet yet. */
		start = 0;
		end = 0;
	}

	/* Build the replacement bytes in a zeroed scratch buffer. */
	memset(buf, 0, sizeof(buf));
	ret_val = calipso_genopt(buf, start & 3, sizeof(buf), doi_def, secattr);
	if (ret_val < 0)
		return ret_val;

	new_end = start + ret_val;
	/* At this point new_end aligns to 4n, so (new_end & 4) pads to 8n */
	pad = ((new_end & 4) + (end & 7)) & 7;
	/* Net change in header length; may be negative, zero or positive. */
	len_delta = new_end - (int)end + pad;
	ret_val = skb_cow(skb, skb_headroom(skb) + len_delta);
	if (ret_val < 0)
		return ret_val;

	ip6_hdr = ipv6_hdr(skb); /* Reset as skb_cow() may have moved it */

	if (len_delta) {
		if (len_delta > 0)
			skb_push(skb, len_delta);
		else
			skb_pull(skb, -len_delta);
		/* Slide the IPv6 header plus the first 'start' bytes that
		 * follow it by len_delta, then re-anchor the network header.
		 */
		memmove((char *)ip6_hdr - len_delta, ip6_hdr,
			sizeof(*ip6_hdr) + start);
		skb_reset_network_header(skb);
		ip6_hdr = ipv6_hdr(skb);
		payload = ntohs(ip6_hdr->payload_len);
		ip6_hdr->payload_len = htons(payload + len_delta);
	}

	hop = (struct ipv6_opt_hdr *)(ip6_hdr + 1);
	if (start == 0) {
		/* The front of buf is used as the hop-by-hop header itself:
		 * chain it in front of the previous next-header value.
		 */
		struct ipv6_opt_hdr *new_hop = (struct ipv6_opt_hdr *)buf;

		new_hop->nexthdr = ip6_hdr->nexthdr;
		new_hop->hdrlen = len_delta / 8 - 1;
		ip6_hdr->nexthdr = NEXTHDR_HOP;
	} else {
		hop->hdrlen += len_delta / 8;
	}
	memcpy((char *)hop + start, buf + (start & 3), new_end - start);
	calipso_pad_write((unsigned char *)hop, new_end, pad);

	return 0;
}

/**
 * calipso_skbuff_delattr - Delete any CALIPSO options from a packet
 * @skb: the packet
 *
 * Description:
 * Removes any and all CALIPSO options from the given packet.  Returns zero on
 * success, negative values on failure.
 *
 */
static int calipso_skbuff_delattr(struct sk_buff *skb)
{
	int ret_val;
	struct ipv6hdr *ip6_hdr;
	struct ipv6_opt_hdr *old_hop;
	u32 old_hop_len, start = 0, end = 0, delta, size, pad;

	/* Nothing to remove if the packet carries no CALIPSO option. */
	if (!calipso_skbuff_optptr(skb))
		return 0;

	/* since we are changing the packet we should make a copy */
	ret_val = skb_cow(skb, skb_headroom(skb));
	if (ret_val < 0)
		return ret_val;

	ip6_hdr = ipv6_hdr(skb);
	old_hop = (struct ipv6_opt_hdr *)(ip6_hdr + 1);
	old_hop_len = ipv6_optlen(old_hop);

	ret_val = calipso_opt_find(old_hop, &start, &end);
	if (ret_val)
		return ret_val;

	if (start == sizeof(*old_hop) && end == old_hop_len) {
		/* There's no other option in the header so we delete
		 * the whole thing. */
		delta = old_hop_len;
		size = sizeof(*ip6_hdr);
		ip6_hdr->nexthdr = old_hop->nexthdr;
	} else {
		/* Only whole 8-byte units are cut out of the header; the
		 * remaining (end - start) & 7 bytes are rewritten as padding.
		 */
		delta = (end - start) & ~7;
		if (delta)
			old_hop->hdrlen -= delta / 8;
		pad = (end - start) & 7;
		size = sizeof(*ip6_hdr) + start + pad;
		calipso_pad_write((unsigned char *)old_hop, start, pad);
	}

	if (delta) {
		/* Drop 'delta' bytes from the front and move the retained
		 * leading 'size' bytes up against what follows.
		 */
		skb_pull(skb, delta);
		memmove((char *)ip6_hdr + delta, ip6_hdr, size);
		skb_reset_network_header(skb);
	}

	return 0;
}

/* Operations table handed to netlbl_calipso_ops_register() by calipso_init(). */
static const struct netlbl_calipso_ops ops = {
	.doi_add          = calipso_doi_add,
	.doi_free         = calipso_doi_free,
	.doi_remove       = calipso_doi_remove,
	.doi_getdef       = calipso_doi_getdef,
	.doi_putdef       = calipso_doi_putdef,
	.doi_walk         = calipso_doi_walk,
	.sock_getattr     = calipso_sock_getattr,
	.sock_setattr     = calipso_sock_setattr,
	.sock_delattr     = calipso_sock_delattr,
	.req_setattr      = calipso_req_setattr,
	.req_delattr      = calipso_req_delattr,
	.opt_getattr      = calipso_opt_getattr,
	.skbuff_optptr    = calipso_skbuff_optptr,
	.skbuff_setattr   = calipso_skbuff_setattr,
	.skbuff_delattr   = calipso_skbuff_delattr,
	.cache_invalidate = calipso_cache_invalidate,
	.cache_add        = calipso_cache_add
};

/**
 * calipso_init - Initialize the CALIPSO module
 *
 * Description:
 * Initialize the CALIPSO module and prepare it for use.  The operations table
 * is registered only if calipso_cache_init() succeeds.  Returns zero on
 * success and negative values on failure.
 *
 */
int __init calipso_init(void)
{
	int ret_val;

	ret_val = calipso_cache_init();
	if (!ret_val)
		netlbl_calipso_ops_register(&ops);
	return ret_val;
}

/**
 * calipso_exit - Tear down the CALIPSO module
 *
 * Description:
 * Registers a NULL operations table, invalidates the cache and then frees
 * calipso_cache.
 *
 */
void calipso_exit(void)
{
	netlbl_calipso_ops_register(NULL);
	calipso_cache_invalidate();
	kfree(calipso_cache);
}
|
5 5 1 4 23 23 6 22 6 4 4 6 21 21 2 20 3 3 3 3 8 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C)2003,2004 USAGI/WIDE Project * * Authors Mitsuru KANDA <mk@linux-ipv6.org> * YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org> */ #define pr_fmt(fmt) "IPv6: " fmt #include <linux/icmpv6.h> #include <linux/init.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <net/ipv6.h> #include <net/protocol.h> #include <net/xfrm.h> static struct xfrm6_tunnel __rcu *tunnel6_handlers __read_mostly; static struct xfrm6_tunnel __rcu *tunnel46_handlers __read_mostly; static struct xfrm6_tunnel __rcu *tunnelmpls6_handlers __read_mostly; static DEFINE_MUTEX(tunnel6_mutex); static inline int xfrm6_tunnel_mpls_supported(void) { return IS_ENABLED(CONFIG_MPLS); } int xfrm6_tunnel_register(struct 
xfrm6_tunnel *handler, unsigned short family) { struct xfrm6_tunnel __rcu **pprev; struct xfrm6_tunnel *t; int ret = -EEXIST; int priority = handler->priority; mutex_lock(&tunnel6_mutex); switch (family) { case AF_INET6: pprev = &tunnel6_handlers; break; case AF_INET: pprev = &tunnel46_handlers; break; case AF_MPLS: pprev = &tunnelmpls6_handlers; break; default: goto err; } for (; (t = rcu_dereference_protected(*pprev, lockdep_is_held(&tunnel6_mutex))) != NULL; pprev = &t->next) { if (t->priority > priority) break; if (t->priority == priority) goto err; } handler->next = *pprev; rcu_assign_pointer(*pprev, handler); ret = 0; err: mutex_unlock(&tunnel6_mutex); return ret; } EXPORT_SYMBOL(xfrm6_tunnel_register); int xfrm6_tunnel_deregister(struct xfrm6_tunnel *handler, unsigned short family) { struct xfrm6_tunnel __rcu **pprev; struct xfrm6_tunnel *t; int ret = -ENOENT; mutex_lock(&tunnel6_mutex); switch (family) { case AF_INET6: pprev = &tunnel6_handlers; break; case AF_INET: pprev = &tunnel46_handlers; break; case AF_MPLS: pprev = &tunnelmpls6_handlers; break; default: goto err; } for (; (t = rcu_dereference_protected(*pprev, lockdep_is_held(&tunnel6_mutex))) != NULL; pprev = &t->next) { if (t == handler) { *pprev = handler->next; ret = 0; break; } } err: mutex_unlock(&tunnel6_mutex); synchronize_net(); return ret; } EXPORT_SYMBOL(xfrm6_tunnel_deregister); #define for_each_tunnel_rcu(head, handler) \ for (handler = rcu_dereference(head); \ handler != NULL; \ handler = rcu_dereference(handler->next)) \ static int tunnelmpls6_rcv(struct sk_buff *skb) { struct xfrm6_tunnel *handler; if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) goto drop; for_each_tunnel_rcu(tunnelmpls6_handlers, handler) if (!handler->handler(skb)) return 0; icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); drop: kfree_skb(skb); return 0; } static int tunnel6_rcv(struct sk_buff *skb) { struct xfrm6_tunnel *handler; if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) goto drop; 
for_each_tunnel_rcu(tunnel6_handlers, handler) if (!handler->handler(skb)) return 0; icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); drop: kfree_skb(skb); return 0; } #if IS_ENABLED(CONFIG_INET6_XFRM_TUNNEL) static int tunnel6_rcv_cb(struct sk_buff *skb, u8 proto, int err) { struct xfrm6_tunnel __rcu *head; struct xfrm6_tunnel *handler; int ret; head = (proto == IPPROTO_IPV6) ? tunnel6_handlers : tunnel46_handlers; for_each_tunnel_rcu(head, handler) { if (handler->cb_handler) { ret = handler->cb_handler(skb, err); if (ret <= 0) return ret; } } return 0; } static const struct xfrm_input_afinfo tunnel6_input_afinfo = { .family = AF_INET6, .is_ipip = true, .callback = tunnel6_rcv_cb, }; #endif static int tunnel46_rcv(struct sk_buff *skb) { struct xfrm6_tunnel *handler; if (!pskb_may_pull(skb, sizeof(struct iphdr))) goto drop; for_each_tunnel_rcu(tunnel46_handlers, handler) if (!handler->handler(skb)) return 0; icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); drop: kfree_skb(skb); return 0; } static int tunnel6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { struct xfrm6_tunnel *handler; for_each_tunnel_rcu(tunnel6_handlers, handler) if (!handler->err_handler(skb, opt, type, code, offset, info)) return 0; return -ENOENT; } static int tunnel46_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { struct xfrm6_tunnel *handler; for_each_tunnel_rcu(tunnel46_handlers, handler) if (!handler->err_handler(skb, opt, type, code, offset, info)) return 0; return -ENOENT; } static int tunnelmpls6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { struct xfrm6_tunnel *handler; for_each_tunnel_rcu(tunnelmpls6_handlers, handler) if (!handler->err_handler(skb, opt, type, code, offset, info)) return 0; return -ENOENT; } static const struct inet6_protocol tunnel6_protocol = { .handler = tunnel6_rcv, .err_handler = 
tunnel6_err, .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, }; static const struct inet6_protocol tunnel46_protocol = { .handler = tunnel46_rcv, .err_handler = tunnel46_err, .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, }; static const struct inet6_protocol tunnelmpls6_protocol = { .handler = tunnelmpls6_rcv, .err_handler = tunnelmpls6_err, .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, }; static int __init tunnel6_init(void) { if (inet6_add_protocol(&tunnel6_protocol, IPPROTO_IPV6)) { pr_err("%s: can't add protocol\n", __func__); return -EAGAIN; } if (inet6_add_protocol(&tunnel46_protocol, IPPROTO_IPIP)) { pr_err("%s: can't add protocol\n", __func__); inet6_del_protocol(&tunnel6_protocol, IPPROTO_IPV6); return -EAGAIN; } if (xfrm6_tunnel_mpls_supported() && inet6_add_protocol(&tunnelmpls6_protocol, IPPROTO_MPLS)) { pr_err("%s: can't add protocol\n", __func__); inet6_del_protocol(&tunnel6_protocol, IPPROTO_IPV6); inet6_del_protocol(&tunnel46_protocol, IPPROTO_IPIP); return -EAGAIN; } #if IS_ENABLED(CONFIG_INET6_XFRM_TUNNEL) if (xfrm_input_register_afinfo(&tunnel6_input_afinfo)) { pr_err("%s: can't add input afinfo\n", __func__); inet6_del_protocol(&tunnel6_protocol, IPPROTO_IPV6); inet6_del_protocol(&tunnel46_protocol, IPPROTO_IPIP); if (xfrm6_tunnel_mpls_supported()) inet6_del_protocol(&tunnelmpls6_protocol, IPPROTO_MPLS); return -EAGAIN; } #endif return 0; } static void __exit tunnel6_fini(void) { #if IS_ENABLED(CONFIG_INET6_XFRM_TUNNEL) if (xfrm_input_unregister_afinfo(&tunnel6_input_afinfo)) pr_err("%s: can't remove input afinfo\n", __func__); #endif if (inet6_del_protocol(&tunnel46_protocol, IPPROTO_IPIP)) pr_err("%s: can't remove protocol\n", __func__); if (inet6_del_protocol(&tunnel6_protocol, IPPROTO_IPV6)) pr_err("%s: can't remove protocol\n", __func__); if (xfrm6_tunnel_mpls_supported() && inet6_del_protocol(&tunnelmpls6_protocol, IPPROTO_MPLS)) pr_err("%s: can't remove protocol\n", __func__); } module_init(tunnel6_init); module_exit(tunnel6_fini); 
MODULE_DESCRIPTION("IP-in-IPv6 tunnel driver"); MODULE_LICENSE("GPL"); |
17 19 18 1 18 1 17 19 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 | // SPDX-License-Identifier: GPL-2.0-only /* * Module for modifying the secmark field of the skb, for use by * security subsystems. * * Based on the nfmark match by: * (C) 1999-2001 Marc Boucher <marc@mbsi.ca> * * (C) 2006,2008 Red Hat, Inc., James Morris <jmorris@redhat.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/security.h> #include <linux/skbuff.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_SECMARK.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("James Morris <jmorris@redhat.com>"); MODULE_DESCRIPTION("Xtables: packet security mark modification"); MODULE_ALIAS("ipt_SECMARK"); MODULE_ALIAS("ip6t_SECMARK"); static u8 mode; static unsigned int secmark_tg(struct sk_buff *skb, const struct xt_secmark_target_info_v1 *info) { u32 secmark = 0; switch (mode) { case SECMARK_MODE_SEL: secmark = info->secid; break; default: BUG(); } skb->secmark = secmark; return XT_CONTINUE; } static int checkentry_lsm(struct xt_secmark_target_info_v1 *info) { int err; info->secctx[SECMARK_SECCTX_MAX - 1] = '\0'; info->secid = 0; err = security_secctx_to_secid(info->secctx, strlen(info->secctx), &info->secid); if (err) { if (err == -EINVAL) 
pr_info_ratelimited("invalid security context \'%s\'\n", info->secctx); return err; } if (!info->secid) { pr_info_ratelimited("unable to map security context \'%s\'\n", info->secctx); return -ENOENT; } err = security_secmark_relabel_packet(info->secid); if (err) { pr_info_ratelimited("unable to obtain relabeling permission\n"); return err; } security_secmark_refcount_inc(); return 0; } static int secmark_tg_check(const char *table, struct xt_secmark_target_info_v1 *info) { int err; if (strcmp(table, "mangle") != 0 && strcmp(table, "security") != 0) { pr_info_ratelimited("only valid in \'mangle\' or \'security\' table, not \'%s\'\n", table); return -EINVAL; } if (mode && mode != info->mode) { pr_info_ratelimited("mode already set to %hu cannot mix with rules for mode %hu\n", mode, info->mode); return -EINVAL; } switch (info->mode) { case SECMARK_MODE_SEL: break; default: pr_info_ratelimited("invalid mode: %hu\n", info->mode); return -EINVAL; } err = checkentry_lsm(info); if (err) return err; if (!mode) mode = info->mode; return 0; } static void secmark_tg_destroy(const struct xt_tgdtor_param *par) { switch (mode) { case SECMARK_MODE_SEL: security_secmark_refcount_dec(); } } static int secmark_tg_check_v0(const struct xt_tgchk_param *par) { struct xt_secmark_target_info *info = par->targinfo; struct xt_secmark_target_info_v1 newinfo = { .mode = info->mode, }; int ret; memcpy(newinfo.secctx, info->secctx, SECMARK_SECCTX_MAX); ret = secmark_tg_check(par->table, &newinfo); info->secid = newinfo.secid; return ret; } static unsigned int secmark_tg_v0(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_secmark_target_info *info = par->targinfo; struct xt_secmark_target_info_v1 newinfo = { .secid = info->secid, }; return secmark_tg(skb, &newinfo); } static int secmark_tg_check_v1(const struct xt_tgchk_param *par) { return secmark_tg_check(par->table, par->targinfo); } static unsigned int secmark_tg_v1(struct sk_buff *skb, const struct xt_action_param 
*par) { return secmark_tg(skb, par->targinfo); } static struct xt_target secmark_tg_reg[] __read_mostly = { { .name = "SECMARK", .revision = 0, .family = NFPROTO_IPV4, .checkentry = secmark_tg_check_v0, .destroy = secmark_tg_destroy, .target = secmark_tg_v0, .targetsize = sizeof(struct xt_secmark_target_info), .me = THIS_MODULE, }, { .name = "SECMARK", .revision = 1, .family = NFPROTO_IPV4, .checkentry = secmark_tg_check_v1, .destroy = secmark_tg_destroy, .target = secmark_tg_v1, .targetsize = sizeof(struct xt_secmark_target_info_v1), .usersize = offsetof(struct xt_secmark_target_info_v1, secid), .me = THIS_MODULE, }, #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) { .name = "SECMARK", .revision = 0, .family = NFPROTO_IPV6, .checkentry = secmark_tg_check_v0, .destroy = secmark_tg_destroy, .target = secmark_tg_v0, .targetsize = sizeof(struct xt_secmark_target_info), .me = THIS_MODULE, }, { .name = "SECMARK", .revision = 1, .family = NFPROTO_IPV6, .checkentry = secmark_tg_check_v1, .destroy = secmark_tg_destroy, .target = secmark_tg_v1, .targetsize = sizeof(struct xt_secmark_target_info_v1), .usersize = offsetof(struct xt_secmark_target_info_v1, secid), .me = THIS_MODULE, }, #endif }; static int __init secmark_tg_init(void) { return xt_register_targets(secmark_tg_reg, ARRAY_SIZE(secmark_tg_reg)); } static void __exit secmark_tg_exit(void) { xt_unregister_targets(secmark_tg_reg, ARRAY_SIZE(secmark_tg_reg)); } module_init(secmark_tg_init); module_exit(secmark_tg_exit); |
50 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 
526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 
1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 | /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* * This file holds USB constants and structures that are needed for * USB device APIs. These are used by the USB device model, which is * defined in chapter 9 of the USB 2.0 specification and in the * Wireless USB 1.0 spec (now defunct). Linux has several APIs in C that * need these: * * - the master/host side Linux-USB kernel driver API; * - the "usbfs" user space API; and * - the Linux "gadget" slave/device/peripheral side driver API. 
* * USB 2.0 adds an additional "On The Go" (OTG) mode, which lets systems * act either as a USB master/host or as a USB slave/device. That means * the master and slave side APIs benefit from working well together. * * Note all descriptors are declared '__attribute__((packed))' so that: * * [a] they never get padded, either internally (USB spec writers * probably handled that) or externally; * * [b] so that accessing bigger-than-a-bytes fields will never * generate bus errors on any platform, even when the location of * its descriptor inside a bundle isn't "naturally aligned", and * * [c] for consistency, removing all doubt even when it appears to * someone that the two other points are non-issues for that * particular descriptor type. */ #ifndef _UAPI__LINUX_USB_CH9_H #define _UAPI__LINUX_USB_CH9_H #include <linux/types.h> /* __u8 etc */ #include <asm/byteorder.h> /* le16_to_cpu */ /*-------------------------------------------------------------------------*/ /* CONTROL REQUEST SUPPORT */ /* * USB directions * * This bit flag is used in endpoint descriptors' bEndpointAddress field. * It's also one of three fields in control requests bRequestType. */ #define USB_DIR_OUT 0 /* to device */ #define USB_DIR_IN 0x80 /* to host */ /* * USB types, the second of three bRequestType fields */ #define USB_TYPE_MASK (0x03 << 5) #define USB_TYPE_STANDARD (0x00 << 5) #define USB_TYPE_CLASS (0x01 << 5) #define USB_TYPE_VENDOR (0x02 << 5) #define USB_TYPE_RESERVED (0x03 << 5) /* * USB recipients, the third of three bRequestType fields */ #define USB_RECIP_MASK 0x1f #define USB_RECIP_DEVICE 0x00 #define USB_RECIP_INTERFACE 0x01 #define USB_RECIP_ENDPOINT 0x02 #define USB_RECIP_OTHER 0x03 /* From Wireless USB 1.0 */ #define USB_RECIP_PORT 0x04 #define USB_RECIP_RPIPE 0x05 /* * Standard requests, for the bRequest field of a SETUP packet. 
* * These are qualified by the bRequestType field, so that for example * TYPE_CLASS or TYPE_VENDOR specific feature flags could be retrieved * by a GET_STATUS request. */ #define USB_REQ_GET_STATUS 0x00 #define USB_REQ_CLEAR_FEATURE 0x01 #define USB_REQ_SET_FEATURE 0x03 #define USB_REQ_SET_ADDRESS 0x05 #define USB_REQ_GET_DESCRIPTOR 0x06 #define USB_REQ_SET_DESCRIPTOR 0x07 #define USB_REQ_GET_CONFIGURATION 0x08 #define USB_REQ_SET_CONFIGURATION 0x09 #define USB_REQ_GET_INTERFACE 0x0A #define USB_REQ_SET_INTERFACE 0x0B #define USB_REQ_SYNCH_FRAME 0x0C #define USB_REQ_SET_SEL 0x30 #define USB_REQ_SET_ISOCH_DELAY 0x31 #define USB_REQ_SET_ENCRYPTION 0x0D /* Wireless USB */ #define USB_REQ_GET_ENCRYPTION 0x0E #define USB_REQ_RPIPE_ABORT 0x0E #define USB_REQ_SET_HANDSHAKE 0x0F #define USB_REQ_RPIPE_RESET 0x0F #define USB_REQ_GET_HANDSHAKE 0x10 #define USB_REQ_SET_CONNECTION 0x11 #define USB_REQ_SET_SECURITY_DATA 0x12 #define USB_REQ_GET_SECURITY_DATA 0x13 #define USB_REQ_SET_WUSB_DATA 0x14 #define USB_REQ_LOOPBACK_DATA_WRITE 0x15 #define USB_REQ_LOOPBACK_DATA_READ 0x16 #define USB_REQ_SET_INTERFACE_DS 0x17 /* specific requests for USB Power Delivery */ #define USB_REQ_GET_PARTNER_PDO 20 #define USB_REQ_GET_BATTERY_STATUS 21 #define USB_REQ_SET_PDO 22 #define USB_REQ_GET_VDM 23 #define USB_REQ_SEND_VDM 24 /* The Link Power Management (LPM) ECN defines USB_REQ_TEST_AND_SET command, * used by hubs to put ports into a new L1 suspend state, except that it * forgot to define its number ... */ /* * USB feature flags are written using USB_REQ_{CLEAR,SET}_FEATURE, and * are read as a bit array returned by USB_REQ_GET_STATUS. (So there * are at most sixteen features of each type.) Hubs may also support a * new USB_REQ_TEST_AND_SET_FEATURE to put ports into L1 suspend. 
*/ #define USB_DEVICE_SELF_POWERED 0 /* (read only) */ #define USB_DEVICE_REMOTE_WAKEUP 1 /* dev may initiate wakeup */ #define USB_DEVICE_TEST_MODE 2 /* (wired high speed only) */ #define USB_DEVICE_BATTERY 2 /* (wireless) */ #define USB_DEVICE_B_HNP_ENABLE 3 /* (otg) dev may initiate HNP */ #define USB_DEVICE_WUSB_DEVICE 3 /* (wireless)*/ #define USB_DEVICE_A_HNP_SUPPORT 4 /* (otg) RH port supports HNP */ #define USB_DEVICE_A_ALT_HNP_SUPPORT 5 /* (otg) other RH port does */ #define USB_DEVICE_DEBUG_MODE 6 /* (special devices only) */ /* * Test Mode Selectors * See USB 2.0 spec Table 9-7 */ #define USB_TEST_J 1 #define USB_TEST_K 2 #define USB_TEST_SE0_NAK 3 #define USB_TEST_PACKET 4 #define USB_TEST_FORCE_ENABLE 5 /* Status Type */ #define USB_STATUS_TYPE_STANDARD 0 #define USB_STATUS_TYPE_PTM 1 /* * New Feature Selectors as added by USB 3.0 * See USB 3.0 spec Table 9-7 */ #define USB_DEVICE_U1_ENABLE 48 /* dev may initiate U1 transition */ #define USB_DEVICE_U2_ENABLE 49 /* dev may initiate U2 transition */ #define USB_DEVICE_LTM_ENABLE 50 /* dev may send LTM */ #define USB_INTRF_FUNC_SUSPEND 0 /* function suspend */ #define USB_INTR_FUNC_SUSPEND_OPT_MASK 0xFF00 /* * Suspend Options, Table 9-8 USB 3.0 spec */ #define USB_INTRF_FUNC_SUSPEND_LP (1 << (8 + 0)) #define USB_INTRF_FUNC_SUSPEND_RW (1 << (8 + 1)) /* * Interface status, Figure 9-5 USB 3.0 spec */ #define USB_INTRF_STAT_FUNC_RW_CAP 1 #define USB_INTRF_STAT_FUNC_RW 2 #define USB_ENDPOINT_HALT 0 /* IN/OUT will STALL */ /* Bit array elements as returned by the USB_REQ_GET_STATUS request. 
*/
#define USB_DEV_STAT_U1_ENABLED		2	/* transition into U1 state */
#define USB_DEV_STAT_U2_ENABLED		3	/* transition into U2 state */
#define USB_DEV_STAT_LTM_ENABLED	4	/* Latency tolerance messages */

/*
 * Feature selectors from Table 9-8 USB Power Delivery spec
 *
 * NOTE(review): the values 48, 49 and 50 below are numerically equal to
 * USB_DEVICE_U1_ENABLE, USB_DEVICE_U2_ENABLE and USB_DEVICE_LTM_ENABLE
 * defined earlier in this file; presumably the request recipient tells
 * them apart - confirm against the specs.
 */
#define USB_DEVICE_BATTERY_WAKE_MASK	40
#define USB_DEVICE_OS_IS_PD_AWARE	41
#define USB_DEVICE_POLICY_MODE		42
#define USB_PORT_PR_SWAP		43
#define USB_PORT_GOTO_MIN		44
#define USB_PORT_RETURN_POWER		45
#define USB_PORT_ACCEPT_PD_REQUEST	46
#define USB_PORT_REJECT_PD_REQUEST	47
#define USB_PORT_PORT_PD_RESET		48
#define USB_PORT_C_PORT_PD_CHANGE	49
#define USB_PORT_CABLE_PD_RESET		50
#define USB_DEVICE_CHARGING_POLICY	54

/**
 * struct usb_ctrlrequest - SETUP data for a USB device control request
 * @bRequestType: matches the USB bmRequestType field
 * @bRequest: matches the USB bRequest field
 * @wValue: matches the USB wValue field (le16 byte order)
 * @wIndex: matches the USB wIndex field (le16 byte order)
 * @wLength: matches the USB wLength field (le16 byte order)
 *
 * This structure is used to send control requests to a USB device.  It matches
 * the different fields of the USB 2.0 Spec section 9.3, table 9-2.  See the
 * USB spec for a fuller description of the different fields, and what they are
 * used for.
 *
 * Note that the driver for any interface can issue control requests.
 * For most devices, interfaces don't coordinate with each other, so
 * such requests may be made at any time.
 *
 * The structure is packed: two one-byte fields followed by three
 * little-endian 16-bit fields.
 */
struct usb_ctrlrequest {
	__u8 bRequestType;
	__u8 bRequest;
	__le16 wValue;
	__le16 wIndex;
	__le16 wLength;
} __attribute__ ((packed));

/*-------------------------------------------------------------------------*/

/*
 * STANDARD DESCRIPTORS ... as returned by GET_DESCRIPTOR, or
 * (rarely) accepted by SET_DESCRIPTOR.
 *
 * Note that all multi-byte values here are encoded in little endian
 * byte order "on the wire".
Within the kernel and when exposed * through the Linux-USB APIs, they are not converted to cpu byte * order; it is the responsibility of the client code to do this. * The single exception is when device and configuration descriptors (but * not other descriptors) are read from character devices * (i.e. /dev/bus/usb/BBB/DDD); * in this case the fields are converted to host endianness by the kernel. */ /* * Descriptor types ... USB 2.0 spec table 9.5 */ #define USB_DT_DEVICE 0x01 #define USB_DT_CONFIG 0x02 #define USB_DT_STRING 0x03 #define USB_DT_INTERFACE 0x04 #define USB_DT_ENDPOINT 0x05 #define USB_DT_DEVICE_QUALIFIER 0x06 #define USB_DT_OTHER_SPEED_CONFIG 0x07 #define USB_DT_INTERFACE_POWER 0x08 /* these are from a minor usb 2.0 revision (ECN) */ #define USB_DT_OTG 0x09 #define USB_DT_DEBUG 0x0a #define USB_DT_INTERFACE_ASSOCIATION 0x0b /* these are from the Wireless USB spec */ #define USB_DT_SECURITY 0x0c #define USB_DT_KEY 0x0d #define USB_DT_ENCRYPTION_TYPE 0x0e #define USB_DT_BOS 0x0f #define USB_DT_DEVICE_CAPABILITY 0x10 #define USB_DT_WIRELESS_ENDPOINT_COMP 0x11 /* From the eUSB2 spec */ #define USB_DT_EUSB2_ISOC_ENDPOINT_COMP 0x12 /* From Wireless USB spec */ #define USB_DT_WIRE_ADAPTER 0x21 /* From USB Device Firmware Upgrade Specification, Revision 1.1 */ #define USB_DT_DFU_FUNCTIONAL 0x21 /* these are from the Wireless USB spec */ #define USB_DT_RPIPE 0x22 #define USB_DT_CS_RADIO_CONTROL 0x23 /* From the T10 UAS specification */ #define USB_DT_PIPE_USAGE 0x24 /* From the USB 3.0 spec */ #define USB_DT_SS_ENDPOINT_COMP 0x30 /* From the USB 3.1 spec */ #define USB_DT_SSP_ISOC_ENDPOINT_COMP 0x31 /* Conventional codes for class-specific descriptors. The convention is * defined in the USB "Common Class" Spec (3.11). Individual class specs * are authoritative for their usage, not the "common class" writeup. 
*/
/* Each value is USB_TYPE_CLASS OR-ed with the matching standard type code. */
#define USB_DT_CS_DEVICE		(USB_TYPE_CLASS | USB_DT_DEVICE)
#define USB_DT_CS_CONFIG		(USB_TYPE_CLASS | USB_DT_CONFIG)
#define USB_DT_CS_STRING		(USB_TYPE_CLASS | USB_DT_STRING)
#define USB_DT_CS_INTERFACE		(USB_TYPE_CLASS | USB_DT_INTERFACE)
#define USB_DT_CS_ENDPOINT		(USB_TYPE_CLASS | USB_DT_ENDPOINT)

/* All standard descriptors have these 2 fields at the beginning */
struct usb_descriptor_header {
	__u8  bLength;
	__u8  bDescriptorType;
} __attribute__ ((packed));


/*-------------------------------------------------------------------------*/

/*
 * USB_DT_DEVICE: Device descriptor
 *
 * Packed layout: ten one-byte fields plus four little-endian 16-bit
 * fields (bcdUSB, idVendor, idProduct, bcdDevice), i.e. 18 bytes, which
 * is the value of USB_DT_DEVICE_SIZE below.
 */
struct usb_device_descriptor {
	__u8  bLength;
	__u8  bDescriptorType;

	__le16 bcdUSB;
	__u8  bDeviceClass;
	__u8  bDeviceSubClass;
	__u8  bDeviceProtocol;
	__u8  bMaxPacketSize0;
	__le16 idVendor;
	__le16 idProduct;
	__le16 bcdDevice;
	__u8  iManufacturer;
	__u8  iProduct;
	__u8  iSerialNumber;
	__u8  bNumConfigurations;
} __attribute__ ((packed));

#define USB_DT_DEVICE_SIZE		18


/*
 * Device and/or Interface Class codes
 * as found in bDeviceClass or bInterfaceClass
 * and defined by www.usb.org documents
 *
 * Note that the list is not in strictly ascending numeric order
 * (0xe0 is listed between 0x0e and 0x0f).
 */
#define USB_CLASS_PER_INTERFACE		0	/* for DeviceClass */
#define USB_CLASS_AUDIO			1
#define USB_CLASS_COMM			2
#define USB_CLASS_HID			3
#define USB_CLASS_PHYSICAL		5
#define USB_CLASS_STILL_IMAGE		6
#define USB_CLASS_PRINTER		7
#define USB_CLASS_MASS_STORAGE		8
#define USB_CLASS_HUB			9
#define USB_CLASS_CDC_DATA		0x0a
#define USB_CLASS_CSCID			0x0b	/* chip+ smart card */
#define USB_CLASS_CONTENT_SEC		0x0d	/* content security */
#define USB_CLASS_VIDEO			0x0e
#define USB_CLASS_WIRELESS_CONTROLLER	0xe0
#define USB_CLASS_PERSONAL_HEALTHCARE	0x0f
#define USB_CLASS_AUDIO_VIDEO		0x10
#define USB_CLASS_BILLBOARD		0x11
#define USB_CLASS_USB_TYPE_C_BRIDGE	0x12
#define USB_CLASS_MCTP			0x14
#define USB_CLASS_MISC			0xef
#define USB_CLASS_APP_SPEC		0xfe
/* NOTE(review): a subclass code; presumably used with USB_CLASS_APP_SPEC - confirm */
#define USB_SUBCLASS_DFU			0x01

#define USB_CLASS_VENDOR_SPEC		0xff
#define USB_SUBCLASS_VENDOR_SPEC	0xff

/*-------------------------------------------------------------------------*/

/* USB_DT_CONFIG:
Configuration descriptor information.
 *
 * USB_DT_OTHER_SPEED_CONFIG is the same descriptor, except that the
 * descriptor type is different.  Highspeed-capable devices can look
 * different depending on what speed they're currently running.  Only
 * devices with a USB_DT_DEVICE_QUALIFIER have any OTHER_SPEED_CONFIG
 * descriptors.
 */
struct usb_config_descriptor {
	__u8  bLength;
	__u8  bDescriptorType;

	__le16 wTotalLength;
	__u8  bNumInterfaces;
	__u8  bConfigurationValue;
	__u8  iConfiguration;
	__u8  bmAttributes;
	__u8  bMaxPower;
} __attribute__ ((packed));

/* Packed size of struct usb_config_descriptor: seven 1-byte fields and one 2-byte field */
#define USB_DT_CONFIG_SIZE		9

/* from config descriptor bmAttributes */
#define USB_CONFIG_ATT_ONE		(1 << 7)	/* must be set */
#define USB_CONFIG_ATT_SELFPOWER	(1 << 6)	/* self powered */
#define USB_CONFIG_ATT_WAKEUP		(1 << 5)	/* can wakeup */
#define USB_CONFIG_ATT_BATTERY		(1 << 4)	/* battery powered */

/*-------------------------------------------------------------------------*/

/* USB String descriptors can contain at most 126 characters. */
#define USB_MAX_STRING_LEN	126

/* USB_DT_STRING: String descriptor */
struct usb_string_descriptor {
	__u8  bLength;
	__u8  bDescriptorType;

	/*
	 * Anonymous union: a single __le16 placeholder overlaid with the
	 * flexible array of string code units.
	 */
	union {
		__le16 legacy_padding;
		__DECLARE_FLEX_ARRAY(__le16, wData);	/* UTF-16LE encoded */
	};
} __attribute__ ((packed));

/* note that "string" zero is special, it holds language codes that
 * the device supports, not Unicode characters.
*/

/*-------------------------------------------------------------------------*/

/* USB_DT_INTERFACE: Interface descriptor */
struct usb_interface_descriptor {
	__u8  bLength;
	__u8  bDescriptorType;

	__u8  bInterfaceNumber;
	__u8  bAlternateSetting;
	__u8  bNumEndpoints;
	__u8  bInterfaceClass;
	__u8  bInterfaceSubClass;
	__u8  bInterfaceProtocol;
	__u8  iInterface;
} __attribute__ ((packed));

/* Packed size of struct usb_interface_descriptor: nine 1-byte fields */
#define USB_DT_INTERFACE_SIZE		9

/*-------------------------------------------------------------------------*/

/* USB_DT_ENDPOINT: Endpoint descriptor */
struct usb_endpoint_descriptor {
	__u8  bLength;
	__u8  bDescriptorType;

	__u8  bEndpointAddress;
	__u8  bmAttributes;
	__le16 wMaxPacketSize;
	__u8  bInterval;

	/* NOTE:  these two are _only_ in audio endpoints. */
	/* use USB_DT_ENDPOINT*_SIZE in bLength, not sizeof. */
	__u8  bRefresh;
	__u8  bSynchAddress;
} __attribute__ ((packed));

/* 7 = fields up to and including bInterval; 9 adds bRefresh and bSynchAddress */
#define USB_DT_ENDPOINT_SIZE		7
#define USB_DT_ENDPOINT_AUDIO_SIZE	9	/* Audio extension */


/*
 * Endpoints
 */
#define USB_ENDPOINT_NUMBER_MASK	0x0f	/* in bEndpointAddress */
#define USB_ENDPOINT_DIR_MASK		0x80

#define USB_ENDPOINT_XFERTYPE_MASK	0x03	/* in bmAttributes */
#define USB_ENDPOINT_XFER_CONTROL	0
#define USB_ENDPOINT_XFER_ISOC		1
#define USB_ENDPOINT_XFER_BULK		2
#define USB_ENDPOINT_XFER_INT		3
#define USB_ENDPOINT_MAX_ADJUSTABLE	0x80

/* wMaxPacketSize layout: bits 10:0 are the packet size, bits 12:11 the multiplier field */
#define USB_ENDPOINT_MAXP_MASK	0x07ff
#define USB_EP_MAXP_MULT_SHIFT	11
#define USB_EP_MAXP_MULT_MASK	(3 << USB_EP_MAXP_MULT_SHIFT)
/* Extract the 2-bit multiplier field (0..3) from a CPU-order wMaxPacketSize value */
#define USB_EP_MAXP_MULT(m) \
	(((m) & USB_EP_MAXP_MULT_MASK) >> USB_EP_MAXP_MULT_SHIFT)

/* The USB 3.0 spec redefines bits 5:4 of bmAttributes as interrupt ep type.
*/ #define USB_ENDPOINT_INTRTYPE 0x30 #define USB_ENDPOINT_INTR_PERIODIC (0 << 4) #define USB_ENDPOINT_INTR_NOTIFICATION (1 << 4) #define USB_ENDPOINT_SYNCTYPE 0x0c #define USB_ENDPOINT_SYNC_NONE (0 << 2) #define USB_ENDPOINT_SYNC_ASYNC (1 << 2) #define USB_ENDPOINT_SYNC_ADAPTIVE (2 << 2) #define USB_ENDPOINT_SYNC_SYNC (3 << 2) #define USB_ENDPOINT_USAGE_MASK 0x30 #define USB_ENDPOINT_USAGE_DATA 0x00 #define USB_ENDPOINT_USAGE_FEEDBACK 0x10 #define USB_ENDPOINT_USAGE_IMPLICIT_FB 0x20 /* Implicit feedback Data endpoint */ /*-------------------------------------------------------------------------*/ /** * usb_endpoint_num - get the endpoint's number * @epd: endpoint to be checked * * Returns @epd's number: 0 to 15. */ static inline int usb_endpoint_num(const struct usb_endpoint_descriptor *epd) { return epd->bEndpointAddress & USB_ENDPOINT_NUMBER_MASK; } /** * usb_endpoint_type - get the endpoint's transfer type * @epd: endpoint to be checked * * Returns one of USB_ENDPOINT_XFER_{CONTROL, ISOC, BULK, INT} according * to @epd's transfer type. */ static inline int usb_endpoint_type(const struct usb_endpoint_descriptor *epd) { return epd->bmAttributes & USB_ENDPOINT_XFERTYPE_MASK; } /** * usb_endpoint_dir_in - check if the endpoint has IN direction * @epd: endpoint to be checked * * Returns true if the endpoint is of type IN, otherwise it returns false. */ static inline int usb_endpoint_dir_in(const struct usb_endpoint_descriptor *epd) { return ((epd->bEndpointAddress & USB_ENDPOINT_DIR_MASK) == USB_DIR_IN); } /** * usb_endpoint_dir_out - check if the endpoint has OUT direction * @epd: endpoint to be checked * * Returns true if the endpoint is of type OUT, otherwise it returns false. 
*/ static inline int usb_endpoint_dir_out( const struct usb_endpoint_descriptor *epd) { return ((epd->bEndpointAddress & USB_ENDPOINT_DIR_MASK) == USB_DIR_OUT); } /** * usb_endpoint_xfer_bulk - check if the endpoint has bulk transfer type * @epd: endpoint to be checked * * Returns true if the endpoint is of type bulk, otherwise it returns false. */ static inline int usb_endpoint_xfer_bulk( const struct usb_endpoint_descriptor *epd) { return ((epd->bmAttributes & USB_ENDPOINT_XFERTYPE_MASK) == USB_ENDPOINT_XFER_BULK); } /** * usb_endpoint_xfer_control - check if the endpoint has control transfer type * @epd: endpoint to be checked * * Returns true if the endpoint is of type control, otherwise it returns false. */ static inline int usb_endpoint_xfer_control( const struct usb_endpoint_descriptor *epd) { return ((epd->bmAttributes & USB_ENDPOINT_XFERTYPE_MASK) == USB_ENDPOINT_XFER_CONTROL); } /** * usb_endpoint_xfer_int - check if the endpoint has interrupt transfer type * @epd: endpoint to be checked * * Returns true if the endpoint is of type interrupt, otherwise it returns * false. */ static inline int usb_endpoint_xfer_int( const struct usb_endpoint_descriptor *epd) { return ((epd->bmAttributes & USB_ENDPOINT_XFERTYPE_MASK) == USB_ENDPOINT_XFER_INT); } /** * usb_endpoint_xfer_isoc - check if the endpoint has isochronous transfer type * @epd: endpoint to be checked * * Returns true if the endpoint is of type isochronous, otherwise it returns * false. */ static inline int usb_endpoint_xfer_isoc( const struct usb_endpoint_descriptor *epd) { return ((epd->bmAttributes & USB_ENDPOINT_XFERTYPE_MASK) == USB_ENDPOINT_XFER_ISOC); } /** * usb_endpoint_is_bulk_in - check if the endpoint is bulk IN * @epd: endpoint to be checked * * Returns true if the endpoint has bulk transfer type and IN direction, * otherwise it returns false. 
*/ static inline int usb_endpoint_is_bulk_in( const struct usb_endpoint_descriptor *epd) { return usb_endpoint_xfer_bulk(epd) && usb_endpoint_dir_in(epd); } /** * usb_endpoint_is_bulk_out - check if the endpoint is bulk OUT * @epd: endpoint to be checked * * Returns true if the endpoint has bulk transfer type and OUT direction, * otherwise it returns false. */ static inline int usb_endpoint_is_bulk_out( const struct usb_endpoint_descriptor *epd) { return usb_endpoint_xfer_bulk(epd) && usb_endpoint_dir_out(epd); } /** * usb_endpoint_is_int_in - check if the endpoint is interrupt IN * @epd: endpoint to be checked * * Returns true if the endpoint has interrupt transfer type and IN direction, * otherwise it returns false. */ static inline int usb_endpoint_is_int_in( const struct usb_endpoint_descriptor *epd) { return usb_endpoint_xfer_int(epd) && usb_endpoint_dir_in(epd); } /** * usb_endpoint_is_int_out - check if the endpoint is interrupt OUT * @epd: endpoint to be checked * * Returns true if the endpoint has interrupt transfer type and OUT direction, * otherwise it returns false. */ static inline int usb_endpoint_is_int_out( const struct usb_endpoint_descriptor *epd) { return usb_endpoint_xfer_int(epd) && usb_endpoint_dir_out(epd); } /** * usb_endpoint_is_isoc_in - check if the endpoint is isochronous IN * @epd: endpoint to be checked * * Returns true if the endpoint has isochronous transfer type and IN direction, * otherwise it returns false. */ static inline int usb_endpoint_is_isoc_in( const struct usb_endpoint_descriptor *epd) { return usb_endpoint_xfer_isoc(epd) && usb_endpoint_dir_in(epd); } /** * usb_endpoint_is_isoc_out - check if the endpoint is isochronous OUT * @epd: endpoint to be checked * * Returns true if the endpoint has isochronous transfer type and OUT direction, * otherwise it returns false. 
*/ static inline int usb_endpoint_is_isoc_out( const struct usb_endpoint_descriptor *epd) { return usb_endpoint_xfer_isoc(epd) && usb_endpoint_dir_out(epd); } /** * usb_endpoint_maxp - get endpoint's max packet size * @epd: endpoint to be checked * * Returns @epd's max packet bits [10:0] */ static inline int usb_endpoint_maxp(const struct usb_endpoint_descriptor *epd) { return __le16_to_cpu(epd->wMaxPacketSize) & USB_ENDPOINT_MAXP_MASK; } /** * usb_endpoint_maxp_mult - get endpoint's transactional opportunities * @epd: endpoint to be checked * * Return @epd's wMaxPacketSize[12:11] + 1 */ static inline int usb_endpoint_maxp_mult(const struct usb_endpoint_descriptor *epd) { int maxp = __le16_to_cpu(epd->wMaxPacketSize); return USB_EP_MAXP_MULT(maxp) + 1; } static inline int usb_endpoint_interrupt_type( const struct usb_endpoint_descriptor *epd) { return epd->bmAttributes & USB_ENDPOINT_INTRTYPE; } /*-------------------------------------------------------------------------*/ /* USB_DT_EUSB2_ISOC_ENDPOINT_COMP: eUSB2 Isoch Endpoint Companion descriptor */ struct usb_eusb2_isoc_ep_comp_descriptor { __u8 bLength; __u8 bDescriptorType; __le16 wMaxPacketSize; __le32 dwBytesPerInterval; } __attribute__ ((packed)); #define USB_DT_EUSB2_ISOC_EP_COMP_SIZE 8 /*-------------------------------------------------------------------------*/ /* USB_DT_SSP_ISOC_ENDPOINT_COMP: SuperSpeedPlus Isochronous Endpoint Companion * descriptor */ struct usb_ssp_isoc_ep_comp_descriptor { __u8 bLength; __u8 bDescriptorType; __le16 wReseved; __le32 dwBytesPerInterval; } __attribute__ ((packed)); #define USB_DT_SSP_ISOC_EP_COMP_SIZE 8 /*-------------------------------------------------------------------------*/ /* USB_DT_SS_ENDPOINT_COMP: SuperSpeed Endpoint Companion descriptor */ struct usb_ss_ep_comp_descriptor { __u8 bLength; __u8 bDescriptorType; __u8 bMaxBurst; __u8 bmAttributes; __le16 wBytesPerInterval; } __attribute__ ((packed)); #define USB_DT_SS_EP_COMP_SIZE 6 /* Bits 4:0 of bmAttributes 
if this is a bulk endpoint */ static inline int usb_ss_max_streams(const struct usb_ss_ep_comp_descriptor *comp) { int max_streams; if (!comp) return 0; max_streams = comp->bmAttributes & 0x1f; if (!max_streams) return 0; max_streams = 1 << max_streams; return max_streams; } /* Bits 1:0 of bmAttributes if this is an isoc endpoint */ #define USB_SS_MULT(p) (1 + ((p) & 0x3)) /* Bit 7 of bmAttributes if a SSP isoc endpoint companion descriptor exists */ #define USB_SS_SSP_ISOC_COMP(p) ((p) & (1 << 7)) /*-------------------------------------------------------------------------*/ /* USB_DT_DEVICE_QUALIFIER: Device Qualifier descriptor */ struct usb_qualifier_descriptor { __u8 bLength; __u8 bDescriptorType; __le16 bcdUSB; __u8 bDeviceClass; __u8 bDeviceSubClass; __u8 bDeviceProtocol; __u8 bMaxPacketSize0; __u8 bNumConfigurations; __u8 bRESERVED; } __attribute__ ((packed)); /*-------------------------------------------------------------------------*/ /* USB_DT_OTG (from OTG 1.0a supplement) */ struct usb_otg_descriptor { __u8 bLength; __u8 bDescriptorType; __u8 bmAttributes; /* support for HNP, SRP, etc */ } __attribute__ ((packed)); /* USB_DT_OTG (from OTG 2.0 supplement) */ struct usb_otg20_descriptor { __u8 bLength; __u8 bDescriptorType; __u8 bmAttributes; /* support for HNP, SRP and ADP, etc */ __le16 bcdOTG; /* OTG and EH supplement release number * in binary-coded decimal(i.e. 
2.0 is 0200H)
				 */
} __attribute__ ((packed));

/* from usb_otg_descriptor.bmAttributes */
#define USB_OTG_SRP		(1 << 0)
#define USB_OTG_HNP		(1 << 1)	/* swap host/device roles */
#define USB_OTG_ADP		(1 << 2)	/* support ADP */
/* OTG 3.0 */
#define USB_OTG_RSP		(1 << 3)	/* support RSP */

#define OTG_STS_SELECTOR	0xF000		/* OTG status selector */
/*-------------------------------------------------------------------------*/

/* USB_DT_DEBUG:  for special highspeed devices, replacing serial console */
struct usb_debug_descriptor {
	__u8  bLength;
	__u8  bDescriptorType;

	/* bulk endpoints with 8 byte maxpacket */
	__u8  bDebugInEndpoint;
	__u8  bDebugOutEndpoint;
} __attribute__((packed));

/*-------------------------------------------------------------------------*/

/* USB_DT_INTERFACE_ASSOCIATION: groups interfaces */
struct usb_interface_assoc_descriptor {
	__u8  bLength;
	__u8  bDescriptorType;

	__u8  bFirstInterface;
	__u8  bInterfaceCount;
	__u8  bFunctionClass;
	__u8  bFunctionSubClass;
	__u8  bFunctionProtocol;
	__u8  iFunction;
} __attribute__ ((packed));

/* Packed size of struct usb_interface_assoc_descriptor: eight 1-byte fields */
#define USB_DT_INTERFACE_ASSOCIATION_SIZE	8

/*-------------------------------------------------------------------------*/

/* USB_DT_SECURITY:  group of wireless security descriptors, including
 * encryption types available for setting up a CC/association.
 */
struct usb_security_descriptor {
	__u8  bLength;
	__u8  bDescriptorType;

	__le16 wTotalLength;
	__u8  bNumEncryptionTypes;
} __attribute__((packed));

/*-------------------------------------------------------------------------*/

/* USB_DT_KEY:  used with {GET,SET}_SECURITY_DATA; only public keys
 * may be retrieved.
*/
struct usb_key_descriptor {
	__u8  bLength;
	__u8  bDescriptorType;

	__u8  tTKID[3];
	__u8  bReserved;
	/* Flexible array member: variable-length data follows the fixed fields */
	__u8  bKeyData[];
} __attribute__((packed));

/*-------------------------------------------------------------------------*/

/* USB_DT_ENCRYPTION_TYPE:  bundled in DT_SECURITY groups */
struct usb_encryption_descriptor {
	__u8  bLength;
	__u8  bDescriptorType;

	__u8  bEncryptionType;
/* Values for bEncryptionType */
#define	USB_ENC_TYPE_UNSECURE		0
#define	USB_ENC_TYPE_WIRED		1	/* non-wireless mode */
#define	USB_ENC_TYPE_CCM_1		2	/* aes128/cbc session */
#define	USB_ENC_TYPE_RSA_1		3	/* rsa3072/sha1 auth */
	__u8  bEncryptionValue;		/* use in SET_ENCRYPTION */
	__u8  bAuthKeyIndex;
} __attribute__((packed));


/*-------------------------------------------------------------------------*/

/* USB_DT_BOS:  group of device-level capabilities */
struct usb_bos_descriptor {
	__u8  bLength;
	__u8  bDescriptorType;

	__le16 wTotalLength;
	__u8  bNumDeviceCaps;
} __attribute__((packed));

/* Packed size of struct usb_bos_descriptor: 1 + 1 + 2 + 1 */
#define USB_DT_BOS_SIZE		5
/*-------------------------------------------------------------------------*/

/* USB_DT_DEVICE_CAPABILITY:  grouped with BOS */
/* The capability descriptors that follow all begin with these three fields */
struct usb_dev_cap_header {
	__u8  bLength;
	__u8  bDescriptorType;
	__u8  bDevCapabilityType;
} __attribute__((packed));

#define	USB_CAP_TYPE_WIRELESS_USB	1

struct usb_wireless_cap_descriptor {	/* Ultra Wide Band */
	__u8  bLength;
	__u8  bDescriptorType;
	__u8  bDevCapabilityType;

	__u8  bmAttributes;
#define	USB_WIRELESS_P2P_DRD		(1 << 1)
/* Two-bit beacon field at bits 3:2; SELF/DIRECTED/NONE are its values */
#define	USB_WIRELESS_BEACON_MASK	(3 << 2)
#define	USB_WIRELESS_BEACON_SELF	(1 << 2)
#define	USB_WIRELESS_BEACON_DIRECTED	(2 << 2)
#define	USB_WIRELESS_BEACON_NONE	(3 << 2)
	__le16 wPHYRates;	/* bit rates, Mbps */
#define	USB_WIRELESS_PHY_53		(1 << 0)	/* always set */
#define	USB_WIRELESS_PHY_80		(1 << 1)
#define	USB_WIRELESS_PHY_107		(1 << 2)	/* always set */
#define	USB_WIRELESS_PHY_160		(1 << 3)
#define	USB_WIRELESS_PHY_200		(1 << 4)	/* always set */
#define	USB_WIRELESS_PHY_320		(1 << 5)
#define	USB_WIRELESS_PHY_400		(1 << 6)
#define	USB_WIRELESS_PHY_480		(1 << 7)
	__u8  bmTFITXPowerInfo;	/*
TFI power levels */ __u8 bmFFITXPowerInfo; /* FFI power levels */ __le16 bmBandGroup; __u8 bReserved; } __attribute__((packed)); #define USB_DT_USB_WIRELESS_CAP_SIZE 11 /* USB 2.0 Extension descriptor */ #define USB_CAP_TYPE_EXT 2 struct usb_ext_cap_descriptor { /* Link Power Management */ __u8 bLength; __u8 bDescriptorType; __u8 bDevCapabilityType; __le32 bmAttributes; #define USB_LPM_SUPPORT (1 << 1) /* supports LPM */ #define USB_BESL_SUPPORT (1 << 2) /* supports BESL */ #define USB_BESL_BASELINE_VALID (1 << 3) /* Baseline BESL valid*/ #define USB_BESL_DEEP_VALID (1 << 4) /* Deep BESL valid */ #define USB_SET_BESL_BASELINE(p) (((p) & 0xf) << 8) #define USB_SET_BESL_DEEP(p) (((p) & 0xf) << 12) #define USB_GET_BESL_BASELINE(p) (((p) & (0xf << 8)) >> 8) #define USB_GET_BESL_DEEP(p) (((p) & (0xf << 12)) >> 12) } __attribute__((packed)); #define USB_DT_USB_EXT_CAP_SIZE 7 /* * SuperSpeed USB Capability descriptor: Defines the set of SuperSpeed USB * specific device level capabilities */ #define USB_SS_CAP_TYPE 3 struct usb_ss_cap_descriptor { /* Link Power Management */ __u8 bLength; __u8 bDescriptorType; __u8 bDevCapabilityType; __u8 bmAttributes; #define USB_LTM_SUPPORT (1 << 1) /* supports LTM */ __le16 wSpeedSupported; #define USB_LOW_SPEED_OPERATION (1) /* Low speed operation */ #define USB_FULL_SPEED_OPERATION (1 << 1) /* Full speed operation */ #define USB_HIGH_SPEED_OPERATION (1 << 2) /* High speed operation */ #define USB_5GBPS_OPERATION (1 << 3) /* Operation at 5Gbps */ __u8 bFunctionalitySupport; __u8 bU1devExitLat; __le16 bU2DevExitLat; } __attribute__((packed)); #define USB_DT_USB_SS_CAP_SIZE 10 /* * Container ID Capability descriptor: Defines the instance unique ID used to * identify the instance across all operating modes */ #define CONTAINER_ID_TYPE 4 struct usb_ss_container_id_descriptor { __u8 bLength; __u8 bDescriptorType; __u8 bDevCapabilityType; __u8 bReserved; __u8 ContainerID[16]; /* 128-bit number */ } __attribute__((packed)); #define 
USB_DT_USB_SS_CONTN_ID_SIZE 20 /* * Platform Device Capability descriptor: Defines platform specific device * capabilities */ #define USB_PLAT_DEV_CAP_TYPE 5 struct usb_plat_dev_cap_descriptor { __u8 bLength; __u8 bDescriptorType; __u8 bDevCapabilityType; __u8 bReserved; __u8 UUID[16]; __u8 CapabilityData[]; } __attribute__((packed)); #define USB_DT_USB_PLAT_DEV_CAP_SIZE(capability_data_size) (20 + capability_data_size) /* * SuperSpeed Plus USB Capability descriptor: Defines the set of * SuperSpeed Plus USB specific device level capabilities */ #define USB_SSP_CAP_TYPE 0xa struct usb_ssp_cap_descriptor { __u8 bLength; __u8 bDescriptorType; __u8 bDevCapabilityType; __u8 bReserved; __le32 bmAttributes; #define USB_SSP_SUBLINK_SPEED_ATTRIBS (0x1f << 0) /* sublink speed entries */ #define USB_SSP_SUBLINK_SPEED_IDS (0xf << 5) /* speed ID entries */ __le16 wFunctionalitySupport; #define USB_SSP_MIN_SUBLINK_SPEED_ATTRIBUTE_ID (0xf) #define USB_SSP_MIN_RX_LANE_COUNT (0xf << 8) #define USB_SSP_MIN_TX_LANE_COUNT (0xf << 12) __le16 wReserved; union { __le32 legacy_padding; /* list of sublink speed attrib entries */ __DECLARE_FLEX_ARRAY(__le32, bmSublinkSpeedAttr); }; #define USB_SSP_SUBLINK_SPEED_SSID (0xf) /* sublink speed ID */ #define USB_SSP_SUBLINK_SPEED_LSE (0x3 << 4) /* Lanespeed exponent */ #define USB_SSP_SUBLINK_SPEED_LSE_BPS 0 #define USB_SSP_SUBLINK_SPEED_LSE_KBPS 1 #define USB_SSP_SUBLINK_SPEED_LSE_MBPS 2 #define USB_SSP_SUBLINK_SPEED_LSE_GBPS 3 #define USB_SSP_SUBLINK_SPEED_ST (0x3 << 6) /* Sublink type */ #define USB_SSP_SUBLINK_SPEED_ST_SYM_RX 0 #define USB_SSP_SUBLINK_SPEED_ST_ASYM_RX 1 #define USB_SSP_SUBLINK_SPEED_ST_SYM_TX 2 #define USB_SSP_SUBLINK_SPEED_ST_ASYM_TX 3 #define USB_SSP_SUBLINK_SPEED_RSVD (0x3f << 8) /* Reserved */ #define USB_SSP_SUBLINK_SPEED_LP (0x3 << 14) /* Link protocol */ #define USB_SSP_SUBLINK_SPEED_LP_SS 0 #define USB_SSP_SUBLINK_SPEED_LP_SSP 1 #define USB_SSP_SUBLINK_SPEED_LSM (0xff << 16) /* Lanespeed mantissa */ } 
__attribute__((packed));

/*
 * USB Power Delivery Capability Descriptor:
 * Defines capabilities for PD
 */
/* Defines the various PD Capabilities of this device */
#define USB_PD_POWER_DELIVERY_CAPABILITY	0x06
/* Provides information on each battery supported by the device */
#define USB_PD_BATTERY_INFO_CAPABILITY		0x07
/* The Consumer characteristics of a Port on the device */
#define USB_PD_PD_CONSUMER_PORT_CAPABILITY	0x08
/* The provider characteristics of a Port on the device */
#define USB_PD_PD_PROVIDER_PORT_CAPABILITY	0x09

struct usb_pd_cap_descriptor {
	__u8  bLength;
	__u8  bDescriptorType;
	__u8  bDevCapabilityType; /* set to USB_PD_POWER_DELIVERY_CAPABILITY */
	__u8  bReserved;
	__le32 bmAttributes;
/* Flag bits for bmAttributes (bits 0, 7 and 10-13 have no flag defined here) */
#define USB_PD_CAP_BATTERY_CHARGING	(1 << 1) /* supports Battery Charging specification */
#define USB_PD_CAP_USB_PD		(1 << 2) /* supports USB Power Delivery specification */
#define USB_PD_CAP_PROVIDER		(1 << 3) /* can provide power */
#define USB_PD_CAP_CONSUMER		(1 << 4) /* can consume power */
#define USB_PD_CAP_CHARGING_POLICY	(1 << 5) /* supports CHARGING_POLICY feature */
#define USB_PD_CAP_TYPE_C_CURRENT	(1 << 6) /* supports power capabilities defined in the USB Type-C Specification */

#define USB_PD_CAP_PWR_AC		(1 << 8)
#define USB_PD_CAP_PWR_BAT		(1 << 9)
#define USB_PD_CAP_PWR_USE_V_BUS	(1 << 14)

	__le16 bmProviderPorts; /* Bit zero refers to the UFP of the device */
	__le16 bmConsumerPorts;
	__le16 bcdBCVersion;
	__le16 bcdPDVersion;
	__le16 bcdUSBTypeCVersion;
} __attribute__((packed));

/* NOTE(review): presumably used with USB_PD_BATTERY_INFO_CAPABILITY - confirm */
struct usb_pd_cap_battery_info_descriptor {
	__u8  bLength;
	__u8  bDescriptorType;
	__u8  bDevCapabilityType;
	/* Index of string descriptor shall contain the user friendly name for this battery */
	__u8  iBattery;
	/* Index of string descriptor shall contain the Serial Number String for this battery */
	__u8  iSerial;
	__u8  iManufacturer;
	__u8  bBatteryId; /* uniquely identifies this battery in status Messages */
	__u8  bReserved;
	/*
	 * Shall contain the Battery Charge value above which this
battery is considered to be fully charged but not necessarily
	 * "topped off."
	 */
	__le32 dwChargedThreshold; /* in mWh */
	/*
	 * Shall contain the minimum charge level of this battery such
	 * that above this threshold, a device can be assured of being
	 * able to power up successfully (see Battery Charging 1.2).
	 */
	__le32 dwWeakThreshold; /* in mWh */
	__le32 dwBatteryDesignCapacity; /* in mWh */
	__le32 dwBatteryLastFullchargeCapacity; /* in mWh */
} __attribute__((packed));

/* NOTE(review): presumably used with USB_PD_PD_CONSUMER_PORT_CAPABILITY - confirm */
struct usb_pd_cap_consumer_port_descriptor {
	__u8  bLength;
	__u8  bDescriptorType;
	__u8  bDevCapabilityType;
	__u8  bReserved;
	__u8  bmCapabilities;
/* port will operate under: */
#define USB_PD_CAP_CONSUMER_BC		(1 << 0) /* BC */
#define USB_PD_CAP_CONSUMER_PD		(1 << 1) /* PD */
#define USB_PD_CAP_CONSUMER_TYPE_C	(1 << 2) /* USB Type-C Current */
	__le16 wMinVoltage; /* in 50mV units */
	__le16 wMaxVoltage; /* in 50mV units */
	/* NOTE(review): declared __u16 while the neighbouring 16-bit fields are __le16 */
	__u16 wReserved;
	__le32 dwMaxOperatingPower; /* in 10 mW - operating at steady state */
	__le32 dwMaxPeakPower; /* in 10mW units - operating at peak power */
	__le32 dwMaxPeakPowerTime; /* in 100ms units - duration of peak */
#define USB_PD_CAP_CONSUMER_UNKNOWN_PEAK_POWER_TIME 0xffff
} __attribute__((packed));

/* NOTE(review): presumably used with USB_PD_PD_PROVIDER_PORT_CAPABILITY - confirm */
struct usb_pd_cap_provider_port_descriptor {
	__u8  bLength;
	__u8  bDescriptorType;
	__u8  bDevCapabilityType;
	__u8  bReserved1;
	__u8  bmCapabilities;
/* port will operate under: */
#define USB_PD_CAP_PROVIDER_BC		(1 << 0) /* BC */
#define USB_PD_CAP_PROVIDER_PD		(1 << 1) /* PD */
#define USB_PD_CAP_PROVIDER_TYPE_C	(1 << 2) /* USB Type-C Current */
	__u8  bNumOfPDObjects;
	__u8  bReserved2;
	/* Flexible array member; presumably bNumOfPDObjects entries - confirm */
	__le32 wPowerDataObject[];
} __attribute__((packed));

/*
 * Precision time measurement capability descriptor: advertised by devices and
 * hubs that support PTM
 */
#define	USB_PTM_CAP_TYPE	0xb
struct usb_ptm_cap_descriptor {
	__u8  bLength;
	__u8  bDescriptorType;
	__u8  bDevCapabilityType;
} __attribute__((packed));

/* Packed size of struct usb_ptm_cap_descriptor: three 1-byte fields */
#define USB_DT_USB_PTM_ID_SIZE		3
/*
 * The size of the descriptor for the Sublink Speed Attribute Count
(SSAC) specified in bmAttributes[4:0]. SSAC is zero-based */ #define USB_DT_USB_SSP_CAP_SIZE(ssac) (12 + (ssac + 1) * 4) /*-------------------------------------------------------------------------*/ /* USB_DT_WIRELESS_ENDPOINT_COMP: companion descriptor associated with * each endpoint descriptor for a wireless device */ struct usb_wireless_ep_comp_descriptor { __u8 bLength; __u8 bDescriptorType; __u8 bMaxBurst; __u8 bMaxSequence; __le16 wMaxStreamDelay; __le16 wOverTheAirPacketSize; __u8 bOverTheAirInterval; __u8 bmCompAttributes; #define USB_ENDPOINT_SWITCH_MASK 0x03 /* in bmCompAttributes */ #define USB_ENDPOINT_SWITCH_NO 0 #define USB_ENDPOINT_SWITCH_SWITCH 1 #define USB_ENDPOINT_SWITCH_SCALE 2 } __attribute__((packed)); /*-------------------------------------------------------------------------*/ /* USB_REQ_SET_HANDSHAKE is a four-way handshake used between a wireless * host and a device for connection set up, mutual authentication, and * exchanging short lived session keys. The handshake depends on a CC. */ struct usb_handshake { __u8 bMessageNumber; __u8 bStatus; __u8 tTKID[3]; __u8 bReserved; __u8 CDID[16]; __u8 nonce[16]; __u8 MIC[8]; } __attribute__((packed)); /*-------------------------------------------------------------------------*/ /* USB_REQ_SET_CONNECTION modifies or revokes a connection context (CC). * A CC may also be set up using non-wireless secure channels (including * wired USB!), and some devices may support CCs with multiple hosts. 
*/
/* Three 16-byte values, 48 bytes in total */
struct usb_connection_context {
	__u8 CHID[16];		/* persistent host id */
	__u8 CDID[16];		/* device id (unique w/in host context) */
	__u8 CK[16];		/* connection key */
} __attribute__((packed));

/*-------------------------------------------------------------------------*/

/* USB 2.0 defines three speeds, here's how Linux identifies them */
/*
 * Besides the low/full/high speeds the enum also carries an "unknown"
 * value plus wireless, SuperSpeed and SuperSpeedPlus entries.
 */

enum usb_device_speed {
	USB_SPEED_UNKNOWN = 0,			/* enumerating */
	USB_SPEED_LOW, USB_SPEED_FULL,		/* usb 1.1 */
	USB_SPEED_HIGH,				/* usb 2.0 */
	USB_SPEED_WIRELESS,			/* wireless (usb 2.5) */
	USB_SPEED_SUPER,			/* usb 3.0 */
	USB_SPEED_SUPER_PLUS,			/* usb 3.1 */
};


enum usb_device_state {
	/* NOTATTACHED isn't in the USB spec, and this state acts
	 * the same as ATTACHED ... but it's clearer this way.
	 */
	USB_STATE_NOTATTACHED = 0,

	/* chapter 9 and authentication (wireless) device states */
	USB_STATE_ATTACHED,
	USB_STATE_POWERED,			/* wired */
	USB_STATE_RECONNECTING,			/* auth */
	USB_STATE_UNAUTHENTICATED,		/* auth */
	USB_STATE_DEFAULT,			/* limited function */
	USB_STATE_ADDRESS,
	USB_STATE_CONFIGURED,			/* most functions */

	USB_STATE_SUSPENDED

	/* NOTE:  there are actually four different SUSPENDED
	 * states, returning to POWERED, DEFAULT, ADDRESS, or
	 * CONFIGURED respectively when SOF tokens flow again.
	 * At this level there's no difference between L1 and L2
	 * suspend states.  (L2 being original USB 1.1 suspend.)
	 */
};

/* USB 3 link power states; enumerators number U0..U3 as 0..3 */
enum usb3_link_state {
	USB3_LPM_U0 = 0,
	USB3_LPM_U1,
	USB3_LPM_U2,
	USB3_LPM_U3
};

/*
 * A U1 timeout of 0x0 means the parent hub will reject any transitions to U1.
 * 0xff means the parent hub will accept transitions to U1, but will not
 * initiate a transition.
 *
 * A U1 timeout of 0x1 to 0x7F also causes the hub to initiate a transition to
 * U1 after that many microseconds.  Timeouts of 0x80 to 0xFE are reserved
 * values.
 *
 * A U2 timeout of 0x0 means the parent hub will reject any transitions to U2.
 * 0xff means the parent hub will accept transitions to U2, but will not
 * initiate a transition.
*
 * A U2 timeout of 0x1 to 0xFE also causes the hub to initiate a transition to
 * U2 after N*256 microseconds.  Therefore a U2 timeout value of 0x1 means a U2
 * idle timer of 256 microseconds, 0x2 means 512 microseconds, 0xFE means
 * 65.024ms.
 */
#define USB3_LPM_DISABLED		0x0
#define USB3_LPM_U1_MAX_TIMEOUT		0x7F
#define USB3_LPM_U2_MAX_TIMEOUT		0xFE
#define USB3_LPM_DEVICE_INITIATED	0xFF

/* Packed 6-byte payload: two 1-byte U1 values followed by two little-endian 16-bit U2 values */
struct usb_set_sel_req {
	__u8	u1_sel;
	__u8	u1_pel;
	__le16	u2_sel;
	__le16	u2_pel;
} __attribute__ ((packed));

/*
 * The Set System Exit Latency control transfer provides one byte each for
 * U1 SEL and U1 PEL, so the max exit latency is 0xFF.  U2 SEL and U2 PEL each
 * are two bytes long.
 */
#define USB3_LPM_MAX_U1_SEL_PEL		0xFF
#define USB3_LPM_MAX_U2_SEL_PEL		0xFFFF

/*-------------------------------------------------------------------------*/

/*
 * As per USB compliance update, a device that is actively drawing
 * more than 100mA from USB must report itself as bus-powered in
 * the GetStatus(DEVICE) call.
 * https://compliance.usb.org/index.asp?UpdateFile=Electrical&Format=Standard#34
 */
/* Presumably expressed in mA, matching the 100mA limit described above */
#define USB_SELF_POWER_VBUS_MAX_DRAW		100

#endif /* _UAPI__LINUX_USB_CH9_H */ |
616 8 290 151 277 290 349 1 426 426 6 6 6 6 161 201 213 118 393 18 21 391 78 34 592 73 525 10 213 197 62 392 5 6 391 21 139 6 9 416 9 382 10 3 3 8 4 10 681 16 620 148 695 681 15 392 507 138 339 418 685 12 467 430 689 178 607 14 199 14 199 692 695 408 16 81 63 18 424 418 426 416 426 418 8 318 199 424 291 133 80 76 5 9 1 8 77 5 81 81 81 452 442 9 115 337 329 103 24 423 2 177 44 1 39 8 3288 3269 223 43 230 35 233 30 238 66 219 595 594 594 590 42 42 42 532 60 26 556 613 366 562 564 611 426 391 75 610 8 614 610 613 599 56 19 392 392 426 598 15 614 424 393 579 237 613 613 611 612 595 168 413 556 43 70 70 118 48 76 26 26 1 26 1 2 24 26 26 26 91 100 1 32 32 32 32 32 32 32 32 32 23 28 32 21 22 4 4 1 23 23 23 23 23 4 23 310 510 527 264 575 8 580 345 9 343 336 9 344 2 1 2 246 162 53 288 76 233 236 4 78 187 13 38 14 133 238 20 8 109 25 289 66 52 47 29 29 28 91 128 90 10 3 100 99 100 100 99 100 30 70 43 36 27 28 9 6 86 85 86 315 263 23 8 19 2 46 236 262 1 262 2 2 115 206 970 234 166 306 1083 758 901 757 85 243 29 3 48 86 314 47 315 313 251 285 29 286 283 78 105 274 279 240 22 101 48 66 246 100 263 262 290 95 274 137 290 78 278 38 2 289 152 374 301 256 7 128 191 70 245 247 247 44 37 13 13 2 12 3 2 1 4 314 290 69 46 46 394 272 123 392 4 393 93 93 367 16 21 31 91 1 2 2 2 2 11 36 35 34 36 11 34 11 2 50 50 50 44 4 4 41 1 42 1 26 41 41 22 41 29 36 43 43 43 43 43 29 40 30 39 30 39 47 47 40 47 46 40 27 19 16 46 46 2 40 26 40 47 38 549 124 568 571 76 77 7 70 70 26 26 25 4 26 145 9 135 136 145 81 8 77 81 80 1 7 77 80 81 81 81 81 520 520 11 521 521 520 2 519 519 4 521 521 425 97 425 255 197 57 57 5 19 33 32 5 32 38 5 33 32 16 233 218 61 521 2 521 96 425 425 256 168 418 16 24 133 109 92 57 57 109 108 3 46 335 1 335 217 219 30 30 18 30 7 7 7 7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 
87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 
590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 
1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 
1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 
1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 
2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 
2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 
3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 
3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 
3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 
4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 | // SPDX-License-Identifier: GPL-2.0-only /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Implementation of the Transmission Control Protocol(TCP). * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Mark Evans, <evansmp@uhura.aston.ac.uk> * Corey Minyard <wf-rch!minyard@relay.EU.net> * Florian La Roche, <flla@stud.uni-sb.de> * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> * Linus Torvalds, <torvalds@cs.helsinki.fi> * Alan Cox, <gw4pts@gw4pts.ampr.org> * Matthew Dillon, <dillon@apollo.west.oic.com> * Arnt Gulbrandsen, <agulbra@nvg.unit.no> * Jorge Cwik, <jorge@laser.satlink.net> */ /* * Changes: Pedro Roque : Retransmit queue handled by TCP. * : Fragmentation on mtu decrease * : Segment collapse on retransmit * : AF independence * * Linus Torvalds : send_delayed_ack * David S. Miller : Charge memory using the right skb * during syn/ack processing. * David S. Miller : Output engine completely rewritten. * Andrea Arcangeli: SYNACK carry ts_recent in tsecr. 
* Cacophonix Gaul : draft-minshall-nagle-01 * J Hadi Salim : ECN support * */ #define pr_fmt(fmt) "TCP: " fmt #include <net/tcp.h> #include <net/mptcp.h> #include <net/proto_memory.h> #include <linux/compiler.h> #include <linux/gfp.h> #include <linux/module.h> #include <linux/static_key.h> #include <linux/skbuff_ref.h> #include <trace/events/tcp.h> /* Refresh clocks of a TCP socket, * ensuring monotically increasing values. */ void tcp_mstamp_refresh(struct tcp_sock *tp) { u64 val = tcp_clock_ns(); tp->tcp_clock_cache = val; tp->tcp_mstamp = div_u64(val, NSEC_PER_USEC); } static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, int push_one, gfp_t gfp); /* Account for new data that has been sent to the network. */ static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); unsigned int prior_packets = tp->packets_out; WRITE_ONCE(tp->snd_nxt, TCP_SKB_CB(skb)->end_seq); __skb_unlink(skb, &sk->sk_write_queue); tcp_rbtree_insert(&sk->tcp_rtx_queue, skb); if (tp->highest_sack == NULL) tp->highest_sack = skb; tp->packets_out += tcp_skb_pcount(skb); if (!prior_packets || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) tcp_rearm_rto(sk); NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT, tcp_skb_pcount(skb)); tcp_check_space(sk); } /* SND.NXT, if window was not shrunk or the amount of shrunk was less than one * window scaling factor due to loss of precision. * If window has been shrunk, what should we make? It is not clear at all. * Using SND.UNA we will fail to open window, SND.NXT is out of window. :-( * Anything in between SND.UNA...SND.UNA+SND.WND also can be already * invalid. 
OK, let's make this for now: */ static inline __u32 tcp_acceptable_seq(const struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); if (!before(tcp_wnd_end(tp), tp->snd_nxt) || (tp->rx_opt.wscale_ok && ((tp->snd_nxt - tcp_wnd_end(tp)) < (1 << tp->rx_opt.rcv_wscale)))) return tp->snd_nxt; else return tcp_wnd_end(tp); } /* Calculate mss to advertise in SYN segment. * RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that: * * 1. It is independent of path mtu. * 2. Ideally, it is maximal possible segment size i.e. 65535-40. * 3. For IPv4 it is reasonable to calculate it from maximal MTU of * attached devices, because some buggy hosts are confused by * large MSS. * 4. We do not make 3, we advertise MSS, calculated from first * hop device mtu, but allow to raise it to ip_rt_min_advmss. * This may be overridden via information stored in routing table. * 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible, * probably even Jumbo". */ static __u16 tcp_advertise_mss(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); const struct dst_entry *dst = __sk_dst_get(sk); int mss = tp->advmss; if (dst) { unsigned int metric = dst_metric_advmss(dst); if (metric < mss) { mss = metric; tp->advmss = mss; } } return (__u16)mss; } /* RFC2861. Reset CWND after idle period longer RTO to "restart window". * This is the first part of cwnd validation mechanism. */ void tcp_cwnd_restart(struct sock *sk, s32 delta) { struct tcp_sock *tp = tcp_sk(sk); u32 restart_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk)); u32 cwnd = tcp_snd_cwnd(tp); tcp_ca_event(sk, CA_EVENT_CWND_RESTART); tp->snd_ssthresh = tcp_current_ssthresh(sk); restart_cwnd = min(restart_cwnd, cwnd); while ((delta -= inet_csk(sk)->icsk_rto) > 0 && cwnd > restart_cwnd) cwnd >>= 1; tcp_snd_cwnd_set(tp, max(cwnd, restart_cwnd)); tp->snd_cwnd_stamp = tcp_jiffies32; tp->snd_cwnd_used = 0; } /* Congestion state accounting after a packet has been sent. 
*/
static void tcp_event_data_sent(struct tcp_sock *tp, struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	const u32 now = tcp_jiffies32;

	/* Nothing was in flight: notify congestion control that
	 * transmission is starting.
	 */
	if (tcp_packets_in_flight(tp) == 0)
		tcp_ca_event(sk, CA_EVENT_TX_START);

	/* Remember when we last sent. */
	tp->lsndtime = now;

	/* If it is a reply for ato after last received
	 * packet, increase pingpong count.
	 */
	if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato)
		inet_csk_inc_pingpong_cnt(sk);
}

/* Account for an ACK we sent.
 *
 * @rcv_nxt is the sequence number the ACK acknowledged; when it differs
 * from tp->rcv_nxt only the compressed-ACK bookkeeping below is done.
 */
static inline void tcp_event_ack_sent(struct sock *sk, u32 rcv_nxt)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (unlikely(tp->compressed_ack)) {
		/* Fold the pending compressed ACKs into the MIB counter. */
		NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED,
			      tp->compressed_ack);
		tp->compressed_ack = 0;
		/* NOTE(review): presumably the armed timer held a socket
		 * reference which is dropped once the cancel succeeds -
		 * confirm against the code arming compressed_ack_timer.
		 */
		if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1)
			__sock_put(sk);
	}

	if (unlikely(rcv_nxt != tp->rcv_nxt))
		return;  /* Special ACK sent by DCTCP to reflect ECN */
	tcp_dec_quickack_mode(sk);
	inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
}

/* Determine a window scaling and initial window to offer.
 * Based on the assumption that the given amount of space
 * will be offered. Store the results in the tp structure.
 * NOTE: for smooth operation initial space offering should
 * be a multiple of mss if possible. We assume here that mss >= 1.
 * This MUST be enforced by all callers.
 *
 * @__space:        space assumed to be offered; negative values are
 *                  treated as 0.
 * @mss:            segment size used to quantize the offer.
 * @rcv_wnd:        output, initial receive window.
 * @__window_clamp: in/out, window clamp; 0 on input means "no clamp".
 * @wscale_ok:      non-zero if window scaling may be used.
 * @rcv_wscale:     output, selected window scale shift (0 when
 *                  @wscale_ok is zero).
 * @init_rcv_wnd:   if non-zero, caps *@rcv_wnd at @init_rcv_wnd * @mss.
 */
void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss,
			       __u32 *rcv_wnd, __u32 *__window_clamp,
			       int wscale_ok, __u8 *rcv_wscale,
			       __u32 init_rcv_wnd)
{
	unsigned int space = (__space < 0 ? 0 : __space);
	u32 window_clamp = READ_ONCE(*__window_clamp);

	/* If no clamp set the clamp to the max possible scaled window */
	if (window_clamp == 0)
		window_clamp = (U16_MAX << TCP_MAX_WSCALE);
	space = min(window_clamp, space);

	/* Quantize space offering to a multiple of mss if possible. */
	if (space > mss)
		space = rounddown(space, mss);

	/* NOTE: offering an initial window larger than 32767
	 * will break some buggy TCP stacks. If the admin tells us
	 * it is likely we could be speaking with such a buggy stack
	 * we will truncate our initial window offering to 32K-1
	 * unless the remote has sent us a window scaling option,
	 * which we interpret as a sign the remote TCP is not
	 * misinterpreting the window field as a signed quantity.
	 */
	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows))
		(*rcv_wnd) = min(space, MAX_TCP_WINDOW);
	else
		(*rcv_wnd) = space;

	if (init_rcv_wnd)
		*rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss);

	*rcv_wscale = 0;
	if (wscale_ok) {
		/* Set window scaling on max possible window */
		space = max_t(u32, space, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]));
		space = max_t(u32, space, READ_ONCE(sysctl_rmem_max));
		space = min_t(u32, space, window_clamp);
		/* Shift needed so that the window fits the 16-bit header
		 * field, limited to the range [0, TCP_MAX_WSCALE].
		 */
		*rcv_wscale = clamp_t(int, ilog2(space) - 15,
				      0, TCP_MAX_WSCALE);
	}
	/* Set the clamp no higher than max representable value */
	WRITE_ONCE(*__window_clamp,
		   min_t(__u32, U16_MAX << (*rcv_wscale), window_clamp));
}
EXPORT_IPV6_MOD(tcp_select_initial_window);

/* Chose a new window to advertise, update state in tcp_sock for the
 * socket, and return result with RFC1323 scaling applied.  The return
 * value can be stuffed directly into th->window for an outgoing
 * frame.
 *
 * Side effects visible here: tp->rcv_wnd and tp->rcv_wup are updated,
 * tp->pred_flags is cleared when a zero window is advertised, and the
 * zero-window MIB counters are bumped on the matching transitions.
 */
static u16 tcp_select_window(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct net *net = sock_net(sk);
	u32 old_win = tp->rcv_wnd;
	u32 cur_win, new_win;

	/* Make the window 0 if we failed to queue the data because we
	 * are out of memory.
	 */
	if (unlikely(inet_csk(sk)->icsk_ack.pending & ICSK_ACK_NOMEM)) {
		tp->pred_flags = 0;
		tp->rcv_wnd = 0;
		tp->rcv_wup = tp->rcv_nxt;
		return 0;
	}

	cur_win = tcp_receive_window(tp);
	new_win = __tcp_select_window(sk);
	if (new_win < cur_win) {
		/* Danger Will Robinson!
		 * Don't update rcv_wup/rcv_wnd here or else
		 * we will not be able to advertise a zero
		 * window in time.  --DaveM
		 *
		 * Relax Will Robinson.
		 */
		if (!READ_ONCE(net->ipv4.sysctl_tcp_shrink_window) || !tp->rx_opt.rcv_wscale) {
			/* Never shrink the offered window */
			if (new_win == 0)
				NET_INC_STATS(net, LINUX_MIB_TCPWANTZEROWINDOWADV);
			/* Keep offering the current window, rounded up to a
			 * multiple of the window scaling unit.
			 */
			new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale);
		}
	}

	tp->rcv_wnd = new_win;
	tp->rcv_wup = tp->rcv_nxt;

	/* Make sure we do not exceed the maximum possible
	 * scaled window.
	 */
	if (!tp->rx_opt.rcv_wscale &&
	    READ_ONCE(net->ipv4.sysctl_tcp_workaround_signed_windows))
		new_win = min(new_win, MAX_TCP_WINDOW);
	else
		new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale));

	/* RFC1323 scaling applied */
	new_win >>= tp->rx_opt.rcv_wscale;

	/* If we advertise zero window, disable fast path. */
	if (new_win == 0) {
		tp->pred_flags = 0;
		if (old_win)
			NET_INC_STATS(net, LINUX_MIB_TCPTOZEROWINDOWADV);
	} else if (old_win == 0) {
		NET_INC_STATS(net, LINUX_MIB_TCPFROMZEROWINDOWADV);
	}

	return new_win;
}

/* Packet ECN state for a SYN-ACK */
static void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb)
{
	const struct tcp_sock *tp = tcp_sk(sk);

	/* CWR is always cleared from the SYN-ACK flags. */
	TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR;
	if (tcp_ecn_disabled(tp))
		TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE;
	else if (tcp_ca_needs_ecn(sk) ||
		 tcp_bpf_ca_needs_ecn(sk))
		INET_ECN_xmit(sk);
}

/* Packet ECN state for a SYN.
*/ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk); bool use_ecn = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn) == 1 || tcp_ca_needs_ecn(sk) || bpf_needs_ecn; if (!use_ecn) { const struct dst_entry *dst = __sk_dst_get(sk); if (dst && dst_feature(dst, RTAX_FEATURE_ECN)) use_ecn = true; } tp->ecn_flags = 0; if (use_ecn) { TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR; tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168); if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn) INET_ECN_xmit(sk); } } static void tcp_ecn_clear_syn(struct sock *sk, struct sk_buff *skb) { if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback)) /* tp->ecn_flags are cleared at a later point in time when * SYN ACK is ultimatively being received. */ TCP_SKB_CB(skb)->tcp_flags &= ~(TCPHDR_ECE | TCPHDR_CWR); } static void tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th) { if (inet_rsk(req)->ecn_ok) th->ece = 1; } /* Set up ECN state for a packet on a ESTABLISHED socket that is about to * be sent. */ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, struct tcphdr *th, int tcp_header_len) { struct tcp_sock *tp = tcp_sk(sk); if (tcp_ecn_mode_rfc3168(tp)) { /* Not-retransmitted data segment: set ECT and inject CWR. */ if (skb->len != tcp_header_len && !before(TCP_SKB_CB(skb)->seq, tp->snd_nxt)) { INET_ECN_xmit(sk); if (tp->ecn_flags & TCP_ECN_QUEUE_CWR) { tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR; th->cwr = 1; skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; } } else if (!tcp_ca_needs_ecn(sk)) { /* ACK or retransmitted segment: clear ECT|CE */ INET_ECN_dontxmit(sk); } if (tp->ecn_flags & TCP_ECN_DEMAND_CWR) th->ece = 1; } } /* Constructs common control bits of non-data skb. If SYN/FIN is present, * auto increment end seqno. 
*/ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u16 flags) { skb->ip_summed = CHECKSUM_PARTIAL; TCP_SKB_CB(skb)->tcp_flags = flags; tcp_skb_pcount_set(skb, 1); TCP_SKB_CB(skb)->seq = seq; if (flags & (TCPHDR_SYN | TCPHDR_FIN)) seq++; TCP_SKB_CB(skb)->end_seq = seq; } static inline bool tcp_urg_mode(const struct tcp_sock *tp) { return tp->snd_una != tp->snd_up; } #define OPTION_SACK_ADVERTISE BIT(0) #define OPTION_TS BIT(1) #define OPTION_MD5 BIT(2) #define OPTION_WSCALE BIT(3) #define OPTION_FAST_OPEN_COOKIE BIT(8) #define OPTION_SMC BIT(9) #define OPTION_MPTCP BIT(10) #define OPTION_AO BIT(11) static void smc_options_write(__be32 *ptr, u16 *options) { #if IS_ENABLED(CONFIG_SMC) if (static_branch_unlikely(&tcp_have_smc)) { if (unlikely(OPTION_SMC & *options)) { *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_EXP << 8) | (TCPOLEN_EXP_SMC_BASE)); *ptr++ = htonl(TCPOPT_SMC_MAGIC); } } #endif } struct tcp_out_options { u16 options; /* bit field of OPTION_* */ u16 mss; /* 0 to disable */ u8 ws; /* window scale, 0 to disable */ u8 num_sack_blocks; /* number of SACK blocks to include */ u8 hash_size; /* bytes in hash_location */ u8 bpf_opt_len; /* length of BPF hdr option */ __u8 *hash_location; /* temporary pointer, overloaded */ __u32 tsval, tsecr; /* need to include OPTION_TS */ struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */ struct mptcp_out_options mptcp; }; static void mptcp_options_write(struct tcphdr *th, __be32 *ptr, struct tcp_sock *tp, struct tcp_out_options *opts) { #if IS_ENABLED(CONFIG_MPTCP) if (unlikely(OPTION_MPTCP & opts->options)) mptcp_write_options(th, ptr, tp, &opts->mptcp); #endif } #ifdef CONFIG_CGROUP_BPF static int bpf_skops_write_hdr_opt_arg0(struct sk_buff *skb, enum tcp_synack_type synack_type) { if (unlikely(!skb)) return BPF_WRITE_HDR_TCP_CURRENT_MSS; if (unlikely(synack_type == TCP_SYNACK_COOKIE)) return BPF_WRITE_HDR_TCP_SYNACK_COOKIE; return 0; } /* req, syn_skb and synack_type are 
used when writing synack */
/* Run the sock_ops program with BPF_SOCK_OPS_HDR_OPT_LEN_CB. If the
 * program ran without error and lowered remaining_opt_len, the difference
 * is stored in opts->bpf_opt_len (rounded up to a multiple of 4) and
 * subtracted from *@remaining.
 *
 * Returns early, changing nothing, when the socket has not enabled
 * BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG or when *@remaining is 0.
 */
static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct sk_buff *syn_skb,
				  enum tcp_synack_type synack_type,
				  struct tcp_out_options *opts,
				  unsigned int *remaining)
{
	struct bpf_sock_ops_kern sock_ops;
	int err;

	if (likely(!BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk),
					   BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG)) ||
	    !*remaining)
		return;

	/* *remaining has already been aligned to 4 bytes, so *remaining >= 4 */

	/* init sock_ops */
	memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));

	sock_ops.op = BPF_SOCK_OPS_HDR_OPT_LEN_CB;

	if (req) {
		/* The listen "sk" cannot be passed here because
		 * it is not locked.  It would not make too much
		 * sense to do bpf_setsockopt(listen_sk) based
		 * on individual connection request also.
		 *
		 * Thus, "req" is passed here and the cgroup-bpf-progs
		 * of the listen "sk" will be run.
		 *
		 * "req" is also used here for fastopen even the "sk" here is
		 * a fullsock "child" sk.  It is to keep the behavior
		 * consistent between fastopen and non-fastopen on
		 * the bpf programming side.
		 */
		sock_ops.sk = (struct sock *)req;
		sock_ops.syn_skb = syn_skb;
	} else {
		sock_owned_by_me(sk);

		sock_ops.is_fullsock = 1;
		sock_ops.is_locked_tcp_sock = 1;
		sock_ops.sk = sk;
	}

	sock_ops.args[0] = bpf_skops_write_hdr_opt_arg0(skb, synack_type);
	sock_ops.remaining_opt_len = *remaining;
	/* tcp_current_mss() does not pass a skb */
	if (skb)
		bpf_skops_init_skb(&sock_ops, skb, 0);

	err = BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(&sock_ops, sk);

	/* Error, or the program reserved nothing. */
	if (err || sock_ops.remaining_opt_len == *remaining)
		return;

	opts->bpf_opt_len = *remaining - sock_ops.remaining_opt_len;
	/* round up to 4 bytes */
	opts->bpf_opt_len = (opts->bpf_opt_len + 3) & ~3;

	*remaining -= opts->bpf_opt_len;
}

/* Run the sock_ops program with BPF_SOCK_OPS_WRITE_HDR_OPT_CB so it can
 * fill the last opts->bpf_opt_len bytes of the TCP header of @skb. Any
 * bytes the program did not write (all of them on error) are filled
 * with TCPOPT_NOP. Does nothing when opts->bpf_opt_len is 0.
 */
static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb,
				    struct request_sock *req,
				    struct sk_buff *syn_skb,
				    enum tcp_synack_type synack_type,
				    struct tcp_out_options *opts)
{
	u8 first_opt_off, nr_written, max_opt_len = opts->bpf_opt_len;
	struct bpf_sock_ops_kern sock_ops;
	int err;

	if (likely(!max_opt_len))
		return;

	memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));

	sock_ops.op = BPF_SOCK_OPS_WRITE_HDR_OPT_CB;

	if (req) {
		sock_ops.sk = (struct sock *)req;
		sock_ops.syn_skb = syn_skb;
	} else {
		sock_owned_by_me(sk);

		sock_ops.is_fullsock = 1;
		sock_ops.is_locked_tcp_sock = 1;
		sock_ops.sk = sk;
	}

	sock_ops.args[0] = bpf_skops_write_hdr_opt_arg0(skb, synack_type);
	sock_ops.remaining_opt_len = max_opt_len;
	/* Offset of the BPF option area from the start of the TCP header:
	 * it occupies the final max_opt_len bytes.
	 */
	first_opt_off = tcp_hdrlen(skb) - max_opt_len;
	bpf_skops_init_skb(&sock_ops, skb, first_opt_off);

	err = BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(&sock_ops, sk);

	if (err)
		nr_written = 0;
	else
		nr_written = max_opt_len - sock_ops.remaining_opt_len;

	/* Pad whatever was reserved but not written. */
	if (nr_written < max_opt_len)
		memset(skb->data + first_opt_off + nr_written, TCPOPT_NOP,
		       max_opt_len - nr_written);
}
#else
/* Empty stand-ins used when CONFIG_CGROUP_BPF is not defined. */
static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct sk_buff *syn_skb,
				  enum tcp_synack_type synack_type,
				  struct tcp_out_options *opts,
				  unsigned int *remaining)
{
}

static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb,
				    struct request_sock *req,
				    struct sk_buff *syn_skb,
				    enum tcp_synack_type synack_type,
				    struct tcp_out_options *opts)
{
}
#endif

/* Write the leading 32-bit word of the TCP-AO option at @ptr, record the
 * location following it in opts->hash_location, and skip over the space
 * for the MAC (padding a partial trailing word with TCPOPT_NOP).
 *
 * With @tcprsk set, the key id and rnext values come from the request
 * sock; otherwise they come from @key and the socket's ao_info.
 *
 * Returns the advanced option pointer. Returns @ptr unchanged when
 * CONFIG_TCP_AO is not defined, or when no rnext_key is set (after a
 * one-time warning).
 */
static __be32 *process_tcp_ao_options(struct tcp_sock *tp,
				      const struct tcp_request_sock *tcprsk,
				      struct tcp_out_options *opts,
				      struct tcp_key *key, __be32 *ptr)
{
#ifdef CONFIG_TCP_AO
	u8 maclen = tcp_ao_maclen(key->ao_key);

	if (tcprsk) {
		u8 aolen = maclen + sizeof(struct tcp_ao_hdr);

		*ptr++ = htonl((TCPOPT_AO << 24) | (aolen << 16) |
			       (tcprsk->ao_keyid << 8) |
			       (tcprsk->ao_rcv_next));
	} else {
		struct tcp_ao_key *rnext_key;
		struct tcp_ao_info *ao_info;

		ao_info = rcu_dereference_check(tp->ao_info,
			lockdep_sock_is_held(&tp->inet_conn.icsk_inet.sk));
		rnext_key = READ_ONCE(ao_info->rnext_key);
		if (WARN_ON_ONCE(!rnext_key))
			return ptr;
		*ptr++ = htonl((TCPOPT_AO << 24) |
			       (tcp_ao_len(key->ao_key) << 16) |
			       (key->ao_key->sndid << 8) |
			       (rnext_key->rcvid));
	}
	/* The MAC itself is filled in later, at this location. */
	opts->hash_location = (__u8 *)ptr;
	ptr += maclen / sizeof(*ptr);
	/* MAC length not a multiple of 4: pre-fill the last word with NOPs. */
	if (unlikely(maclen % sizeof(*ptr))) {
		memset(ptr, TCPOPT_NOP, sizeof(*ptr));
		ptr++;
	}
#endif
	return ptr;
}

/* Write previously computed TCP options to the packet.
 *
 * Beware: Something in the Internet is very sensitive to the ordering of
 * TCP options, we learned this through the hard way, so be careful here.
 * Luckily we can at least blame others for their non-compliance but from
 * inter-operability perspective it seems that we're somewhat stuck with
 * the ordering which we have been using if we want to keep working with
 * those broken things (not that it currently hurts anybody as there isn't
 * particular reason why the ordering would need to be changed).
 *
 * At least SACK_PERM as the first option is known to lead to a disaster
 * (but it may well be that other scenarios fail similarly).
*/
static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp,
			      const struct tcp_request_sock *tcprsk,
			      struct tcp_out_options *opts,
			      struct tcp_key *key)
{
	/* Options start right after the fixed TCP header */
	__be32 *ptr = (__be32 *)(th + 1);
	u16 options = opts->options;	/* mungable copy */

	if (tcp_key_is_md5(key)) {
		*ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
			       (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG);
		/* overload cookie hash location */
		opts->hash_location = (__u8 *)ptr;
		/* Skip four 32-bit words; the digest is filled in by the caller */
		ptr += 4;
	} else if (tcp_key_is_ao(key)) {
		ptr = process_tcp_ao_options(tp, tcprsk, opts, key, ptr);
	}
	if (unlikely(opts->mss)) {
		*ptr++ = htonl((TCPOPT_MSS << 24) |
			       (TCPOLEN_MSS << 16) |
			       opts->mss);
	}

	if (likely(OPTION_TS & options)) {
		if (unlikely(OPTION_SACK_ADVERTISE & options)) {
			/* SACK_PERM takes the place of the two NOPs that
			 * would otherwise pad the timestamp option.
			 */
			*ptr++ = htonl((TCPOPT_SACK_PERM << 24) |
				       (TCPOLEN_SACK_PERM << 16) |
				       (TCPOPT_TIMESTAMP << 8) |
				       TCPOLEN_TIMESTAMP);
			/* Already written: keep the standalone case below from firing */
			options &= ~OPTION_SACK_ADVERTISE;
		} else {
			*ptr++ = htonl((TCPOPT_NOP << 24) |
				       (TCPOPT_NOP << 16) |
				       (TCPOPT_TIMESTAMP << 8) |
				       TCPOLEN_TIMESTAMP);
		}
		*ptr++ = htonl(opts->tsval);
		*ptr++ = htonl(opts->tsecr);
	}

	if (unlikely(OPTION_SACK_ADVERTISE & options)) {
		*ptr++ = htonl((TCPOPT_NOP << 24) |
			       (TCPOPT_NOP << 16) |
			       (TCPOPT_SACK_PERM << 8) |
			       TCPOLEN_SACK_PERM);
	}

	if (unlikely(OPTION_WSCALE & options)) {
		*ptr++ = htonl((TCPOPT_NOP << 24) |
			       (TCPOPT_WINDOW << 16) |
			       (TCPOLEN_WINDOW << 8) |
			       opts->ws);
	}

	if (unlikely(opts->num_sack_blocks)) {
		struct tcp_sack_block *sp = tp->rx_opt.dsack ?
			tp->duplicate_sack : tp->selective_acks;
		int this_sack;

		*ptr++ = htonl((TCPOPT_NOP << 24) |
			       (TCPOPT_NOP << 16) |
			       (TCPOPT_SACK << 8) |
			       (TCPOLEN_SACK_BASE + (opts->num_sack_blocks *
						     TCPOLEN_SACK_PERBLOCK)));

		for (this_sack = 0; this_sack < opts->num_sack_blocks;
		     ++this_sack) {
			*ptr++ = htonl(sp[this_sack].start_seq);
			*ptr++ = htonl(sp[this_sack].end_seq);
		}

		/* The D-SACK flag is cleared once the blocks are written */
		tp->rx_opt.dsack = 0;
	}

	if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) {
		struct tcp_fastopen_cookie *foc = opts->fastopen_cookie;
		u8 *p = (u8 *)ptr;
		u32 len; /* Fast Open option length */

		if (foc->exp) {
			len = TCPOLEN_EXP_FASTOPEN_BASE + foc->len;
			*ptr = htonl((TCPOPT_EXP << 24) | (len << 16) |
				     TCPOPT_FASTOPEN_MAGIC);
			p += TCPOLEN_EXP_FASTOPEN_BASE;
		} else {
			len = TCPOLEN_FASTOPEN_BASE + foc->len;
			*p++ = TCPOPT_FASTOPEN;
			*p++ = len;
		}

		memcpy(p, foc->val, foc->len);
		/* Two bytes short of a 32-bit boundary: pad with NOPs */
		if ((len & 3) == 2) {
			p[foc->len] = TCPOPT_NOP;
			p[foc->len + 1] = TCPOPT_NOP;
		}
		ptr += (len + 3) >> 2;
	}

	smc_options_write(ptr, &options);

	mptcp_options_write(th, ptr, tp, opts);
}

/* Request the SMC option if the socket asked for it (tp->syn_smc) and the
 * aligned option still fits into *remaining.  No-op without CONFIG_SMC.
 */
static void smc_set_option(const struct tcp_sock *tp,
			   struct tcp_out_options *opts,
			   unsigned int *remaining)
{
#if IS_ENABLED(CONFIG_SMC)
	if (static_branch_unlikely(&tcp_have_smc)) {
		if (tp->syn_smc) {
			if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) {
				opts->options |= OPTION_SMC;
				*remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED;
			}
		}
	}
#endif
}

/* Same as smc_set_option(), but additionally requires ireq->smc_ok. */
static void smc_set_option_cond(const struct tcp_sock *tp,
				const struct inet_request_sock *ireq,
				struct tcp_out_options *opts,
				unsigned int *remaining)
{
#if IS_ENABLED(CONFIG_SMC)
	if (static_branch_unlikely(&tcp_have_smc)) {
		if (tp->syn_smc && ireq->smc_ok) {
			if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) {
				opts->options |= OPTION_SMC;
				*remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED;
			}
		}
	}
#endif
}

/* For an MPTCP request socket, request the MPTCP SYN-ACK option when
 * mptcp_synack_options() produces one and it fits into *remaining.
 */
static void mptcp_set_option_cond(const struct request_sock *req,
				  struct tcp_out_options *opts,
				  unsigned int *remaining)
{
	if (rsk_is_mptcp(req)) {
		unsigned int size;

		if (mptcp_synack_options(req, &size, &opts->mptcp)) {
			if (*remaining >= size) {
				opts->options |= OPTION_MPTCP;
				*remaining -= size;
			}
		}
	}
}

/* Compute TCP options for SYN packets. This is not the final
 * network wire format yet.
 *
 * Fills @opts and returns the number of option bytes that will be used.
 */
static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
				struct tcp_out_options *opts,
				struct tcp_key *key)
{
	struct tcp_sock *tp = tcp_sk(sk);
	unsigned int remaining = MAX_TCP_OPTION_SPACE;
	struct tcp_fastopen_request *fastopen = tp->fastopen_req;
	bool timestamps;

	/* Better than switch (key.type) as it has static branches */
	if (tcp_key_is_md5(key)) {
		/* Timestamps are not offered together with MD5 on a SYN */
		timestamps = false;
		opts->options |= OPTION_MD5;
		remaining -= TCPOLEN_MD5SIG_ALIGNED;
	} else {
		timestamps = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_timestamps);
		if (tcp_key_is_ao(key)) {
			opts->options |= OPTION_AO;
			remaining -= tcp_ao_len_aligned(key->ao_key);
		}
	}

	/* We always get an MSS option.  The option bytes which will be seen in
	 * normal data packets should timestamps be used, must be in the MSS
	 * advertised.  But we subtract them from tp->mss_cache so that
	 * calculations in tcp_sendmsg are simpler etc.  So account for this
	 * fact here if necessary.  If we don't do this correctly, as a
	 * receiver we won't recognize data packets as being full sized when we
	 * should, and thus we won't abide by the delayed ACK rules correctly.
	 * SACKs don't matter, we never delay an ACK when we have any of those
	 * going out.
	 */
	opts->mss = tcp_advertise_mss(sk);
	remaining -= TCPOLEN_MSS_ALIGNED;

	if (likely(timestamps)) {
		opts->options |= OPTION_TS;
		opts->tsval = tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb) + tp->tsoffset;
		opts->tsecr = tp->rx_opt.ts_recent;
		remaining -= TCPOLEN_TSTAMP_ALIGNED;
	}
	if (likely(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_window_scaling))) {
		opts->ws = tp->rx_opt.rcv_wscale;
		opts->options |= OPTION_WSCALE;
		remaining -= TCPOLEN_WSCALE_ALIGNED;
	}
	if (likely(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_sack))) {
		opts->options |= OPTION_SACK_ADVERTISE;
		/* With timestamps, SACK_PERM rides in the TS padding for free */
		if (unlikely(!(OPTION_TS & opts->options)))
			remaining -= TCPOLEN_SACKPERM_ALIGNED;
	}

	if (fastopen && fastopen->cookie.len >= 0) {
		u32 need = fastopen->cookie.len;

		need += fastopen->cookie.exp ? TCPOLEN_EXP_FASTOPEN_BASE :
					       TCPOLEN_FASTOPEN_BASE;
		need = (need + 3) & ~3U;  /* Align to 32 bits */
		if (remaining >= need) {
			opts->options |= OPTION_FAST_OPEN_COOKIE;
			opts->fastopen_cookie = &fastopen->cookie;
			remaining -= need;
			tp->syn_fastopen = 1;
			tp->syn_fastopen_exp = fastopen->cookie.exp ? 1 : 0;
		}
	}

	smc_set_option(tp, opts, &remaining);

	if (sk_is_mptcp(sk)) {
		unsigned int size;

		if (mptcp_syn_options(sk, skb, &size, &opts->mptcp)) {
			if (remaining >= size) {
				opts->options |= OPTION_MPTCP;
				remaining -= size;
			}
		}
	}

	/* BPF gets whatever option space is left */
	bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining);

	return MAX_TCP_OPTION_SPACE - remaining;
}

/* Set up TCP options for SYN-ACKs.
 *
 * Fills @opts from the request socket state and returns the number of
 * option bytes that will be used.  May clear ireq->tstamp_ok (see below).
 */
static unsigned int tcp_synack_options(const struct sock *sk,
				       struct request_sock *req,
				       unsigned int mss, struct sk_buff *skb,
				       struct tcp_out_options *opts,
				       const struct tcp_key *key,
				       struct tcp_fastopen_cookie *foc,
				       enum tcp_synack_type synack_type,
				       struct sk_buff *syn_skb)
{
	struct inet_request_sock *ireq = inet_rsk(req);
	unsigned int remaining = MAX_TCP_OPTION_SPACE;

	if (tcp_key_is_md5(key)) {
		opts->options |= OPTION_MD5;
		remaining -= TCPOLEN_MD5SIG_ALIGNED;

		/* We can't fit any SACK blocks in a packet with MD5 + TS
		 * options. There was discussion about disabling SACK
		 * rather than TS in order to fit in better with old,
		 * buggy kernels, but that was deemed to be unnecessary.
		 */
		if (synack_type != TCP_SYNACK_COOKIE)
			ireq->tstamp_ok &= !ireq->sack_ok;
	} else if (tcp_key_is_ao(key)) {
		opts->options |= OPTION_AO;
		remaining -= tcp_ao_len_aligned(key->ao_key);
		/* Same TS-vs-SACK trade-off as for MD5 above */
		ireq->tstamp_ok &= !ireq->sack_ok;
	}

	/* We always send an MSS option. */
	opts->mss = mss;
	remaining -= TCPOLEN_MSS_ALIGNED;

	if (likely(ireq->wscale_ok)) {
		opts->ws = ireq->rcv_wscale;
		opts->options |= OPTION_WSCALE;
		remaining -= TCPOLEN_WSCALE_ALIGNED;
	}
	if (likely(ireq->tstamp_ok)) {
		opts->options |= OPTION_TS;
		opts->tsval = tcp_skb_timestamp_ts(tcp_rsk(req)->req_usec_ts, skb) +
			      tcp_rsk(req)->ts_off;
		if (!tcp_rsk(req)->snt_tsval_first) {
			/* 0 means "not yet sent" in snt_tsval_first: avoid it */
			if (!opts->tsval)
				opts->tsval = ~0U;
			tcp_rsk(req)->snt_tsval_first = opts->tsval;
		}
		WRITE_ONCE(tcp_rsk(req)->snt_tsval_last, opts->tsval);
		opts->tsecr = req->ts_recent;
		remaining -= TCPOLEN_TSTAMP_ALIGNED;
	}
	if (likely(ireq->sack_ok)) {
		opts->options |= OPTION_SACK_ADVERTISE;
		if (unlikely(!ireq->tstamp_ok))
			remaining -= TCPOLEN_SACKPERM_ALIGNED;
	}
	if (foc != NULL && foc->len >= 0) {
		u32 need = foc->len;

		need += foc->exp ? TCPOLEN_EXP_FASTOPEN_BASE :
				   TCPOLEN_FASTOPEN_BASE;
		need = (need + 3) & ~3U;  /* Align to 32 bits */
		if (remaining >= need) {
			opts->options |= OPTION_FAST_OPEN_COOKIE;
			opts->fastopen_cookie = foc;
			remaining -= need;
		}
	}

	mptcp_set_option_cond(req, opts, &remaining);

	smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining);

	bpf_skops_hdr_opt_len((struct sock *)sk, skb, req, syn_skb,
			      synack_type, opts, &remaining);

	return MAX_TCP_OPTION_SPACE - remaining;
}

/* Compute TCP options for ESTABLISHED sockets. This is not the
 * final wire format yet.
*/
/* Fills @opts and returns the number of option bytes the header will carry.
 * @skb may be NULL (tcp_current_mss() calls it that way); the timestamp
 * value is then left at 0.
 */
static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb,
					struct tcp_out_options *opts,
					struct tcp_key *key)
{
	struct tcp_sock *tp = tcp_sk(sk);
	unsigned int used = 0;
	unsigned int sack_cnt;
	unsigned int bpf_room;

	opts->options = 0;

	/* Static-branch helpers are cheaper than a switch on key->type */
	if (tcp_key_is_md5(key)) {
		opts->options |= OPTION_MD5;
		used += TCPOLEN_MD5SIG_ALIGNED;
	} else if (tcp_key_is_ao(key)) {
		opts->options |= OPTION_AO;
		used += tcp_ao_len_aligned(key->ao_key);
	}

	if (likely(tp->rx_opt.tstamp_ok)) {
		opts->options |= OPTION_TS;
		if (skb)
			opts->tsval = tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb) +
				      tp->tsoffset;
		else
			opts->tsval = 0;
		opts->tsecr = tp->rx_opt.ts_recent;
		used += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* MPTCP is served before SACK: a missing required multipath option
	 * would force the connection to fall back to regular TCP, whereas
	 * SACK can make do with whatever option space remains.
	 */
	if (sk_is_mptcp(sk)) {
		unsigned int mptcp_len = 0;
		unsigned int room = MAX_TCP_OPTION_SPACE - used;

		if (mptcp_established_options(sk, skb, &mptcp_len, room,
					      &opts->mptcp)) {
			opts->options |= OPTION_MPTCP;
			used += mptcp_len;
		}
	}

	sack_cnt = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
	if (unlikely(sack_cnt)) {
		const unsigned int room = MAX_TCP_OPTION_SPACE - used;

		/* Not even a single SACK block fits any more */
		if (unlikely(room < TCPOLEN_SACK_BASE_ALIGNED +
				    TCPOLEN_SACK_PERBLOCK))
			return used;

		opts->num_sack_blocks =
			min_t(unsigned int, sack_cnt,
			      (room - TCPOLEN_SACK_BASE_ALIGNED) /
			      TCPOLEN_SACK_PERBLOCK);

		used += TCPOLEN_SACK_BASE_ALIGNED +
			opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
	}

	if (likely(!BPF_SOCK_OPS_TEST_FLAG(tp,
					   BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG)))
		return used;

	/* BPF header options get the leftover space */
	bpf_room = MAX_TCP_OPTION_SPACE - used;
	bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &bpf_room);

	return MAX_TCP_OPTION_SPACE - bpf_room;
}

/* TCP SMALL QUEUES (TSQ)
 *
 * TSQ goal is to keep small amount of skbs per tcp flow in tx queues
(qdisc+dev) * to reduce RTT and bufferbloat. * We do this using a special skb destructor (tcp_wfree). * * Its important tcp_wfree() can be replaced by sock_wfree() in the event skb * needs to be reallocated in a driver. * The invariant being skb->truesize subtracted from sk->sk_wmem_alloc * * Since transmit from skb destructor is forbidden, we use a tasklet * to process all sockets that eventually need to send more skbs. * We use one tasklet per cpu, with its own queue of sockets. */ struct tsq_tasklet { struct tasklet_struct tasklet; struct list_head head; /* queue of tcp sockets */ }; static DEFINE_PER_CPU(struct tsq_tasklet, tsq_tasklet); static void tcp_tsq_write(struct sock *sk) { if ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_CLOSE_WAIT | TCPF_LAST_ACK)) { struct tcp_sock *tp = tcp_sk(sk); if (tp->lost_out > tp->retrans_out && tcp_snd_cwnd(tp) > tcp_packets_in_flight(tp)) { tcp_mstamp_refresh(tp); tcp_xmit_retransmit_queue(sk); } tcp_write_xmit(sk, tcp_current_mss(sk), tp->nonagle, 0, GFP_ATOMIC); } } static void tcp_tsq_handler(struct sock *sk) { bh_lock_sock(sk); if (!sock_owned_by_user(sk)) tcp_tsq_write(sk); else if (!test_and_set_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags)) sock_hold(sk); bh_unlock_sock(sk); } /* * One tasklet per cpu tries to send more skbs. 
* We run in tasklet context but need to disable irqs when
 * transferring tsq->head because tcp_wfree() might
 * interrupt us (non NAPI drivers)
 */
static void tcp_tasklet_func(struct tasklet_struct *t)
{
	struct tsq_tasklet *tsq = from_tasklet(tsq, t, tasklet);
	LIST_HEAD(list);
	unsigned long flags;
	struct list_head *q, *n;
	struct tcp_sock *tp;
	struct sock *sk;

	/* Grab the whole pending queue in one go, then work on a private list */
	local_irq_save(flags);
	list_splice_init(&tsq->head, &list);
	local_irq_restore(flags);

	list_for_each_safe(q, n, &list) {
		tp = list_entry(q, struct tcp_sock, tsq_node);
		list_del(&tp->tsq_node);

		sk = (struct sock *)tp;
		/* Order the list removal before clearing TSQ_QUEUED */
		smp_mb__before_atomic();
		clear_bit(TSQ_QUEUED, &sk->sk_tsq_flags);

		tcp_tsq_handler(sk);
		/* Drop the sk_wmem_alloc reference kept by tcp_wfree() */
		sk_free(sk);
	}
}

/* All deferred-work bits that tcp_release_cb() consumes */
#define TCP_DEFERRED_ALL (TCPF_TSQ_DEFERRED |		\
			  TCPF_WRITE_TIMER_DEFERRED |	\
			  TCPF_DELACK_TIMER_DEFERRED |	\
			  TCPF_MTU_REDUCED_DEFERRED |	\
			  TCPF_ACK_DEFERRED)
/**
 * tcp_release_cb - tcp release_sock() callback
 * @sk: socket
 *
 * called from release_sock() to perform protocol dependent
 * actions before socket release.
 */
void tcp_release_cb(struct sock *sk)
{
	unsigned long flags = smp_load_acquire(&sk->sk_tsq_flags);
	unsigned long nflags;

	/* perform an atomic operation only if at least one flag is set */
	do {
		if (!(flags & TCP_DEFERRED_ALL))
			return;
		nflags = flags & ~TCP_DEFERRED_ALL;
	} while (!try_cmpxchg(&sk->sk_tsq_flags, &flags, nflags));

	/* "flags" now holds the bits we atomically cleared; run each piece of
	 * deferred work and drop the reference that came with its flag.
	 */
	if (flags & TCPF_TSQ_DEFERRED) {
		tcp_tsq_write(sk);
		__sock_put(sk);
	}

	if (flags & TCPF_WRITE_TIMER_DEFERRED) {
		tcp_write_timer_handler(sk);
		__sock_put(sk);
	}
	if (flags & TCPF_DELACK_TIMER_DEFERRED) {
		tcp_delack_timer_handler(sk);
		__sock_put(sk);
	}
	if (flags & TCPF_MTU_REDUCED_DEFERRED) {
		inet_csk(sk)->icsk_af_ops->mtu_reduced(sk);
		__sock_put(sk);
	}
	/* No __sock_put() here, unlike the cases above */
	if ((flags & TCPF_ACK_DEFERRED) && inet_csk_ack_scheduled(sk))
		tcp_send_ack(sk);
}
EXPORT_IPV6_MOD(tcp_release_cb);

/* Boot-time setup of the per-cpu TSQ tasklets and their socket queues */
void __init tcp_tasklet_init(void)
{
	int i;

	for_each_possible_cpu(i) {
		struct tsq_tasklet *tsq = &per_cpu(tsq_tasklet, i);

		INIT_LIST_HEAD(&tsq->head);
		tasklet_setup(&tsq->tasklet, tcp_tasklet_func);
	}
}
/*
 * Write buffer destructor automatically called from kfree_skb.
 * We can't xmit new skbs from this context, as we might already
 * hold qdisc lock.
 *
 * If the socket is throttled (TSQF_THROTTLED) and not already queued, it is
 * put on this cpu's TSQ tasklet list so tcp_tasklet_func() can resume
 * transmission; otherwise the last sk_wmem_alloc reference is dropped here.
 */
void tcp_wfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	struct tcp_sock *tp = tcp_sk(sk);
	unsigned long flags, nval, oval;
	struct tsq_tasklet *tsq;
	bool empty;

	/* Keep one reference on sk_wmem_alloc.
	 * Will be released by sk_free() from here or tcp_tasklet_func()
	 */
	WARN_ON(refcount_sub_and_test(skb->truesize - 1, &sk->sk_wmem_alloc));

	/* If this softirq is serviced by ksoftirqd, we are likely under stress.
	 * Wait until our queues (qdisc + devices) are drained.
	 * This gives :
	 * - less callbacks to tcp_write_xmit(), reducing stress (batches)
	 * - chance for incoming ACK (processed by another cpu maybe)
	 *   to migrate this flow (skb->ooo_okay will be eventually set)
	 */
	if (refcount_read(&sk->sk_wmem_alloc) >= SKB_TRUESIZE(1) && this_cpu_ksoftirqd() == current)
		goto out;

	/* Atomically turn THROTTLED into QUEUED; bail out if the socket is
	 * not throttled or somebody else queued it already.
	 */
	oval = smp_load_acquire(&sk->sk_tsq_flags);
	do {
		if (!(oval & TSQF_THROTTLED) || (oval & TSQF_QUEUED))
			goto out;

		nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED;
	} while (!try_cmpxchg(&sk->sk_tsq_flags, &oval, nval));

	/* queue this socket to tasklet queue */
	local_irq_save(flags);
	tsq = this_cpu_ptr(&tsq_tasklet);
	empty = list_empty(&tsq->head);
	list_add(&tp->tsq_node, &tsq->head);
	/* Only the transition from empty needs to kick the tasklet */
	if (empty)
		tasklet_schedule(&tsq->tasklet);
	local_irq_restore(flags);
	return;
out:
	sk_free(sk);
}

/* Note: Called under soft irq.
 * We can call TCP stack right away, unless socket is owned by user.
*/ enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer) { struct tcp_sock *tp = container_of(timer, struct tcp_sock, pacing_timer); struct sock *sk = (struct sock *)tp; tcp_tsq_handler(sk); sock_put(sk); return HRTIMER_NORESTART; } static void tcp_update_skb_after_send(struct sock *sk, struct sk_buff *skb, u64 prior_wstamp) { struct tcp_sock *tp = tcp_sk(sk); if (sk->sk_pacing_status != SK_PACING_NONE) { unsigned long rate = READ_ONCE(sk->sk_pacing_rate); /* Original sch_fq does not pace first 10 MSS * Note that tp->data_segs_out overflows after 2^32 packets, * this is a minor annoyance. */ if (rate != ~0UL && rate && tp->data_segs_out >= 10) { u64 len_ns = div64_ul((u64)skb->len * NSEC_PER_SEC, rate); u64 credit = tp->tcp_wstamp_ns - prior_wstamp; /* take into account OS jitter */ len_ns -= min_t(u64, len_ns / 2, credit); tp->tcp_wstamp_ns += len_ns; } } list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); } INDIRECT_CALLABLE_DECLARE(int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)); INDIRECT_CALLABLE_DECLARE(int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)); INDIRECT_CALLABLE_DECLARE(void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)); /* This routine actually transmits TCP packets queued in by * tcp_do_sendmsg(). This is used by both the initial * transmission and possible later retransmissions. * All SKB's seen here are completely headerless. It is our * job to build the TCP header, and pass the packet down to * IP so it can do the same plus pass the packet off to the * device. * * We are working here with either a clone of the original * SKB, or a fresh unique copy made by the retransmit engine. 
*/ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, gfp_t gfp_mask, u32 rcv_nxt) { const struct inet_connection_sock *icsk = inet_csk(sk); struct inet_sock *inet; struct tcp_sock *tp; struct tcp_skb_cb *tcb; struct tcp_out_options opts; unsigned int tcp_options_size, tcp_header_size; struct sk_buff *oskb = NULL; struct tcp_key key; struct tcphdr *th; u64 prior_wstamp; int err; BUG_ON(!skb || !tcp_skb_pcount(skb)); tp = tcp_sk(sk); prior_wstamp = tp->tcp_wstamp_ns; tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache); skb_set_delivery_time(skb, tp->tcp_wstamp_ns, SKB_CLOCK_MONOTONIC); if (clone_it) { oskb = skb; tcp_skb_tsorted_save(oskb) { if (unlikely(skb_cloned(oskb))) skb = pskb_copy(oskb, gfp_mask); else skb = skb_clone(oskb, gfp_mask); } tcp_skb_tsorted_restore(oskb); if (unlikely(!skb)) return -ENOBUFS; /* retransmit skbs might have a non zero value in skb->dev * because skb->dev is aliased with skb->rbnode.rb_left */ skb->dev = NULL; } inet = inet_sk(sk); tcb = TCP_SKB_CB(skb); memset(&opts, 0, sizeof(opts)); tcp_get_current_key(sk, &key); if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) { tcp_options_size = tcp_syn_options(sk, skb, &opts, &key); } else { tcp_options_size = tcp_established_options(sk, skb, &opts, &key); /* Force a PSH flag on all (GSO) packets to expedite GRO flush * at receiver : This slightly improve GRO performance. * Note that we do not force the PSH flag for non GSO packets, * because they might be sent under high congestion events, * and in this case it is better to delay the delivery of 1-MSS * packets and thus the corresponding ACK packet that would * release the following packet. */ if (tcp_skb_pcount(skb) > 1) tcb->tcp_flags |= TCPHDR_PSH; } tcp_header_size = tcp_options_size + sizeof(struct tcphdr); /* We set skb->ooo_okay to one if this packet can select * a different TX queue than prior packets of this flow, * to avoid self inflicted reorders. 
* The 'other' queue decision is based on current cpu number * if XPS is enabled, or sk->sk_txhash otherwise. * We can switch to another (and better) queue if: * 1) No packet with payload is in qdisc/device queues. * Delays in TX completion can defeat the test * even if packets were already sent. * 2) Or rtx queue is empty. * This mitigates above case if ACK packets for * all prior packets were already processed. */ skb->ooo_okay = sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1) || tcp_rtx_queue_empty(sk); /* If we had to use memory reserve to allocate this skb, * this might cause drops if packet is looped back : * Other socket might not have SOCK_MEMALLOC. * Packets not looped back do not care about pfmemalloc. */ skb->pfmemalloc = 0; skb_push(skb, tcp_header_size); skb_reset_transport_header(skb); skb_orphan(skb); skb->sk = sk; skb->destructor = skb_is_tcp_pure_ack(skb) ? __sock_wfree : tcp_wfree; refcount_add(skb->truesize, &sk->sk_wmem_alloc); skb_set_dst_pending_confirm(skb, READ_ONCE(sk->sk_dst_pending_confirm)); /* Build TCP header and checksum it. */ th = (struct tcphdr *)skb->data; th->source = inet->inet_sport; th->dest = inet->inet_dport; th->seq = htonl(tcb->seq); th->ack_seq = htonl(rcv_nxt); *(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) | (tcb->tcp_flags & TCPHDR_FLAGS_MASK)); th->check = 0; th->urg_ptr = 0; /* The urg_mode check is necessary during a below snd_una win probe */ if (unlikely(tcp_urg_mode(tp) && before(tcb->seq, tp->snd_up))) { if (before(tp->snd_up, tcb->seq + 0x10000)) { th->urg_ptr = htons(tp->snd_up - tcb->seq); th->urg = 1; } else if (after(tcb->seq + 0xFFFF, tp->snd_nxt)) { th->urg_ptr = htons(0xFFFF); th->urg = 1; } } skb_shinfo(skb)->gso_type = sk->sk_gso_type; if (likely(!(tcb->tcp_flags & TCPHDR_SYN))) { th->window = htons(tcp_select_window(sk)); tcp_ecn_send(sk, skb, th, tcp_header_size); } else { /* RFC1323: The window in SYN & SYN/ACK segments * is never scaled. 
*/ th->window = htons(min(tp->rcv_wnd, 65535U)); } tcp_options_write(th, tp, NULL, &opts, &key); if (tcp_key_is_md5(&key)) { #ifdef CONFIG_TCP_MD5SIG /* Calculate the MD5 hash, as we have all we need now */ sk_gso_disable(sk); tp->af_specific->calc_md5_hash(opts.hash_location, key.md5_key, sk, skb); #endif } else if (tcp_key_is_ao(&key)) { int err; err = tcp_ao_transmit_skb(sk, skb, key.ao_key, th, opts.hash_location); if (err) { kfree_skb_reason(skb, SKB_DROP_REASON_NOT_SPECIFIED); return -ENOMEM; } } /* BPF prog is the last one writing header option */ bpf_skops_write_hdr_opt(sk, skb, NULL, NULL, 0, &opts); INDIRECT_CALL_INET(icsk->icsk_af_ops->send_check, tcp_v6_send_check, tcp_v4_send_check, sk, skb); if (likely(tcb->tcp_flags & TCPHDR_ACK)) tcp_event_ack_sent(sk, rcv_nxt); if (skb->len != tcp_header_size) { tcp_event_data_sent(tp, sk); tp->data_segs_out += tcp_skb_pcount(skb); tp->bytes_sent += skb->len - tcp_header_size; } if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq) TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, tcp_skb_pcount(skb)); tp->segs_out += tcp_skb_pcount(skb); skb_set_hash_from_sk(skb, sk); /* OK, its time to fill skb_shinfo(skb)->gso_{segs|size} */ skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb); skb_shinfo(skb)->gso_size = tcp_skb_mss(skb); /* Leave earliest departure time in skb->tstamp (skb->skb_mstamp_ns) */ /* Cleanup our debris for IP stacks */ memset(skb->cb, 0, max(sizeof(struct inet_skb_parm), sizeof(struct inet6_skb_parm))); tcp_add_tx_delay(skb, tp); err = INDIRECT_CALL_INET(icsk->icsk_af_ops->queue_xmit, inet6_csk_xmit, ip_queue_xmit, sk, skb, &inet->cork.fl); if (unlikely(err > 0)) { tcp_enter_cwr(sk); err = net_xmit_eval(err); } if (!err && oskb) { tcp_update_skb_after_send(sk, oskb, prior_wstamp); tcp_rate_skb_sent(sk, oskb); } return err; } static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, gfp_t gfp_mask) { return __tcp_transmit_skb(sk, skb, clone_it, gfp_mask, tcp_sk(sk)->rcv_nxt); 
} /* This routine just queues the buffer for sending. * * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames, * otherwise socket can stall. */ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); /* Advance write_seq and place onto the write_queue. */ WRITE_ONCE(tp->write_seq, TCP_SKB_CB(skb)->end_seq); __skb_header_release(skb); tcp_add_write_queue_tail(sk, skb); sk_wmem_queued_add(sk, skb->truesize); sk_mem_charge(sk, skb->truesize); } /* Initialize TSO segments for a packet. */ static int tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now) { int tso_segs; if (skb->len <= mss_now) { /* Avoid the costly divide in the normal * non-TSO case. */ TCP_SKB_CB(skb)->tcp_gso_size = 0; tcp_skb_pcount_set(skb, 1); return 1; } TCP_SKB_CB(skb)->tcp_gso_size = mss_now; tso_segs = DIV_ROUND_UP(skb->len, mss_now); tcp_skb_pcount_set(skb, tso_segs); return tso_segs; } /* Pcount in the middle of the write queue got changed, we need to do various * tweaks to fix counters */ static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr) { struct tcp_sock *tp = tcp_sk(sk); tp->packets_out -= decr; if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) tp->sacked_out -= decr; if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) tp->retrans_out -= decr; if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) tp->lost_out -= decr; /* Reno case is special. Sigh... 
*/ if (tcp_is_reno(tp) && decr > 0) tp->sacked_out -= min_t(u32, tp->sacked_out, decr); if (tp->lost_skb_hint && before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->lost_skb_hint)->seq) && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) tp->lost_cnt_hint -= decr; tcp_verify_left_out(tp); } static bool tcp_has_tx_tstamp(const struct sk_buff *skb) { return TCP_SKB_CB(skb)->txstamp_ack || (skb_shinfo(skb)->tx_flags & SKBTX_ANY_TSTAMP); } static void tcp_fragment_tstamp(struct sk_buff *skb, struct sk_buff *skb2) { struct skb_shared_info *shinfo = skb_shinfo(skb); if (unlikely(tcp_has_tx_tstamp(skb)) && !before(shinfo->tskey, TCP_SKB_CB(skb2)->seq)) { struct skb_shared_info *shinfo2 = skb_shinfo(skb2); u8 tsflags = shinfo->tx_flags & SKBTX_ANY_TSTAMP; shinfo->tx_flags &= ~tsflags; shinfo2->tx_flags |= tsflags; swap(shinfo->tskey, shinfo2->tskey); TCP_SKB_CB(skb2)->txstamp_ack = TCP_SKB_CB(skb)->txstamp_ack; TCP_SKB_CB(skb)->txstamp_ack = 0; } } static void tcp_skb_fragment_eor(struct sk_buff *skb, struct sk_buff *skb2) { TCP_SKB_CB(skb2)->eor = TCP_SKB_CB(skb)->eor; TCP_SKB_CB(skb)->eor = 0; } /* Insert buff after skb on the write or rtx queue of sk. */ static void tcp_insert_write_queue_after(struct sk_buff *skb, struct sk_buff *buff, struct sock *sk, enum tcp_queue tcp_queue) { if (tcp_queue == TCP_FRAG_IN_WRITE_QUEUE) __skb_queue_after(&sk->sk_write_queue, skb, buff); else tcp_rbtree_insert(&sk->tcp_rtx_queue, buff); } /* Function to create two new TCP segments. Shrinks the given segment * to the specified size and appends a new segment with the rest of the * packet to the list. This won't be called frequently, I hope. * Remember, these are still headerless SKBs at this point. 
*/
/* @len is the number of bytes that stay in @skb; the remainder moves to a
 * newly allocated skb linked right after it on @tcp_queue.
 * Returns 0 on success, -EINVAL if @len exceeds skb->len, -ENOMEM on
 * allocation failure or when the queued-memory limit check below trips.
 */
int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
		 struct sk_buff *skb, u32 len,
		 unsigned int mss_now, gfp_t gfp)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *buff;
	int old_factor;
	long limit;
	u16 flags;
	int nlen;

	if (WARN_ON(len > skb->len))
		return -EINVAL;

	DEBUG_NET_WARN_ON_ONCE(skb_headlen(skb));

	/* tcp_sendmsg() can overshoot sk_wmem_queued by one full size skb.
	 * We need some allowance to not penalize applications setting small
	 * SO_SNDBUF values.
	 * Also allow first and last skb in retransmit queue to be split.
	 */
	limit = sk->sk_sndbuf + 2 * SKB_TRUESIZE(GSO_LEGACY_MAX_SIZE);
	if (unlikely((sk->sk_wmem_queued >> 1) > limit &&
		     tcp_queue != TCP_FRAG_IN_WRITE_QUEUE &&
		     skb != tcp_rtx_queue_head(sk) &&
		     skb != tcp_rtx_queue_tail(sk))) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPWQUEUETOOBIG);
		return -ENOMEM;
	}

	if (skb_unclone_keeptruesize(skb, gfp))
		return -ENOMEM;

	/* Get a new skb... force flag on. */
	buff = tcp_stream_alloc_skb(sk, gfp, true);
	if (!buff)
		return -ENOMEM; /* We'll just try again later. */
	skb_copy_decrypted(buff, skb);
	mptcp_skb_ext_copy(buff, skb);

	sk_wmem_queued_add(sk, buff->truesize);
	sk_mem_charge(sk, buff->truesize);
	/* Shift the truesize of the bytes being moved from skb to buff */
	nlen = skb->len - len;
	buff->truesize += nlen;
	skb->truesize -= nlen;

	/* Correct the sequence numbers. */
	TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
	TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;

	/* PSH and FIN should only be set in the second packet. */
	flags = TCP_SKB_CB(skb)->tcp_flags;
	TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
	TCP_SKB_CB(buff)->tcp_flags = flags;
	TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked;
	tcp_skb_fragment_eor(skb, buff);

	skb_split(skb, buff, len);

	skb_set_delivery_time(buff, skb->tstamp, SKB_CLOCK_MONOTONIC);
	tcp_fragment_tstamp(skb, buff);

	/* Remember the pre-split segment count for the counter fix-up below */
	old_factor = tcp_skb_pcount(skb);

	/* Fix up tso_factor for both original and new SKB.  */
	tcp_set_skb_tso_segs(skb, mss_now);
	tcp_set_skb_tso_segs(buff, mss_now);

	/* Update delivered info for the new segment */
	TCP_SKB_CB(buff)->tx = TCP_SKB_CB(skb)->tx;

	/* If this packet has been sent out already, we must
	 * adjust the various packet counters.
	 */
	if (!before(tp->snd_nxt, TCP_SKB_CB(buff)->end_seq)) {
		int diff = old_factor - tcp_skb_pcount(skb) -
			tcp_skb_pcount(buff);

		if (diff)
			tcp_adjust_pcount(sk, skb, diff);
	}

	/* Link BUFF into the send queue. */
	__skb_header_release(buff);
	tcp_insert_write_queue_after(skb, buff, sk, tcp_queue);
	if (tcp_queue == TCP_FRAG_IN_RTX_QUEUE)
		list_add(&buff->tcp_tsorted_anchor, &skb->tcp_tsorted_anchor);

	return 0;
}

/* This is similar to __pskb_pull_tail(). The difference is that pulled
 * data is not copied, but immediately discarded.
 *
 * Drops the first @len bytes from the page frags of @skb (the skb is
 * expected to have no linear data) and returns @len.
 */
static int __pskb_trim_head(struct sk_buff *skb, int len)
{
	struct skb_shared_info *shinfo;
	int i, k, eat;

	DEBUG_NET_WARN_ON_ONCE(skb_headlen(skb));
	eat = len;
	k = 0;
	shinfo = skb_shinfo(skb);
	for (i = 0; i < shinfo->nr_frags; i++) {
		int size = skb_frag_size(&shinfo->frags[i]);

		if (size <= eat) {
			/* Frag fully consumed: release it */
			skb_frag_unref(skb, i);
			eat -= size;
		} else {
			/* Keep this frag, compacting the array; trim its
			 * front if part of it is still to be eaten.
			 */
			shinfo->frags[k] = shinfo->frags[i];
			if (eat) {
				skb_frag_off_add(&shinfo->frags[k], eat);
				skb_frag_size_sub(&shinfo->frags[k], eat);
				eat = 0;
			}
			k++;
		}
	}
	shinfo->nr_frags = k;

	skb->data_len -= len;
	skb->len = skb->data_len;
	return len;
}

/* Remove acked data from a packet in the transmit queue.
 * Returns 0, or -ENOMEM if the skb could not be uncloned.
 */
int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
{
	u32 delta_truesize;

	if (skb_unclone_keeptruesize(skb, GFP_ATOMIC))
		return -ENOMEM;

	delta_truesize = __pskb_trim_head(skb, len);

	TCP_SKB_CB(skb)->seq += len;

	skb->truesize	   -= delta_truesize;
	sk_wmem_queued_add(sk, -delta_truesize);
	if (!skb_zcopy_pure(skb))
		sk_mem_uncharge(sk, delta_truesize);

	/* Any change of skb->len requires recalculation of tso factor. */
	if (tcp_skb_pcount(skb) > 1)
		tcp_set_skb_tso_segs(skb, tcp_skb_mss(skb));

	return 0;
}

/* Calculate MSS not accounting any TCP options.
*/ static inline int __tcp_mtu_to_mss(struct sock *sk, int pmtu) { const struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); int mss_now; /* Calculate base mss without TCP options: It is MMS_S - sizeof(tcphdr) of rfc1122 */ mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr); /* Clamp it (mss_clamp does not include tcp options) */ if (mss_now > tp->rx_opt.mss_clamp) mss_now = tp->rx_opt.mss_clamp; /* Now subtract optional transport overhead */ mss_now -= icsk->icsk_ext_hdr_len; /* Then reserve room for full set of TCP options and 8 bytes of data */ mss_now = max(mss_now, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_snd_mss)); return mss_now; } /* Calculate MSS. Not accounting for SACKs here. */ int tcp_mtu_to_mss(struct sock *sk, int pmtu) { /* Subtract TCP options size, not including SACKs */ return __tcp_mtu_to_mss(sk, pmtu) - (tcp_sk(sk)->tcp_header_len - sizeof(struct tcphdr)); } EXPORT_IPV6_MOD(tcp_mtu_to_mss); /* Inverse of above */ int tcp_mss_to_mtu(struct sock *sk, int mss) { const struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); return mss + tp->tcp_header_len + icsk->icsk_ext_hdr_len + icsk->icsk_af_ops->net_header_len; } EXPORT_SYMBOL(tcp_mss_to_mtu); /* MTU probing init per socket */ void tcp_mtup_init(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); struct net *net = sock_net(sk); icsk->icsk_mtup.enabled = READ_ONCE(net->ipv4.sysctl_tcp_mtu_probing) > 1; icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) + icsk->icsk_af_ops->net_header_len; icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, READ_ONCE(net->ipv4.sysctl_tcp_base_mss)); icsk->icsk_mtup.probe_size = 0; if (icsk->icsk_mtup.enabled) icsk->icsk_mtup.probe_timestamp = tcp_jiffies32; } /* This function synchronize snd mss to current pmtu/exthdr set. tp->rx_opt.user_mss is mss set by user by TCP_MAXSEG. 
It does NOT counts
   for TCP options, but includes only bare TCP header.

   tp->rx_opt.mss_clamp is mss negotiated at connection setup.
   It is minimum of user_mss and mss received with SYN.
   It also does not include TCP options.

   inet_csk(sk)->icsk_pmtu_cookie is last pmtu, seen by this function.

   tp->mss_cache is current effective sending mss, including
   all tcp options except for SACKs. It is evaluated,
   taking into account current pmtu, but never exceeds
   tp->rx_opt.mss_clamp.

   NOTE1. rfc1122 clearly states that advertised MSS
   DOES NOT include either tcp or ip options.

   NOTE2. inet_csk(sk)->icsk_pmtu_cookie and tp->mss_cache
   are READ ONLY outside this function.		--ANK (980731)
 */
/* Side effects: may lower icsk_mtup.search_high to @pmtu, records @pmtu in
 * icsk_pmtu_cookie and stores the result in tp->mss_cache.
 * Returns the new mss_cache value.
 */
unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	int mss_now;

	/* The upper bound of the MTU probing search never exceeds the pmtu. */
	if (icsk->icsk_mtup.search_high > pmtu)
		icsk->icsk_mtup.search_high = pmtu;

	mss_now = tcp_mtu_to_mss(sk, pmtu);
	mss_now = tcp_bound_to_half_wnd(tp, mss_now);

	/* And store cached results */
	icsk->icsk_pmtu_cookie = pmtu;
	/* While MTU probing is enabled, also cap the MSS at the value that
	 * corresponds to the lower search bound.
	 */
	if (icsk->icsk_mtup.enabled)
		mss_now = min(mss_now, tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low));
	tp->mss_cache = mss_now;

	return mss_now;
}
EXPORT_IPV6_MOD(tcp_sync_mss);

/* Compute the current effective MSS, taking SACKs and IP options,
 * and even PMTU discovery events into account.
 *
 * Starts from tp->mss_cache, resynchronizes it through tcp_sync_mss() when
 * the route's MTU differs from icsk_pmtu_cookie, and then corrects for the
 * difference between the header length needed right now and
 * tp->tcp_header_len.
 */
unsigned int tcp_current_mss(struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct dst_entry *dst = __sk_dst_get(sk);
	u32 mss_now;
	unsigned int header_len;
	struct tcp_out_options opts;
	struct tcp_key key;

	mss_now = tp->mss_cache;

	if (dst) {
		u32 mtu = dst_mtu(dst);
		/* Path MTU changed since the last tcp_sync_mss() call. */
		if (mtu != inet_csk(sk)->icsk_pmtu_cookie)
			mss_now = tcp_sync_mss(sk, mtu);
	}
	tcp_get_current_key(sk, &key);
	/* Header length for a segment sent right now: options plus bare header. */
	header_len = tcp_established_options(sk, NULL, &opts, &key) +
		     sizeof(struct tcphdr);
	/* The mss_cache is sized based on tp->tcp_header_len, which assumes
	 * some common options. If this is an odd packet (because we have SACK
	 * blocks etc) then our calculated header_len will be different, and
	 * we have to adjust mss_now correspondingly
	 */
	if (header_len != tp->tcp_header_len) {
		int delta = (int) header_len - tp->tcp_header_len;
		mss_now -= delta;
	}

	return mss_now;
}

/* RFC2861, slow part. Adjust cwnd, after it was not full during one rto.
 * As additional protections, we do not touch cwnd in retransmission phases,
 * and if application hit its sndbuf limit recently.
 *
 * When the adjustment applies and the window actually used (at least the
 * initial cwnd) is below cwnd, ssthresh is refreshed and cwnd is moved to
 * the midpoint between its current value and the used window.
 * snd_cwnd_stamp is always refreshed.
 */
static void tcp_cwnd_application_limited(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open &&
	    sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
		/* Limited by application or receiver window. */
		u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk));
		u32 win_used = max(tp->snd_cwnd_used, init_win);
		if (win_used < tcp_snd_cwnd(tp)) {
			tp->snd_ssthresh = tcp_current_ssthresh(sk);
			tcp_snd_cwnd_set(tp, (tcp_snd_cwnd(tp) + win_used) >> 1);
		}
		tp->snd_cwnd_used = 0;
	}
	tp->snd_cwnd_stamp = tcp_jiffies32;
}

/* Update the cwnd usage tracking after a transmit attempt.
 *
 * @is_cwnd_limited: the caller's verdict on whether sending was limited by
 *                   the congestion window.
 *
 * Besides the bookkeeping, this may shrink cwnd through
 * tcp_cwnd_application_limited() after an idle period and may start the
 * TCP_CHRONO_SNDBUF_LIMITED chronograph.
 */
static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited)
{
	const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
	struct tcp_sock *tp = tcp_sk(sk);

	/* Track the strongest available signal of the degree to which the cwnd
	 * is fully utilized. If cwnd-limited then remember that fact for the
	 * current window. If not cwnd-limited then track the maximum number of
	 * outstanding packets in the current window. (If cwnd-limited then we
	 * chose to not update tp->max_packets_out to avoid an extra else
	 * clause with no functional impact.)
	 */
	if (!before(tp->snd_una, tp->cwnd_usage_seq) ||
	    is_cwnd_limited ||
	    (!tp->is_cwnd_limited &&
	     tp->packets_out > tp->max_packets_out)) {
		tp->is_cwnd_limited = is_cwnd_limited;
		tp->max_packets_out = tp->packets_out;
		tp->cwnd_usage_seq = tp->snd_nxt;
	}

	if (tcp_is_cwnd_limited(sk)) {
		/* Network is fed fully. */
		tp->snd_cwnd_used = 0;
		tp->snd_cwnd_stamp = tcp_jiffies32;
	} else {
		/* Network starves. */
		if (tp->packets_out > tp->snd_cwnd_used)
			tp->snd_cwnd_used = tp->packets_out;

		/* Idle for at least one RTO, slow start after idle is enabled and
		 * the congestion control module has no cong_control hook.
		 */
		if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle) &&
		    (s32)(tcp_jiffies32 - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto &&
		    !ca_ops->cong_control)
			tcp_cwnd_application_limited(sk);

		/* The following conditions together indicate the starvation
		 * is caused by insufficient sender buffer:
		 * 1) just sent some data (see tcp_write_xmit)
		 * 2) not cwnd limited (this else condition)
		 * 3) no more data to send (tcp_write_queue_empty())
		 * 4) application is hitting buffer limit (SOCK_NOSPACE)
		 */
		if (tcp_write_queue_empty(sk) && sk->sk_socket &&
		    test_bit(SOCK_NOSPACE, &sk->sk_socket->flags) &&
		    (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
			tcp_chrono_start(sk, TCP_CHRONO_SNDBUF_LIMITED);
	}
}

/* Minshall's variant of the Nagle send check.
 *
 * True when snd_sml lies after snd_una and not after snd_nxt.
 */
static bool tcp_minshall_check(const struct tcp_sock *tp)
{
	return after(tp->snd_sml, tp->snd_una) &&
		!after(tp->snd_sml, tp->snd_nxt);
}

/* Update snd_sml if this skb is under mss
 * Note that a TSO packet might end with a sub-mss segment
 * The test is really :
 * if ((skb->len % mss) != 0)
 *        tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
 * But we can avoid doing the divide again given we already have
 *  skb_pcount = skb->len / mss_now
 */
static void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now,
				const struct sk_buff *skb)
{
	if (skb->len < tcp_skb_pcount(skb) * mss_now)
		tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
}

/* Return false, if packet can be sent now without violation Nagle's rules:
 * 1. It is full sized. (provided by caller in %partial bool)
 * 2. Or it contains FIN. (already checked by caller)
 * 3. Or TCP_CORK is not set, and TCP_NODELAY is set.
 * 4. Or TCP_CORK is not set, and all sent packets are ACKed.
 *    With Minshall's modification: all sent small packets are ACKed.
*/ static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp, int nonagle) { return partial && ((nonagle & TCP_NAGLE_CORK) || (!nonagle && tp->packets_out && tcp_minshall_check(tp))); } /* Return how many segs we'd like on a TSO packet, * depending on current pacing rate, and how close the peer is. * * Rationale is: * - For close peers, we rather send bigger packets to reduce * cpu costs, because occasional losses will be repaired fast. * - For long distance/rtt flows, we would like to get ACK clocking * with 1 ACK per ms. * * Use min_rtt to help adapt TSO burst size, with smaller min_rtt resulting * in bigger TSO bursts. We we cut the RTT-based allowance in half * for every 2^9 usec (aka 512 us) of RTT, so that the RTT-based allowance * is below 1500 bytes after 6 * ~500 usec = 3ms. */ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, int min_tso_segs) { unsigned long bytes; u32 r; bytes = READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift); r = tcp_min_rtt(tcp_sk(sk)) >> READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tso_rtt_log); if (r < BITS_PER_TYPE(sk->sk_gso_max_size)) bytes += sk->sk_gso_max_size >> r; bytes = min_t(unsigned long, bytes, sk->sk_gso_max_size); return max_t(u32, bytes / mss_now, min_tso_segs); } /* Return the number of segments we want in the skb we are transmitting. * See if congestion control module wants to decide; otherwise, autosize. */ static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now) { const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; u32 min_tso, tso_segs; min_tso = ca_ops->min_tso_segs ? 
ca_ops->min_tso_segs(sk) : READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); tso_segs = tcp_tso_autosize(sk, mss_now, min_tso); return min_t(u32, tso_segs, sk->sk_gso_max_segs); } /* Returns the portion of skb which can be sent right away */ static unsigned int tcp_mss_split_point(const struct sock *sk, const struct sk_buff *skb, unsigned int mss_now, unsigned int max_segs, int nonagle) { const struct tcp_sock *tp = tcp_sk(sk); u32 partial, needed, window, max_len; window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; max_len = mss_now * max_segs; if (likely(max_len <= window && skb != tcp_write_queue_tail(sk))) return max_len; needed = min(skb->len, window); if (max_len <= needed) return max_len; partial = needed % mss_now; /* If last segment is not a full MSS, check if Nagle rules allow us * to include this last segment in this skb. * Otherwise, we'll split the skb at last MSS boundary */ if (tcp_nagle_check(partial != 0, tp, nonagle)) return needed - partial; return needed; } /* Can at least one segment of SKB be sent right now, according to the * congestion window rules? If so, return how many segments are allowed. */ static u32 tcp_cwnd_test(const struct tcp_sock *tp) { u32 in_flight, cwnd, halfcwnd; in_flight = tcp_packets_in_flight(tp); cwnd = tcp_snd_cwnd(tp); if (in_flight >= cwnd) return 0; /* For better scheduling, ensure we have at least * 2 GSO packets in flight. */ halfcwnd = max(cwnd >> 1, 1U); return min(halfcwnd, cwnd - in_flight); } /* Initialize TSO state of a skb. * This must be invoked the first time we consider transmitting * SKB onto the wire. */ static int tcp_init_tso_segs(struct sk_buff *skb, unsigned int mss_now) { int tso_segs = tcp_skb_pcount(skb); if (!tso_segs || (tso_segs > 1 && tcp_skb_mss(skb) != mss_now)) return tcp_set_skb_tso_segs(skb, mss_now); return tso_segs; } /* Return true if the Nagle test allows this packet to be * sent now. 
*/
static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb,
				  unsigned int cur_mss, int nonagle)
{
	/* Nagle rule does not apply to frames, which sit in the middle of the
	 * write_queue (they have no chances to get new data).
	 *
	 * This is implemented in the callers, where they modify the 'nonagle'
	 * argument based upon the location of SKB in the send queue.
	 */
	if (nonagle & TCP_NAGLE_PUSH)
		return true;

	/* Don't use the nagle rule for urgent data (or for the final FIN). */
	if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
		return true;

	/* The skb counts as partial when it is shorter than cur_mss. */
	if (!tcp_nagle_check(skb->len < cur_mss, tp, nonagle))
		return true;

	return false;
}

/* Does at least the first segment of SKB fit into the send window?
 *
 * For an skb longer than @cur_mss only the first @cur_mss bytes are
 * considered.
 */
static bool tcp_snd_wnd_test(const struct tcp_sock *tp,
			     const struct sk_buff *skb,
			     unsigned int cur_mss)
{
	u32 end_seq = TCP_SKB_CB(skb)->end_seq;

	if (skb->len > cur_mss)
		end_seq = TCP_SKB_CB(skb)->seq + cur_mss;

	return !after(end_seq, tcp_wnd_end(tp));
}

/* Trim TSO SKB to LEN bytes, put the remaining data into a new packet
 * which is put after SKB on the list.  It is very much like
 * tcp_fragment() except that it may make several kinds of assumptions
 * in order to speed up the splitting operation.  In particular, we
 * know that all the data is in scatter-gather pages, and that the
 * packet has never been sent out before (and thus is not cloned).
 *
 * Returns 0 on success or -ENOMEM when the new skb cannot be allocated.
 */
static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
			unsigned int mss_now, gfp_t gfp)
{
	/* Number of bytes that move to the new skb. */
	int nlen = skb->len - len;
	struct sk_buff *buff;
	u16 flags;

	/* All of a TSO frame must be composed of paged data.  */
	DEBUG_NET_WARN_ON_ONCE(skb->len != skb->data_len);

	buff = tcp_stream_alloc_skb(sk, gfp, true);
	if (unlikely(!buff))
		return -ENOMEM;
	skb_copy_decrypted(buff, skb);
	mptcp_skb_ext_copy(buff, skb);

	/* Account the new skb, then move the truesize of the split-off
	 * bytes from the original skb to the new one.
	 */
	sk_wmem_queued_add(sk, buff->truesize);
	sk_mem_charge(sk, buff->truesize);
	buff->truesize += nlen;
	skb->truesize -= nlen;

	/* Correct the sequence numbers. */
	TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
	TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;

	/* PSH and FIN should only be set in the second packet. */
	flags = TCP_SKB_CB(skb)->tcp_flags;
	TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
	TCP_SKB_CB(buff)->tcp_flags = flags;

	tcp_skb_fragment_eor(skb, buff);

	skb_split(skb, buff, len);
	tcp_fragment_tstamp(skb, buff);

	/* Fix up tso_factor for both original and new SKB.  */
	tcp_set_skb_tso_segs(skb, mss_now);
	tcp_set_skb_tso_segs(buff, mss_now);

	/* Link BUFF into the send queue. */
	__skb_header_release(buff);
	tcp_insert_write_queue_after(skb, buff, sk, TCP_FRAG_IN_WRITE_QUEUE);

	return 0;
}

/* Try to defer sending, if possible, in order to minimize the amount
 * of TSO splitting we do.  View it as a kind of TSO Nagle test.
 *
 * This algorithm is from John Heffner.
 *
 * Returns true when sending should be deferred, false to send now.
 * *is_cwnd_limited or *is_rwnd_limited is set to true when the deferral is
 * attributed to the congestion window or to the receive window; neither
 * flag is ever cleared here.
 */
static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
				 bool *is_cwnd_limited,
				 bool *is_rwnd_limited,
				 u32 max_segs)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	u32 send_win, cong_win, limit, in_flight;
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *head;
	int win_divisor;
	s64 delta;

	/* Never defer in recovery (or any later congestion state). */
	if (icsk->icsk_ca_state >= TCP_CA_Recovery)
		goto send_now;

	/* Avoid bursty behavior by allowing defer
	 * only if the last write was recent (1 ms).
	 * Note that tp->tcp_wstamp_ns can be in the future if we have
	 * packets waiting in a qdisc or device for EDT delivery.
	 */
	delta = tp->tcp_clock_cache - tp->tcp_wstamp_ns - NSEC_PER_MSEC;
	if (delta > 0)
		goto send_now;

	in_flight = tcp_packets_in_flight(tp);

	/* Callers must only get here with a multi-segment skb and with room
	 * left in the congestion window.
	 */
	BUG_ON(tcp_skb_pcount(skb) <= 1);
	BUG_ON(tcp_snd_cwnd(tp) <= in_flight);

	/* Bytes allowed by the send window, counted from this skb's start. */
	send_win = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;

	/* From in_flight test above, we know that cwnd > in_flight.  */
	cong_win = (tcp_snd_cwnd(tp) - in_flight) * tp->mss_cache;

	limit = min(send_win, cong_win);

	/* If a full-sized TSO skb can be sent, do it. */
	if (limit >= max_segs * tp->mss_cache)
		goto send_now;

	/* Middle in queue won't get any more data, full sendable already? */
	if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len))
		goto send_now;

	win_divisor = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tso_win_divisor);
	if (win_divisor) {
		u32 chunk = min(tp->snd_wnd, tcp_snd_cwnd(tp) * tp->mss_cache);

		/* If at least some fraction of a window is available,
		 * just use it.
		 */
		chunk /= win_divisor;
		if (limit >= chunk)
			goto send_now;
	} else {
		/* Different approach, try not to defer past a single
		 * ACK.  Receiver should ACK every other full sized
		 * frame, so if we have space for more than 3 frames
		 * then send now.
		 */
		if (limit > tcp_max_tso_deferred_mss(tp) * tp->mss_cache)
			goto send_now;
	}

	/* TODO : use tsorted_sent_queue ? */
	head = tcp_rtx_queue_head(sk);
	if (!head)
		goto send_now;
	/* Time elapsed since the timestamp of the rtx queue head. */
	delta = tp->tcp_clock_cache - head->tstamp;
	/* If next ACK is likely to come too late (half srtt), do not defer */
	if ((s64)(delta - (u64)NSEC_PER_USEC * (tp->srtt_us >> 4)) < 0)
		goto send_now;

	/* Ok, it looks like it is advisable to defer.
	 * Three cases are tracked :
	 * 1) We are cwnd-limited
	 * 2) We are rwnd-limited
	 * 3) We are application limited.
	 */
	if (cong_win < send_win) {
		if (cong_win <= skb->len) {
			*is_cwnd_limited = true;
			return true;
		}
	} else {
		if (send_win <= skb->len) {
			*is_rwnd_limited = true;
			return true;
		}
	}

	/* If this packet won't get more data, do not wait. */
	if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) ||
	    TCP_SKB_CB(skb)->eor)
		goto send_now;

	return true;

send_now:
	return false;
}

/* Restart the MTU probing search once sysctl_tcp_probe_interval seconds
 * have passed since the recorded probe timestamp: probe_size is cleared,
 * search_high goes back to the mss_clamp-based MTU, search_low becomes the
 * MTU of the current MSS and the probe timestamp is refreshed.
 */
static inline void tcp_mtu_check_reprobe(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct net *net = sock_net(sk);
	u32 interval;
	s32 delta;

	interval = READ_ONCE(net->ipv4.sysctl_tcp_probe_interval);
	delta = tcp_jiffies32 - icsk->icsk_mtup.probe_timestamp;
	if (unlikely(delta >= interval * HZ)) {
		int mss = tcp_current_mss(sk);

		/* Update current search range */
		icsk->icsk_mtup.probe_size = 0;
		icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp +
			sizeof(struct tcphdr) +
			icsk->icsk_af_ops->net_header_len;
		icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);

		/* Update probe time stamp */
		icsk->icsk_mtup.probe_timestamp = tcp_jiffies32;
	}
}

/* Check whether the first @len bytes at the send head can be merged into a
 * single skb.
 *
 * Walks the write queue from the send head. Every skb that has to be
 * consumed entirely must carry no tx timestamp and must be collapsible
 * with its successor; the walk stops at the skb that covers the remaining
 * bytes. Returns false as soon as one skb fails that test.
 */
static bool tcp_can_coalesce_send_queue_head(struct sock *sk, int len)
{
	struct sk_buff *skb, *next;

	skb = tcp_send_head(sk);
	tcp_for_write_queue_from_safe(skb, next, sk) {
		if (len <= skb->len)
			break;

		if (tcp_has_tx_tstamp(skb) || !tcp_skb_can_collapse(skb, next))
			return false;

		len -= skb->len;
	}

	return true;
}

/* Build the payload of an MTU probe in @to by referencing (not copying)
 * the first @probe_size bytes of page fragments found in sk_write_queue.
 *
 * Source fragments that directly continue the previously added fragment
 * (same page, adjacent offset) are merged into it.
 *
 * Returns 0 on success, -ENOMEM when the memory cannot be scheduled,
 * -EINVAL when a source skb has linear data, or -E2BIG when more than
 * MAX_SKB_FRAGS fragments would be needed. Page references are only taken
 * on the success path, after the fragment list is complete.
 */
static int tcp_clone_payload(struct sock *sk, struct sk_buff *to,
			     int probe_size)
{
	skb_frag_t *lastfrag = NULL, *fragto = skb_shinfo(to)->frags;
	int i, todo, len = 0, nr_frags = 0;
	const struct sk_buff *skb;

	if (!sk_wmem_schedule(sk, to->truesize + probe_size))
		return -ENOMEM;

	skb_queue_walk(&sk->sk_write_queue, skb) {
		const skb_frag_t *fragfrom = skb_shinfo(skb)->frags;

		if (skb_headlen(skb))
			return -EINVAL;

		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++, fragfrom++) {
			if (len >= probe_size)
				goto commit;
			/* Bytes to take from this fragment. */
			todo = min_t(int, skb_frag_size(fragfrom),
				     probe_size - len);
			len += todo;
			/* Contiguous with the last fragment: just extend it. */
			if (lastfrag &&
			    skb_frag_page(fragfrom) == skb_frag_page(lastfrag) &&
			    skb_frag_off(fragfrom) == skb_frag_off(lastfrag) +
						      skb_frag_size(lastfrag)) {
				skb_frag_size_add(lastfrag, todo);
				continue;
			}
			if (unlikely(nr_frags == MAX_SKB_FRAGS))
				return -E2BIG;
			skb_frag_page_copy(fragto, fragfrom);
			skb_frag_off_copy(fragto, fragfrom);
			skb_frag_size_set(fragto, todo);
			nr_frags++;
			lastfrag = fragto++;
		}
	}
commit:
	WARN_ON_ONCE(len != probe_size);
	for (i = 0; i < nr_frags; i++)
		skb_frag_ref(to, i);

	skb_shinfo(to)->nr_frags = nr_frags;
	to->truesize += probe_size;
	to->len += probe_size;
	to->data_len += probe_size;
	__skb_header_release(to);
	return 0;
}

/* tcp_mtu_probe() and tcp_grow_skb() can both eat an skb (src) if
 * all its payload was moved to another one (dst).
 * Make sure to transfer tcp_flags, eor, and tstamp.
 *
 * @src is unlinked from the write queue and freed.
 */
static void tcp_eat_one_skb(struct sock *sk,
			    struct sk_buff *dst,
			    struct sk_buff *src)
{
	TCP_SKB_CB(dst)->tcp_flags |= TCP_SKB_CB(src)->tcp_flags;
	TCP_SKB_CB(dst)->eor = TCP_SKB_CB(src)->eor;
	tcp_skb_collapse_tstamp(dst, src);
	tcp_unlink_write_queue(src, sk);
	tcp_wmem_free_skb(sk, src);
}

/* Create a new MTU probe if we are ready.
 * MTU probe is regularly attempting to increase the path MTU by
 * deliberately sending larger packets.  This discovers routing
 * changes resulting in larger path MTUs.
 *
 * Returns 0 if we should wait to probe (no cwnd available),
 *         1 if a probe was sent,
 *         -1 otherwise
 */
static int tcp_mtu_probe(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb, *nskb, *next;
	struct net *net = sock_net(sk);
	int probe_size;
	int size_needed;
	int copy, len;
	int mss_now;
	int interval;

	/* Not currently probing/verifying,
	 * not in recovery,
	 * have enough cwnd, and
	 * not SACKing (the variable headers throw things off)
	 */
	if (likely(!icsk->icsk_mtup.enabled ||
		   icsk->icsk_mtup.probe_size ||
		   inet_csk(sk)->icsk_ca_state != TCP_CA_Open ||
		   tcp_snd_cwnd(tp) < 11 ||
		   tp->rx_opt.num_sacks || tp->rx_opt.dsack))
		return -1;

	/* Use binary search for probe_size between tcp_mss_base,
	 * and current mss_clamp. if (search_high - search_low)
	 * smaller than a threshold, backoff from probing.
	 */
	mss_now = tcp_current_mss(sk);
	probe_size = tcp_mtu_to_mss(sk, (icsk->icsk_mtup.search_high +
				    icsk->icsk_mtup.search_low) >> 1);
	/* The probe plus (reordering + 1) regular segments must be available. */
	size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache;
	interval = icsk->icsk_mtup.search_high - icsk->icsk_mtup.search_low;
	/* When misfortune happens, we are reprobing actively,
	 * and then reprobe timer has expired. We stick with current
	 * probing process by not resetting search range to its original.
	 */
	if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high) ||
	    interval < READ_ONCE(net->ipv4.sysctl_tcp_probe_threshold)) {
		/* Check whether enough time has elapsed for
		 * another round of probing.
		 */
		tcp_mtu_check_reprobe(sk);
		return -1;
	}

	/* Have enough data in the send queue to probe? */
	if (tp->write_seq - tp->snd_nxt < size_needed)
		return -1;

	if (tp->snd_wnd < size_needed)
		return -1;
	if (after(tp->snd_nxt + size_needed, tcp_wnd_end(tp)))
		return 0;

	/* Do we need to wait to drain cwnd? With none in flight, don't stall */
	if (tcp_packets_in_flight(tp) + 2 > tcp_snd_cwnd(tp)) {
		if (!tcp_packets_in_flight(tp))
			return -1;
		else
			return 0;
	}

	if (!tcp_can_coalesce_send_queue_head(sk, probe_size))
		return -1;

	/* We're allowed to probe.  Build it now. */
	nskb = tcp_stream_alloc_skb(sk, GFP_ATOMIC, false);
	if (!nskb)
		return -1;

	/* build the payload, and be prepared to abort if this fails. */
	if (tcp_clone_payload(sk, nskb, probe_size)) {
		tcp_skb_tsorted_anchor_cleanup(nskb);
		consume_skb(nskb);
		return -1;
	}
	sk_wmem_queued_add(sk, nskb->truesize);
	sk_mem_charge(sk, nskb->truesize);

	skb = tcp_send_head(sk);
	skb_copy_decrypted(nskb, skb);
	mptcp_skb_ext_copy(nskb, skb);

	/* The probe covers probe_size bytes starting at the send head. */
	TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;
	TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size;
	TCP_SKB_CB(nskb)->tcp_flags = TCPHDR_ACK;

	tcp_insert_write_queue_before(nskb, skb, sk);
	tcp_highest_sack_replace(sk, skb, nskb);

	/* Remove from the following skbs the payload now referenced by the
	 * probe: fully covered skbs are eaten, the last one is trimmed.
	 */
	len = 0;
	tcp_for_write_queue_from_safe(skb, next, sk) {
		copy = min_t(int, skb->len, probe_size - len);

		if (skb->len <= copy) {
			tcp_eat_one_skb(sk, nskb, skb);
		} else {
			/* FIN and PSH stay with the remainder of this skb. */
			TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags &
						   ~(TCPHDR_FIN|TCPHDR_PSH);
			__pskb_trim_head(skb, copy);
			tcp_set_skb_tso_segs(skb, mss_now);
			TCP_SKB_CB(skb)->seq += copy;
		}

		len += copy;

		if (len >= probe_size)
			break;
	}
	tcp_init_tso_segs(nskb, nskb->len);

	/* We're ready to send.  If this fails, the probe will
	 * be resegmented into mss-sized pieces by tcp_write_xmit().
	 */
	if (!tcp_transmit_skb(sk, nskb, 1, GFP_ATOMIC)) {
		/* Decrement cwnd here because we are sending
		 * effectively two packets. */
		tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) - 1);
		tcp_event_new_data_sent(sk, nskb);

		/* Remember the probed MTU and the sequence range of the probe. */
		icsk->icsk_mtup.probe_size = tcp_mss_to_mtu(sk, nskb->len);
		tp->mtu_probe.probe_seq_start = TCP_SKB_CB(nskb)->seq;
		tp->mtu_probe.probe_seq_end = TCP_SKB_CB(nskb)->end_seq;

		return 1;
	}

	return -1;
}

/* Returns true when transmission has to wait for the pacing timer.
 *
 * That is the case when the socket needs internal pacing and
 * tp->tcp_wstamp_ns is still ahead of the cached clock. The pacing hrtimer
 * is armed for that time unless it is already queued; a socket reference
 * is taken when the timer is armed here.
 */
static bool tcp_pacing_check(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (!tcp_needs_internal_pacing(sk))
		return false;

	if (tp->tcp_wstamp_ns <= tp->tcp_clock_cache)
		return false;

	if (!hrtimer_is_queued(&tp->pacing_timer)) {
		hrtimer_start(&tp->pacing_timer,
			      ns_to_ktime(tp->tcp_wstamp_ns),
			      HRTIMER_MODE_ABS_PINNED_SOFT);
		sock_hold(sk);
	}
	return true;
}

/* True when the rtx queue (an rb-tree) holds no skb or exactly one skb. */
static bool tcp_rtx_queue_empty_or_single_skb(const struct sock *sk)
{
	const struct rb_node *node = sk->tcp_rtx_queue.rb_node;

	/* No skb in the rtx queue. */
	if (!node)
		return true;

	/* Only one skb in rtx queue. */
	return !node->rb_left && !node->rb_right;
}

/* TCP Small Queues :
 * Control number of packets in qdisc/devices to two packets / or ~1 ms.
 * (These limits are doubled for retransmits)
 * This allows for :
 *  - better RTT estimation and ACK scheduling
 *  - faster recovery
 *  - high rates
 * Alas, some drivers / subsystems require a fair amount
 * of queued bytes to ensure line rate.
 * One example is wifi aggregation (802.11 AMPDU)
 *
 * @factor: the computed limit is shifted left by this many bits.
 *
 * Returns true when the skb must not be sent now because sk_wmem_alloc
 * exceeds the limit; TSQ_THROTTLED has been set in that case.
 */
static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
				  unsigned int factor)
{
	unsigned long limit;

	limit = max_t(unsigned long,
		      2 * skb->truesize,
		      READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift));
	/* The sysctl cap only applies when no pacing is in effect. */
	if (sk->sk_pacing_status == SK_PACING_NONE)
		limit = min_t(unsigned long, limit,
			      READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes));
	limit <<= factor;

	if (static_branch_unlikely(&tcp_tx_delay_enabled) &&
	    tcp_sk(sk)->tcp_tx_delay) {
		u64 extra_bytes = (u64)READ_ONCE(sk->sk_pacing_rate) *
				  tcp_sk(sk)->tcp_tx_delay;

		/* TSQ is based on skb truesize sum (sk_wmem_alloc), so we
		 * approximate our needs assuming an ~100% skb->truesize overhead.
		 * USEC_PER_SEC is approximated by 2^20.
		 * do_div(extra_bytes, USEC_PER_SEC/2) is replaced by a right shift.
		 */
		extra_bytes >>= (20 - 1);
		limit += extra_bytes;
	}
	if (refcount_read(&sk->sk_wmem_alloc) > limit) {
		/* Always send skb if rtx queue is empty or has one skb.
		 * No need to wait for TX completion to call us back,
		 * after softirq/tasklet schedule.
		 * This helps when TX completions are delayed too much.
		 */
		if (tcp_rtx_queue_empty_or_single_skb(sk))
			return false;

		set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
		/* It is possible TX completion already happened
		 * before we set TSQ_THROTTLED, so we must
		 * test again the condition.
		 */
		smp_mb__after_atomic();
		if (refcount_read(&sk->sk_wmem_alloc) > limit)
			return true;
	}
	return false;
}

/* Switch the chronograph to @new: the time spent in the previous type (if
 * it is above TCP_CHRONO_UNSPEC) is added to its chrono_stat slot, then
 * the start time and the current type are updated.
 */
static void tcp_chrono_set(struct tcp_sock *tp, const enum tcp_chrono new)
{
	const u32 now = tcp_jiffies32;
	enum tcp_chrono old = tp->chrono_type;

	if (old > TCP_CHRONO_UNSPEC)
		tp->chrono_stat[old - 1] += now - tp->chrono_start;
	tp->chrono_start = now;
	tp->chrono_type = new;
}

/* Start tracking @type, but only if it ranks above the chronograph type
 * that is currently tracked.
 */
void tcp_chrono_start(struct sock *sk, const enum tcp_chrono type)
{
	struct tcp_sock *tp = tcp_sk(sk);

	/* If there are multiple conditions worthy of tracking in a
	 * chronograph then the highest priority enum takes precedence
	 * over the other conditions. So that if something "more interesting"
	 * starts happening, stop the previous chrono and start a new one.
	 */
	if (type > tp->chrono_type)
		tcp_chrono_set(tp, type);
}

/* Stop tracking @type.
 *
 * With both the rtx and the write queue empty the chronograph goes back to
 * TCP_CHRONO_UNSPEC regardless of @type; otherwise it falls back to
 * TCP_CHRONO_BUSY only when @type is the type currently tracked.
 */
void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type)
{
	struct tcp_sock *tp = tcp_sk(sk);


	/* There are multiple conditions worthy of tracking in a
	 * chronograph, so that the highest priority enum takes
	 * precedence over the other conditions (see tcp_chrono_start).
	 * If a condition stops, we only stop chrono tracking if
	 * it's the "most interesting" or current chrono we are
	 * tracking and starts busy chrono if we have pending data.
	 */
	if (tcp_rtx_and_write_queues_empty(sk))
		tcp_chrono_set(tp, TCP_CHRONO_UNSPEC);
	else if (type == tp->chrono_type)
		tcp_chrono_set(tp, TCP_CHRONO_BUSY);
}

/* First skb in the write queue is smaller than ideal packet size.
 * Check if we can move payload from the second skb in the queue.
*/
static void tcp_grow_skb(struct sock *sk, struct sk_buff *skb, int amount)
{
	struct sk_buff *next_skb = skb->next;
	unsigned int nlen;

	/* Nothing to pull from when skb is the last one in the queue. */
	if (tcp_skb_is_last(sk, skb))
		return;

	if (!tcp_skb_can_collapse(skb, next_skb))
		return;

	/* Move at most @amount bytes, bounded by what next_skb holds. */
	nlen = min_t(u32, amount, next_skb->len);
	if (!nlen || !skb_shift(skb, next_skb, nlen))
		return;

	TCP_SKB_CB(skb)->end_seq += nlen;
	TCP_SKB_CB(next_skb)->seq += nlen;

	if (!next_skb->len) {
		/* In case FIN is set, we need to update end_seq */
		TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;

		tcp_eat_one_skb(sk, skb, next_skb);
	}
}

/* This routine writes packets to the network.  It advances the
 * send_head.  This happens as incoming acks open up the remote
 * window for us.
 *
 * LARGESEND note: !tcp_urg_mode is overkill, only frames between
 * snd_up-64k-mss .. snd_up cannot be large. However, taking into
 * account rare use of URG, this is not a big flaw.
 *
 * Send at most one packet when push_one > 0. Temporarily ignore
 * cwnd limit to force at most one packet out when push_one == 2.
 * Returns true, if no segments are in flight and we have queued segments,
 * but cannot send anything now because of SWS or another problem.
 *
 * @mss_now:  MSS used to size and split the segments
 * @nonagle:  Nagle flags; only applied to the last skb of the queue
 * @push_one: 0 to send as much as allowed, see above for 1 and 2
 * @gfp:      allocation flags for skb splitting and transmission
 */
static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
			   int push_one, gfp_t gfp)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	unsigned int tso_segs, sent_pkts;
	u32 cwnd_quota, max_segs;
	int result;
	bool is_cwnd_limited = false, is_rwnd_limited = false;

	sent_pkts = 0;

	tcp_mstamp_refresh(tp);
	if (!push_one) {
		/* Do MTU probing. */
		result = tcp_mtu_probe(sk);
		if (!result) {
			/* The prober asked us to wait. */
			return false;
		} else if (result > 0) {
			sent_pkts = 1;
		}
	}

	max_segs = tcp_tso_segs(sk, mss_now);
	while ((skb = tcp_send_head(sk))) {
		unsigned int limit;
		int missing_bytes;

		if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
			/* "skb_mstamp_ns" is used as a start point for the retransmit timer */
			tp->tcp_wstamp_ns = tp->tcp_clock_cache;
			skb_set_delivery_time(skb, tp->tcp_wstamp_ns, SKB_CLOCK_MONOTONIC);
			list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
			tcp_init_tso_segs(skb, mss_now);
			goto repair; /* Skip network transmission */
		}

		if (tcp_pacing_check(sk))
			break;

		cwnd_quota = tcp_cwnd_test(tp);
		if (!cwnd_quota) {
			if (push_one == 2)
				/* Force out a loss probe pkt. */
				cwnd_quota = 1;
			else
				break;
		}
		cwnd_quota = min(cwnd_quota, max_segs);
		/* Try to fill the skb up to the quota from the next skb. */
		missing_bytes = cwnd_quota * mss_now - skb->len;
		if (missing_bytes > 0)
			tcp_grow_skb(sk, skb, missing_bytes);

		tso_segs = tcp_set_skb_tso_segs(skb, mss_now);

		if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) {
			is_rwnd_limited = true;
			break;
		}

		if (tso_segs == 1) {
			/* Only the last skb of the queue is subject to Nagle. */
			if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
						     (tcp_skb_is_last(sk, skb) ?
						      nonagle : TCP_NAGLE_PUSH))))
				break;
		} else {
			if (!push_one &&
			    tcp_tso_should_defer(sk, skb, &is_cwnd_limited,
						 &is_rwnd_limited, max_segs))
				break;
		}

		/* Number of bytes of this skb that may go out now. */
		limit = mss_now;
		if (tso_segs > 1 && !tcp_urg_mode(tp))
			limit = tcp_mss_split_point(sk, skb, mss_now,
						    cwnd_quota,
						    nonagle);

		if (skb->len > limit &&
		    unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
			break;

		if (tcp_small_queue_check(sk, skb, 0))
			break;

		/* Argh, we hit an empty skb(), presumably a thread
		 * is sleeping in sendmsg()/sk_stream_wait_memory().
		 * We do not want to send a pure-ack packet and have
		 * a strange looking rtx queue with empty packet(s).
		 */
		if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq)
			break;

		if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp)))
			break;

repair:
		/* Advance the send_head.  This one is sent out.
		 * This call will increment packets_out.
		 */
		tcp_event_new_data_sent(sk, skb);

		tcp_minshall_update(tp, mss_now, skb);
		sent_pkts += tcp_skb_pcount(skb);

		if (push_one)
			break;
	}

	if (is_rwnd_limited)
		tcp_chrono_start(sk, TCP_CHRONO_RWND_LIMITED);
	else
		tcp_chrono_stop(sk, TCP_CHRONO_RWND_LIMITED);

	is_cwnd_limited |= (tcp_packets_in_flight(tp) >= tcp_snd_cwnd(tp));
	if (likely(sent_pkts || is_cwnd_limited))
		tcp_cwnd_validate(sk, is_cwnd_limited);

	if (likely(sent_pkts)) {
		if (tcp_in_cwnd_reduction(sk))
			tp->prr_out += sent_pkts;

		/* Send one loss probe per tail loss episode. */
		if (push_one != 2)
			tcp_schedule_loss_probe(sk, false);
		return false;
	}
	return !tp->packets_out && !tcp_write_queue_empty(sk);
}

/* Arm the loss probe timer (ICSK_TIME_LOSS_PROBE) if the conditions allow.
 *
 * @advancing_rto: when true, the full icsk_rto is used as the RTO bound
 *                 instead of the value from tcp_rto_delta_us().
 *
 * Returns true if the timer was armed, false if no probe is scheduled.
 */
bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	u32 timeout, timeout_us, rto_delta_us;
	int early_retrans;

	/* Don't do any loss probe on a Fast Open connection before 3WHS
	 * finishes.
	 */
	if (rcu_access_pointer(tp->fastopen_rsk))
		return false;

	early_retrans = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_early_retrans);
	/* Schedule a loss probe in 2*RTT for SACK capable connections
	 * not in loss recovery, that are either limited by cwnd or application.
	 */
	if ((early_retrans != 3 && early_retrans != 4) ||
	    !tp->packets_out || !tcp_is_sack(tp) ||
	    (icsk->icsk_ca_state != TCP_CA_Open &&
	     icsk->icsk_ca_state != TCP_CA_CWR))
		return false;

	/* Probe timeout is 2*rtt. Add minimum RTO to account
	 * for delayed ack when there's one outstanding packet. If no RTT
	 * sample is available then probe after TCP_TIMEOUT_INIT.
	 */
	if (tp->srtt_us) {
		/* NOTE(review): srtt_us is presumably kept left-shifted by 3,
		 * which would make ">> 2" equal to 2*RTT - confirm.
		 */
		timeout_us = tp->srtt_us >> 2;
		if (tp->packets_out == 1)
			timeout_us += tcp_rto_min_us(sk);
		else
			timeout_us += TCP_TIMEOUT_MIN_US;
		timeout = usecs_to_jiffies(timeout_us);
	} else {
		timeout = TCP_TIMEOUT_INIT;
	}

	/* If the RTO formula yields an earlier time, then use that time. */
	rto_delta_us = advancing_rto ?
			jiffies_to_usecs(inet_csk(sk)->icsk_rto) :
			tcp_rto_delta_us(sk);  /* How far in future is RTO? */
	if (rto_delta_us > 0)
		timeout = min_t(u32, timeout, usecs_to_jiffies(rto_delta_us));

	tcp_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout, true);
	return true;
}

/* Thanks to skb fast clones, we can detect if a prior transmit of
 * a packet is still in a qdisc or driver queue.
 * In this case, there is very little point doing a retransmit !
 *
 * Returns true when the skb is still busy after TSQ_THROTTLED was set and
 * the check was repeated; the spurious-retransmit counter is bumped then.
 */
static bool skb_still_in_host_queue(struct sock *sk,
				    const struct sk_buff *skb)
{
	if (unlikely(skb_fclone_busy(sk, skb))) {
		set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
		/* Order the flag update before the second busy check. */
		smp_mb__after_atomic();
		if (skb_fclone_busy(sk, skb)) {
			NET_INC_STATS(sock_net(sk),
				      LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
			return true;
		}
	}
	return false;
}

/* When probe timeout (PTO) fires, try send a new segment if possible, else
 * retransmit the last segment.
 *
 * On a successful probe tlp_high_seq records snd_nxt and the pending timer
 * is cleared before the RTO is re-armed. In every other case except the
 * "invalid inflight" one the RTO is simply re-armed.
 */
void tcp_send_loss_probe(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	int pcount;
	int mss = tcp_current_mss(sk);

	/* At most one outstanding TLP */
	if (tp->tlp_high_seq)
		goto rearm_timer;

	tp->tlp_retrans = 0;
	skb = tcp_send_head(sk);
	if (skb && tcp_snd_wnd_test(tp, skb, mss)) {
		/* New data fits the send window: try to send one new segment. */
		pcount = tp->packets_out;
		tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC);
		if (tp->packets_out > pcount)
			goto probe_sent;
		goto rearm_timer;
	}
	/* Otherwise retransmit the last skb of the rtx queue. */
	skb = skb_rb_last(&sk->tcp_rtx_queue);
	if (unlikely(!skb)) {
		tcp_warn_once(sk, tp->packets_out, "invalid inflight: ");
		smp_store_release(&inet_csk(sk)->icsk_pending, 0);
		return;
	}

	if (skb_still_in_host_queue(sk, skb))
		goto rearm_timer;

	pcount = tcp_skb_pcount(skb);
	if (WARN_ON(!pcount))
		goto rearm_timer;

	/* For a multi-segment skb only the last segment is retransmitted:
	 * split it off and continue with the new skb.
	 */
	if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) {
		if (unlikely(tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
					  (pcount - 1) * mss, mss,
					  GFP_ATOMIC)))
			goto rearm_timer;
		skb = skb_rb_next(skb);
	}

	if (WARN_ON(!skb || !tcp_skb_pcount(skb)))
		goto rearm_timer;

	if (__tcp_retransmit_skb(sk, skb, 1))
		goto rearm_timer;

	tp->tlp_retrans = 1;

probe_sent:
	/* Record snd_nxt for loss detection. */
	tp->tlp_high_seq = tp->snd_nxt;

	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSSPROBES);
	/* Reset s.t. tcp_rearm_rto will restart timer from now */
	smp_store_release(&inet_csk(sk)->icsk_pending, 0);
rearm_timer:
	tcp_rearm_rto(sk);
}

/* Push out any pending frames which were held back due to
 * TCP_CORK or attempt at coalescing tiny packets.
 * The socket must be locked by the caller.
 *
 * The probe timer is checked when tcp_write_xmit() reports that nothing is
 * in flight while segments remain queued.
 */
void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
			       int nonagle)
{
	/* If we are closed, the bytes will have to remain here.
	 * In time closedown will finish, we empty the write queue and
	 * all will be happy.
	 */
	if (unlikely(sk->sk_state == TCP_CLOSE))
		return;

	if (tcp_write_xmit(sk, cur_mss, nonagle, 0,
			   sk_gfp_mask(sk, GFP_ATOMIC)))
		tcp_check_probe_timer(sk);
}

/* Send _single_ skb sitting at the send head. This function requires
 * true push pending frames to setup probe timer etc.
 *
 * It is a BUG to call this without a send head or with a send head
 * shorter than @mss_now.
 */
void tcp_push_one(struct sock *sk, unsigned int mss_now)
{
	struct sk_buff *skb = tcp_send_head(sk);

	BUG_ON(!skb || skb->len < mss_now);

	tcp_write_xmit(sk, mss_now, TCP_NAGLE_PUSH, 1, sk->sk_allocation);
}

/* This function returns the amount that we can raise the
 * usable window based on the following constraints
 *
 * 1. The window can never be shrunk once it is offered (RFC 793)
 * 2. We limit memory per socket
 *
 * RFC 1122:
 * "the suggested [SWS] avoidance algorithm for the receiver is to keep
 *  RECV.NEXT + RCV.WIN fixed until:
 *  RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)"
 *
 * i.e. don't raise the right edge of the window until you can raise
 * it at least MSS bytes.
 *
 * Unfortunately, the recommended algorithm breaks header prediction,
 * since header prediction assumes th->window stays fixed.
 *
 * Strictly speaking, keeping th->window fixed violates the receiver
 * side SWS prevention criteria. The problem is that under this rule
 * a stream of single byte packets will cause the right side of the
 * window to always advance by a single byte.
*
 * Of course, if the sender implements sender side SWS prevention
 * then this will not be a problem.
 *
 * BSD seems to make the following compromise:
 *
 * If the free space is less than the 1/4 of the maximum
 * space available and the free space is less than 1/2 mss,
 * then set the window to 0.
 * [ Actually, bsd uses MSS and 1/4 of maximal _window_ ]
 * Otherwise, just prevent the window from shrinking
 * and from being larger than the largest representable value.
 *
 * This prevents incremental opening of the window in the regime
 * where TCP is limited by the speed of the reader side taking
 * data out of the TCP receive queue. It does nothing about
 * those cases where the window is constrained on the sender side
 * because the pipeline is full.
 *
 * BSD also seems to "accidentally" limit itself to windows that are a
 * multiple of MSS, at least until the free space gets quite small.
 * This would appear to be a side effect of the mbuf implementation.
 * Combining these two algorithms results in the observed behavior
 * of having a fixed window size at almost all times.
 *
 * Below we obtain similar behavior by forcing the offered window to
 * a multiple of the mss when it is feasible to do so.
 *
 * Note, we don't "adjust" for TIMESTAMP or SACK option bytes.
 * Regular options like TIMESTAMP are taken into account.
 */
u32 __tcp_select_window(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct net *net = sock_net(sk);
	/* MSS for the peer's data.  Previous versions used mss_clamp
	 * here.  I don't know if the value based on our guesses
	 * of peer's MSS is better for the performance.  It's more correct
	 * but may be worse for the performance because of rcv_mss
	 * fluctuations.  --SAW  1998/11/1
	 */
	int mss = icsk->icsk_ack.rcv_mss;
	int free_space = tcp_space(sk);
	int allowed_space = tcp_full_space(sk);
	int full_space, window;

	/* MPTCP sockets get both space values from mptcp_space() instead. */
	if (sk_is_mptcp(sk))
		mptcp_space(sk, &free_space, &allowed_space);

	full_space = min_t(int, tp->window_clamp, allowed_space);

	/* Never work with an mss larger than the whole usable space; with
	 * no usable space at all the answer is a zero window.
	 */
	if (unlikely(mss > full_space)) {
		mss = full_space;
		if (mss <= 0)
			return 0;
	}

	/* Only allow window shrink if the sysctl is enabled and we have
	 * a non-zero scaling factor in effect.
	 */
	if (READ_ONCE(net->ipv4.sysctl_tcp_shrink_window) && tp->rx_opt.rcv_wscale)
		goto shrink_window_allowed;

	/* do not allow window to shrink */

	if (free_space < (full_space >> 1)) {
		icsk->icsk_ack.quick = 0;

		if (tcp_under_memory_pressure(sk))
			tcp_adjust_rcv_ssthresh(sk);

		/* free_space might become our new window, make sure we don't
		 * increase it due to wscale.
		 */
		free_space = round_down(free_space, 1 << tp->rx_opt.rcv_wscale);

		/* if free space is less than mss estimate, or is below 1/16th
		 * of the maximum allowed, try to move to zero-window, else
		 * tcp_clamp_window() will grow rcv buf up to tcp_rmem[2], and
		 * new incoming data is dropped due to memory limits.
		 * With large window, mss test triggers way too late in order
		 * to announce zero window in time before rmem limit kicks in.
		 */
		if (free_space < (allowed_space >> 4) || free_space < mss)
			return 0;
	}

	if (free_space > tp->rcv_ssthresh)
		free_space = tp->rcv_ssthresh;

	/* Don't do rounding if we are using window scaling, since the
	 * scaled window will not line up with the MSS boundary anyway.
	 */
	if (tp->rx_opt.rcv_wscale) {
		window = free_space;

		/* Advertise enough space so that it won't get scaled away.
		 * Important case: prevent zero window announcement if
		 * 1<<rcv_wscale > mss.
		 */
		window = ALIGN(window, (1 << tp->rx_opt.rcv_wscale));
	} else {
		window = tp->rcv_wnd;
		/* Get the largest window that is a nice multiple of mss.
		 * Window clamp already applied above.
		 * If our current window offering is within 1 mss of the
		 * free space we just keep it. This prevents the divide
		 * and multiply from happening most of the time.
		 * We also don't do any window rounding when the free space
		 * is too small.
		 */
		if (window <= free_space - mss || window > free_space)
			window = rounddown(free_space, mss);
		else if (mss == full_space &&
			 free_space > window + (full_space >> 1))
			window = free_space;
	}

	return window;

shrink_window_allowed:
	/* new window should always be an exact multiple of scaling factor */
	free_space = round_down(free_space, 1 << tp->rx_opt.rcv_wscale);

	if (free_space < (full_space >> 1)) {
		icsk->icsk_ack.quick = 0;

		if (tcp_under_memory_pressure(sk))
			tcp_adjust_rcv_ssthresh(sk);

		/* if free space is too low, return a zero window */
		if (free_space < (allowed_space >> 4) || free_space < mss ||
			free_space < (1 << tp->rx_opt.rcv_wscale))
			return 0;
	}

	if (free_space > tp->rcv_ssthresh) {
		free_space = tp->rcv_ssthresh;
		/* new window should always be an exact multiple of scaling factor
		 *
		 * For this case, we ALIGN "up" (increase free_space) because
		 * we know free_space is not zero here, it has been reduced from
		 * the memory-based limit, and rcv_ssthresh is not a hard limit
		 * (unlike sk_rcvbuf).
		 */
		free_space = ALIGN(free_space, (1 << tp->rx_opt.rcv_wscale));
	}

	return free_space;
}

/* Carry a TX timestamp request from @next_skb over to @skb.
 *
 * Does nothing unless tcp_has_tx_tstamp(@next_skb) is true. Otherwise the
 * SKBTX_ANY_TSTAMP bits of @next_skb are OR-ed into @skb's tx_flags, @skb
 * takes over @next_skb's tskey, and txstamp_ack is OR-ed in as well.
 */
void tcp_skb_collapse_tstamp(struct sk_buff *skb,
			     const struct sk_buff *next_skb)
{
	if (unlikely(tcp_has_tx_tstamp(next_skb))) {
		const struct skb_shared_info *next_shinfo =
			skb_shinfo(next_skb);
		struct skb_shared_info *shinfo = skb_shinfo(skb);

		shinfo->tx_flags |= next_shinfo->tx_flags & SKBTX_ANY_TSTAMP;
		shinfo->tskey = next_shinfo->tskey;
		TCP_SKB_CB(skb)->txstamp_ack |=
			TCP_SKB_CB(next_skb)->txstamp_ack;
	}
}

/* Collapses two adjacent SKB's during retransmission.
*/ static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *next_skb = skb_rb_next(skb); int next_skb_size; next_skb_size = next_skb->len; BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1); if (next_skb_size && !tcp_skb_shift(skb, next_skb, 1, next_skb_size)) return false; tcp_highest_sack_replace(sk, next_skb, skb); /* Update sequence range on original skb. */ TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq; /* Merge over control information. This moves PSH/FIN etc. over */ TCP_SKB_CB(skb)->tcp_flags |= TCP_SKB_CB(next_skb)->tcp_flags; /* All done, get rid of second SKB and account for it so * packet counting does not break. */ TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked & TCPCB_EVER_RETRANS; TCP_SKB_CB(skb)->eor = TCP_SKB_CB(next_skb)->eor; /* changed transmit queue under us so clear hints */ tcp_clear_retrans_hints_partial(tp); if (next_skb == tp->retransmit_skb_hint) tp->retransmit_skb_hint = skb; tcp_adjust_pcount(sk, next_skb, tcp_skb_pcount(next_skb)); tcp_skb_collapse_tstamp(skb, next_skb); tcp_rtx_queue_unlink_and_free(next_skb, sk); return true; } /* Check if coalescing SKBs is legal. */ static bool tcp_can_collapse(const struct sock *sk, const struct sk_buff *skb) { if (tcp_skb_pcount(skb) > 1) return false; if (skb_cloned(skb)) return false; if (!skb_frags_readable(skb)) return false; /* Some heuristics for collapsing over SACK'd could be invented */ if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) return false; return true; } /* Collapse packets in the retransmit queue to make to create * less packets on the wire. This is only done on retransmission. 
*/
static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
				     int space)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb = to, *tmp;
	bool first = true;

	/* Collapsing can be switched off per netns. */
	if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_retrans_collapse))
		return;
	/* Never merge anything into a SYN. */
	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
		return;

	skb_rbtree_walk_from_safe(skb, tmp) {
		if (!tcp_can_collapse(sk, skb))
			break;

		if (!tcp_skb_can_collapse(to, skb))
			break;

		/* @space is the remaining byte budget; it goes negative as
		 * soon as the candidates no longer fit.
		 */
		space -= skb->len;

		/* The walk starts at @to itself: only its length is
		 * accounted, there is nothing to merge yet.
		 */
		if (first) {
			first = false;
			continue;
		}

		if (space < 0)
			break;

		/* Do not grow @to beyond the end of the send window. */
		if (after(TCP_SKB_CB(skb)->end_seq, tcp_wnd_end(tp)))
			break;

		if (!tcp_collapse_retrans(sk, to))
			break;
	}
}

/* This retransmits one SKB.  Policy decisions and retransmit queue
 * state updates are done by the caller.  Returns non-zero if an
 * error occurred which prevented the send.
 *
 * @segs is the number of segments the caller allows to be sent; the skb
 * is fragmented when it is longer than what @segs and the available
 * window permit.
 *
 * Error returns produced here: -EBUSY (a previous transmit is still in a
 * host queue), -EINVAL (skb entirely below snd_una), -ENOMEM (trim,
 * fragment or unclone failed), -EHOSTUNREACH (rebuild_header() failed),
 * -EAGAIN (skb outside the window and not at snd_una), -ENOBUFS (copy for
 * realignment failed); otherwise the result of tcp_transmit_skb().
 */
int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	unsigned int cur_mss;
	int diff, len, err;
	int avail_wnd;

	/* Inconclusive MTU probe */
	if (icsk->icsk_mtup.probe_size)
		icsk->icsk_mtup.probe_size = 0;

	if (skb_still_in_host_queue(sk, skb))
		return -EBUSY;

start:
	if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {
		/* The skb starts below snd_una. If it carries SYN, drop the
		 * flag, step over its sequence number and re-evaluate.
		 */
		if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
			TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_SYN;
			TCP_SKB_CB(skb)->seq++;
			goto start;
		}
		if (unlikely(before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))) {
			WARN_ON_ONCE(1);
			return -EINVAL;
		}
		/* Cut off the part that lies below snd_una. */
		if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
			return -ENOMEM;
	}

	if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
		return -EHOSTUNREACH; /* Routing failure or similar. */

	cur_mss = tcp_current_mss(sk);
	avail_wnd = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;

	/* If receiver has shrunk his window, and skb is out of
	 * new window, do not retransmit it. The exception is the
	 * case, when window is shrunk to zero. In this case
	 * our retransmit of one segment serves as a zero window probe.
	 */
	if (avail_wnd <= 0) {
		if (TCP_SKB_CB(skb)->seq != tp->snd_una)
			return -EAGAIN;
		avail_wnd = cur_mss;
	}

	/* Limit the send to @segs segments, and further to the available
	 * window (rounded down to whole segments where possible).
	 */
	len = cur_mss * segs;
	if (len > avail_wnd) {
		len = rounddown(avail_wnd, cur_mss);
		if (!len)
			len = avail_wnd;
	}
	if (skb->len > len) {
		if (tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, len,
				 cur_mss, GFP_ATOMIC))
			return -ENOMEM; /* We'll try again later. */
	} else {
		if (skb_unclone_keeptruesize(skb, GFP_ATOMIC))
			return -ENOMEM;

		/* Recompute the segment count for the current MSS and
		 * pass any change on to tcp_adjust_pcount().
		 */
		diff = tcp_skb_pcount(skb);
		tcp_set_skb_tso_segs(skb, cur_mss);
		diff -= tcp_skb_pcount(skb);
		if (diff)
			tcp_adjust_pcount(sk, skb, diff);
		avail_wnd = min_t(int, avail_wnd, cur_mss);
		if (skb->len < avail_wnd)
			tcp_retrans_try_collapse(sk, skb, avail_wnd);
	}

	/* RFC3168, section 6.1.1.1. ECN fallback */
	if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN)
		tcp_ecn_clear_syn(sk, skb);

	/* Update global and local TCP statistics. */
	segs = tcp_skb_pcount(skb);
	TCP_ADD_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS, segs);
	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
	tp->total_retrans += segs;
	tp->bytes_retrans += skb->len;

	/* make sure skb->data is aligned on arches that require it
	 * and check if ack-trimming & collapsing extended the headroom
	 * beyond what csum_start can cover.
	 */
	if (unlikely((NET_IP_ALIGN && ((unsigned long)skb->data & 3)) ||
		     skb_headroom(skb) >= 0xFFFF)) {
		struct sk_buff *nskb;

		/* Transmit a copy instead of the queued skb itself. */
		tcp_skb_tsorted_save(skb) {
			nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC);
			if (nskb) {
				nskb->dev = NULL;
				err = tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC);
			} else {
				err = -ENOBUFS;
			}
		} tcp_skb_tsorted_restore(skb);

		/* The copy was sent, so the post-send bookkeeping has to
		 * be applied to the original skb by hand.
		 */
		if (!err) {
			tcp_update_skb_after_send(sk, skb, tp->tcp_wstamp_ns);
			tcp_rate_skb_sent(sk, skb);
		}
	} else {
		err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
	}

	if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RETRANS_CB_FLAG))
		tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RETRANS_CB,
				  TCP_SKB_CB(skb)->seq, segs, err);

	/* -EBUSY is not counted as a retransmit failure. */
	if (likely(!err)) {
		trace_tcp_retransmit_skb(sk, skb);
	} else if (err != -EBUSY) {
		NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL, segs);
	}

	/* To avoid taking spuriously low RTT samples based on a timestamp
	 * for a transmit that never happened, always mark EVER_RETRANS
	 */
	TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;

	return err;
}

/* Retransmit @skb via __tcp_retransmit_skb() and update the retransmit
 * accounting of the socket.
 *
 * On success the skb is marked TCPCB_RETRANS and retrans_out grows by its
 * segment count. Regardless of the outcome, retrans_stamp is set if it was
 * still zero and undo_retrans (clamped to >= 0 first) grows by the segment
 * count. Returns the result of __tcp_retransmit_skb().
 */
int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int err = __tcp_retransmit_skb(sk, skb, segs);

	if (err == 0) {
#if FASTRETRANS_DEBUG > 0
		if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
			net_dbg_ratelimited("retrans_out leaked\n");
		}
#endif
		TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS;
		tp->retrans_out += tcp_skb_pcount(skb);
	}

	/* Save stamp of the first (attempted) retransmit. */
	if (!tp->retrans_stamp)
		tp->retrans_stamp = tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb);

	if (tp->undo_retrans < 0)
		tp->undo_retrans = 0;
	tp->undo_retrans += tcp_skb_pcount(skb);
	return err;
}

/* This gets called after a retransmit timeout, and the initially
 * retransmitted data is acknowledged. It tries to continue
 * resending the rest of the retransmit queue, until either
 * we've sent it all or the congestion window limit is reached.
*/
void tcp_xmit_retransmit_queue(struct sock *sk)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	struct sk_buff *skb, *head, *first_hole = NULL;
	struct tcp_sock *tp = tcp_sk(sk);
	bool head_resent = false;
	u32 tso_cap;
	int mib_idx;

	/* Nothing in flight means nothing to retransmit. */
	if (!tp->packets_out)
		return;

	/* Resume at the cached hint when there is one, otherwise at the
	 * head of the retransmit queue.
	 */
	head = tcp_rtx_queue_head(sk);
	skb = tp->retransmit_skb_hint;
	if (!skb)
		skb = head;
	tso_cap = tcp_tso_segs(sk, tcp_current_mss(sk));

	skb_rbtree_walk_from(skb) {
		__u8 flags;
		int budget;

		if (tcp_pacing_check(sk))
			break;

		/* Until a hole has been seen the hint simply follows the
		 * walk (we could do better than to assign each time).
		 */
		if (!first_hole)
			tp->retransmit_skb_hint = skb;

		/* Room left in the congestion window, in segments. */
		budget = tcp_snd_cwnd(tp) - tcp_packets_in_flight(tp);
		if (budget <= 0)
			break;

		flags = TCP_SKB_CB(skb)->sacked;

		/* tcp_shift_skb_data() may have aggregated large skbs, so
		 * cap the budget to avoid sending too big TSO packets.
		 */
		budget = min_t(int, budget, tso_cap);

		/* Every lost segment has already been retransmitted. */
		if (tp->retrans_out >= tp->lost_out)
			break;

		/* Not marked lost: remember the first skb that is neither
		 * retransmitted nor SACKed as a hole, then move on.
		 */
		if (!(flags & TCPCB_LOST)) {
			if (!first_hole &&
			    !(flags & (TCPCB_SACKED_RETRANS|TCPCB_SACKED_ACKED)))
				first_hole = skb;
			continue;
		}

		/* Pick the counter that matches the congestion state. */
		mib_idx = icsk->icsk_ca_state == TCP_CA_Loss ?
				LINUX_MIB_TCPSLOWSTARTRETRANS :
				LINUX_MIB_TCPFASTRETRANS;

		/* Lost, but already SACKed or retransmitted: skip. */
		if (flags & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))
			continue;

		if (tcp_small_queue_check(sk, skb, 1))
			break;

		if (tcp_retransmit_skb(sk, skb, budget))
			break;

		NET_ADD_STATS(sock_net(sk), mib_idx, tcp_skb_pcount(skb));

		if (tcp_in_cwnd_reduction(sk))
			tp->prr_out += tcp_skb_pcount(skb);

		/* Resending the queue head calls for a fresh RTO timer,
		 * unless a reordering timeout is the pending timer.
		 */
		if (skb == head &&
		    icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT)
			head_resent = true;
	}

	if (!head_resent)
		return;

	tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
			     inet_csk(sk)->icsk_rto, true);
}

/* We allow to exceed memory limits for FIN packets to expedite
 * connection tear down and (memory) recovery.
 * Otherwise tcp_send_fin() could be tempted to either delay FIN
 * or even be forced to close flow without any FIN.
* In general, we want to allow one skb per socket to avoid hangs
 * with edge trigger epoll()
 */
void sk_forced_mem_schedule(struct sock *sk, int size)
{
	int delta, amt;

	/* Only the part of @size not already covered by the socket's
	 * forward allocation needs to be scheduled.
	 */
	delta = size - sk->sk_forward_alloc;
	if (delta <= 0)
		return;
	amt = sk_mem_pages(delta);
	sk_forward_alloc_add(sk, amt << PAGE_SHIFT);
	sk_memory_allocated_add(sk, amt);

	/* The memcg charge is made with __GFP_NOFAIL. */
	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
		mem_cgroup_charge_skmem(sk->sk_memcg, amt,
					gfp_memcg_charge() | __GFP_NOFAIL);
}

/* Send a FIN. The caller locks the socket for us.
 * We should try to send a FIN packet really hard, but eventually give up.
 */
void tcp_send_fin(struct sock *sk)
{
	struct sk_buff *skb, *tskb, *tail = tcp_write_queue_tail(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	/* Optimization, tack on the FIN if we have one skb in write queue and
	 * this skb was not yet sent, or we are under memory pressure.
	 * Note: in the latter case, FIN packet will be sent after a timeout,
	 * as TCP stack thinks it has already been transmitted.
	 */
	tskb = tail;
	if (!tskb && tcp_under_memory_pressure(sk))
		tskb = skb_rb_last(&sk->tcp_rtx_queue);

	if (tskb) {
		/* Piggy-back the FIN: it consumes one sequence number. */
		TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN;
		TCP_SKB_CB(tskb)->end_seq++;
		tp->write_seq++;
		if (!tail) {
			/* This means tskb was already sent.
			 * Pretend we included the FIN on previous transmit.
			 * We need to set tp->snd_nxt to the value it would have
			 * if FIN had been sent. This is because retransmit path
			 * does not change tp->snd_nxt.
			 */
			WRITE_ONCE(tp->snd_nxt, tp->snd_nxt + 1);
			return;
		}
	} else {
		/* No skb to piggy-back on: build a dedicated FIN skb. An
		 * allocation failure silently gives up.
		 */
		skb = alloc_skb_fclone(MAX_TCP_HEADER,
				       sk_gfp_mask(sk, GFP_ATOMIC |
						       __GFP_NOWARN));
		if (unlikely(!skb))
			return;

		INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
		skb_reserve(skb, MAX_TCP_HEADER);
		sk_forced_mem_schedule(sk, skb->truesize);
		/* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
		tcp_init_nondata_skb(skb, tp->write_seq,
				     TCPHDR_ACK | TCPHDR_FIN);
		tcp_queue_skb(sk, skb);
	}
	__tcp_push_pending_frames(sk, tcp_current_mss(sk), TCP_NAGLE_OFF);
}

/* We get here when a process closes a file descriptor (either due to
 * an explicit close() or as a byproduct of exit()'ing) and there
 * was unread data in the receive queue.  This behavior is recommended
 * by RFC 2525, section 2.17.  -DaveM
 *
 * @priority is the allocation mask used for the RST skb and its
 * transmission; @reason is only handed to the tracepoint.
 * LINUX_MIB_TCPABORTFAILED is bumped when allocation or transmit fails.
 */
void tcp_send_active_reset(struct sock *sk, gfp_t priority,
			   enum sk_rst_reason reason)
{
	struct sk_buff *skb;

	TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTRSTS);

	/* NOTE: No TCP options attached and we never retransmit this. */
	skb = alloc_skb(MAX_TCP_HEADER, priority);
	if (!skb) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);
		return;
	}

	/* Reserve space for headers and prepare control bits. */
	skb_reserve(skb, MAX_TCP_HEADER);
	tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk),
			     TCPHDR_ACK | TCPHDR_RST);
	tcp_mstamp_refresh(tcp_sk(sk));
	/* Send it off. */
	if (tcp_transmit_skb(sk, skb, 0, priority))
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);

	/* skb of trace_tcp_send_reset() keeps the skb that caused RST,
	 * skb here is different to the troublesome skb, so use NULL
	 */
	trace_tcp_send_reset(sk, NULL, reason);
}

/* Send a crossed SYN-ACK during socket establishment.
 * WARNING: This routine must only be called when we have already sent
 * a SYN packet that crossed the incoming SYN that caused this routine
 * to get called. If this assumption fails then the initial rcv_wnd
 * and rcv_wscale values will not be correct.
*/ int tcp_send_synack(struct sock *sk) { struct sk_buff *skb; skb = tcp_rtx_queue_head(sk); if (!skb || !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { pr_err("%s: wrong queue state\n", __func__); return -EFAULT; } if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) { if (skb_cloned(skb)) { struct sk_buff *nskb; tcp_skb_tsorted_save(skb) { nskb = skb_copy(skb, GFP_ATOMIC); } tcp_skb_tsorted_restore(skb); if (!nskb) return -ENOMEM; INIT_LIST_HEAD(&nskb->tcp_tsorted_anchor); tcp_highest_sack_replace(sk, skb, nskb); tcp_rtx_queue_unlink_and_free(skb, sk); __skb_header_release(nskb); tcp_rbtree_insert(&sk->tcp_rtx_queue, nskb); sk_wmem_queued_add(sk, nskb->truesize); sk_mem_charge(sk, nskb->truesize); skb = nskb; } TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ACK; tcp_ecn_send_synack(sk, skb); } return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); } /** * tcp_make_synack - Allocate one skb and build a SYNACK packet. * @sk: listener socket * @dst: dst entry attached to the SYNACK. It is consumed and caller * should not use it again. * @req: request_sock pointer * @foc: cookie for tcp fast open * @synack_type: Type of synack to prepare * @syn_skb: SYN packet just received. It could be NULL for rtx case. */ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, struct request_sock *req, struct tcp_fastopen_cookie *foc, enum tcp_synack_type synack_type, struct sk_buff *syn_skb) { struct inet_request_sock *ireq = inet_rsk(req); const struct tcp_sock *tp = tcp_sk(sk); struct tcp_out_options opts; struct tcp_key key = {}; struct sk_buff *skb; int tcp_header_size; struct tcphdr *th; int mss; u64 now; skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); if (unlikely(!skb)) { dst_release(dst); return NULL; } /* Reserve space for headers. */ skb_reserve(skb, MAX_TCP_HEADER); switch (synack_type) { case TCP_SYNACK_NORMAL: skb_set_owner_edemux(skb, req_to_sk(req)); break; case TCP_SYNACK_COOKIE: /* Under synflood, we do not attach skb to a socket, * to avoid false sharing. 
*/ break; case TCP_SYNACK_FASTOPEN: /* sk is a const pointer, because we want to express multiple * cpu might call us concurrently. * sk->sk_wmem_alloc in an atomic, we can promote to rw. */ skb_set_owner_w(skb, (struct sock *)sk); break; } skb_dst_set(skb, dst); mss = tcp_mss_clamp(tp, dst_metric_advmss(dst)); memset(&opts, 0, sizeof(opts)); now = tcp_clock_ns(); #ifdef CONFIG_SYN_COOKIES if (unlikely(synack_type == TCP_SYNACK_COOKIE && ireq->tstamp_ok)) skb_set_delivery_time(skb, cookie_init_timestamp(req, now), SKB_CLOCK_MONOTONIC); else #endif { skb_set_delivery_time(skb, now, SKB_CLOCK_MONOTONIC); if (!tcp_rsk(req)->snt_synack) /* Timestamp first SYNACK */ tcp_rsk(req)->snt_synack = tcp_skb_timestamp_us(skb); } #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) rcu_read_lock(); #endif if (tcp_rsk_used_ao(req)) { #ifdef CONFIG_TCP_AO struct tcp_ao_key *ao_key = NULL; u8 keyid = tcp_rsk(req)->ao_keyid; u8 rnext = tcp_rsk(req)->ao_rcv_next; ao_key = tcp_sk(sk)->af_specific->ao_lookup(sk, req_to_sk(req), keyid, -1); /* If there is no matching key - avoid sending anything, * especially usigned segments. It could try harder and lookup * for another peer-matching key, but the peer has requested * ao_keyid (RFC5925 RNextKeyID), so let's keep it simple here. 
*/ if (unlikely(!ao_key)) { trace_tcp_ao_synack_no_key(sk, keyid, rnext); rcu_read_unlock(); kfree_skb(skb); net_warn_ratelimited("TCP-AO: the keyid %u from SYN packet is not present - not sending SYNACK\n", keyid); return NULL; } key.ao_key = ao_key; key.type = TCP_KEY_AO; #endif } else { #ifdef CONFIG_TCP_MD5SIG key.md5_key = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req)); if (key.md5_key) key.type = TCP_KEY_MD5; #endif } skb_set_hash(skb, READ_ONCE(tcp_rsk(req)->txhash), PKT_HASH_TYPE_L4); /* bpf program will be interested in the tcp_flags */ TCP_SKB_CB(skb)->tcp_flags = TCPHDR_SYN | TCPHDR_ACK; tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, &key, foc, synack_type, syn_skb) + sizeof(*th); skb_push(skb, tcp_header_size); skb_reset_transport_header(skb); th = (struct tcphdr *)skb->data; memset(th, 0, sizeof(struct tcphdr)); th->syn = 1; th->ack = 1; tcp_ecn_make_synack(req, th); th->source = htons(ireq->ir_num); th->dest = ireq->ir_rmt_port; skb->mark = ireq->ir_mark; skb->ip_summed = CHECKSUM_PARTIAL; th->seq = htonl(tcp_rsk(req)->snt_isn); /* XXX data is queued and acked as is. No buffer/window check */ th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt); /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. 
*/ th->window = htons(min(req->rsk_rcv_wnd, 65535U)); tcp_options_write(th, NULL, tcp_rsk(req), &opts, &key); th->doff = (tcp_header_size >> 2); TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS); /* Okay, we have all we need - do the md5 hash if needed */ if (tcp_key_is_md5(&key)) { #ifdef CONFIG_TCP_MD5SIG tcp_rsk(req)->af_specific->calc_md5_hash(opts.hash_location, key.md5_key, req_to_sk(req), skb); #endif } else if (tcp_key_is_ao(&key)) { #ifdef CONFIG_TCP_AO tcp_rsk(req)->af_specific->ao_synack_hash(opts.hash_location, key.ao_key, req, skb, opts.hash_location - (u8 *)th, 0); #endif } #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) rcu_read_unlock(); #endif bpf_skops_write_hdr_opt((struct sock *)sk, skb, req, syn_skb, synack_type, &opts); skb_set_delivery_time(skb, now, SKB_CLOCK_MONOTONIC); tcp_add_tx_delay(skb, tp); return skb; } EXPORT_IPV6_MOD(tcp_make_synack); static void tcp_ca_dst_init(struct sock *sk, const struct dst_entry *dst) { struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_congestion_ops *ca; u32 ca_key = dst_metric(dst, RTAX_CC_ALGO); if (ca_key == TCP_CA_UNSPEC) return; rcu_read_lock(); ca = tcp_ca_find_key(ca_key); if (likely(ca && bpf_try_module_get(ca, ca->owner))) { bpf_module_put(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner); icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst); icsk->icsk_ca_ops = ca; } rcu_read_unlock(); } /* Do all connect socket setups that can be done AF independent. */ static void tcp_connect_init(struct sock *sk) { const struct dst_entry *dst = __sk_dst_get(sk); struct tcp_sock *tp = tcp_sk(sk); __u8 rcv_wscale; u32 rcv_wnd; /* We'll fix this up when we get a response from the other end. * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT. 
*/ tp->tcp_header_len = sizeof(struct tcphdr); if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_timestamps)) tp->tcp_header_len += TCPOLEN_TSTAMP_ALIGNED; tcp_ao_connect_init(sk); /* If user gave his TCP_MAXSEG, record it to clamp */ if (tp->rx_opt.user_mss) tp->rx_opt.mss_clamp = tp->rx_opt.user_mss; tp->max_window = 0; tcp_mtup_init(sk); tcp_sync_mss(sk, dst_mtu(dst)); tcp_ca_dst_init(sk, dst); if (!tp->window_clamp) WRITE_ONCE(tp->window_clamp, dst_metric(dst, RTAX_WINDOW)); tp->advmss = tcp_mss_clamp(tp, dst_metric_advmss(dst)); tcp_initialize_rcv_mss(sk); /* limit the window selection if the user enforce a smaller rx buffer */ if (sk->sk_userlocks & SOCK_RCVBUF_LOCK && (tp->window_clamp > tcp_full_space(sk) || tp->window_clamp == 0)) WRITE_ONCE(tp->window_clamp, tcp_full_space(sk)); rcv_wnd = tcp_rwnd_init_bpf(sk); if (rcv_wnd == 0) rcv_wnd = dst_metric(dst, RTAX_INITRWND); tcp_select_initial_window(sk, tcp_full_space(sk), tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0), &tp->rcv_wnd, &tp->window_clamp, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_window_scaling), &rcv_wscale, rcv_wnd); tp->rx_opt.rcv_wscale = rcv_wscale; tp->rcv_ssthresh = tp->rcv_wnd; WRITE_ONCE(sk->sk_err, 0); sock_reset_flag(sk, SOCK_DONE); tp->snd_wnd = 0; tcp_init_wl(tp, 0); tcp_write_queue_purge(sk); tp->snd_una = tp->write_seq; tp->snd_sml = tp->write_seq; tp->snd_up = tp->write_seq; WRITE_ONCE(tp->snd_nxt, tp->write_seq); if (likely(!tp->repair)) tp->rcv_nxt = 0; else tp->rcv_tstamp = tcp_jiffies32; tp->rcv_wup = tp->rcv_nxt; WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); inet_csk(sk)->icsk_rto = tcp_timeout_init(sk); inet_csk(sk)->icsk_retransmits = 0; tcp_clear_retrans(tp); } static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); tcb->end_seq += skb->len; __skb_header_release(skb); sk_wmem_queued_add(sk, skb->truesize); sk_mem_charge(sk, skb->truesize); 
WRITE_ONCE(tp->write_seq, tcb->end_seq); tp->packets_out += tcp_skb_pcount(skb); } /* Build and send a SYN with data and (cached) Fast Open cookie. However, * queue a data-only packet after the regular SYN, such that regular SYNs * are retransmitted on timeouts. Also if the remote SYN-ACK acknowledges * only the SYN sequence, the data are retransmitted in the first ACK. * If cookie is not cached or other error occurs, falls back to send a * regular SYN with Fast Open cookie request option. */ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct tcp_fastopen_request *fo = tp->fastopen_req; struct page_frag *pfrag = sk_page_frag(sk); struct sk_buff *syn_data; int space, err = 0; tp->rx_opt.mss_clamp = tp->advmss; /* If MSS is not cached */ if (!tcp_fastopen_cookie_check(sk, &tp->rx_opt.mss_clamp, &fo->cookie)) goto fallback; /* MSS for SYN-data is based on cached MSS and bounded by PMTU and * user-MSS. Reserve maximum option space for middleboxes that add * private TCP options. 
The cost is reduced data space in SYN :( */ tp->rx_opt.mss_clamp = tcp_mss_clamp(tp, tp->rx_opt.mss_clamp); /* Sync mss_cache after updating the mss_clamp */ tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); space = __tcp_mtu_to_mss(sk, icsk->icsk_pmtu_cookie) - MAX_TCP_OPTION_SPACE; space = min_t(size_t, space, fo->size); if (space && !skb_page_frag_refill(min_t(size_t, space, PAGE_SIZE), pfrag, sk->sk_allocation)) goto fallback; syn_data = tcp_stream_alloc_skb(sk, sk->sk_allocation, false); if (!syn_data) goto fallback; memcpy(syn_data->cb, syn->cb, sizeof(syn->cb)); if (space) { space = min_t(size_t, space, pfrag->size - pfrag->offset); space = tcp_wmem_schedule(sk, space); } if (space) { space = copy_page_from_iter(pfrag->page, pfrag->offset, space, &fo->data->msg_iter); if (unlikely(!space)) { tcp_skb_tsorted_anchor_cleanup(syn_data); kfree_skb(syn_data); goto fallback; } skb_fill_page_desc(syn_data, 0, pfrag->page, pfrag->offset, space); page_ref_inc(pfrag->page); pfrag->offset += space; skb_len_add(syn_data, space); skb_zcopy_set(syn_data, fo->uarg, NULL); } /* No more data pending in inet_wait_for_connect() */ if (space == fo->size) fo->data = NULL; fo->copied = space; tcp_connect_queue_skb(sk, syn_data); if (syn_data->len) tcp_chrono_start(sk, TCP_CHRONO_BUSY); err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation); skb_set_delivery_time(syn, syn_data->skb_mstamp_ns, SKB_CLOCK_MONOTONIC); /* Now full SYN+DATA was cloned and sent (or not), * remove the SYN from the original skb (syn_data) * we keep in write queue in case of a retransmit, as we * also have the SYN packet (with no data) in the same queue. 
*/
	TCP_SKB_CB(syn_data)->seq++;
	TCP_SKB_CB(syn_data)->tcp_flags = TCPHDR_ACK | TCPHDR_PSH;
	if (!err) {
		tp->syn_data = (fo->copied > 0);
		tcp_rbtree_insert(&sk->tcp_rtx_queue, syn_data);
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT);
		goto done;
	}

	/* data was not sent, put it in write_queue */
	__skb_queue_tail(&sk->sk_write_queue, syn_data);
	tp->packets_out -= tcp_skb_pcount(syn_data);

fallback:
	/* Send a regular SYN with Fast Open cookie request option */
	if (fo->cookie.len > 0)
		fo->cookie.len = 0;
	err = tcp_transmit_skb(sk, syn, 1, sk->sk_allocation);
	if (err)
		tp->syn_fastopen = 0;
done:
	fo->cookie.len = -1;  /* Exclude Fast Open option for SYN retries */
	return err;
}

/* Build a SYN and send it off.
 *
 * Returns 0 once the SYN has been queued and the retransmit timer armed
 * (also in repair mode, where no SYN is sent), -EKEYREJECTED on an
 * MD5/TCP-AO key conflict or missing AO key, -EHOSTUNREACH when
 * rebuild_header() fails, -ENOBUFS when the SYN skb cannot be allocated,
 * and -ECONNREFUSED when the transmit path reports it. Any other transmit
 * error is not propagated; the function still arms the timer and returns 0.
 */
int tcp_connect(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *buff;
	int err;

	tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_CONNECT_CB, 0, NULL);

#if defined(CONFIG_TCP_MD5SIG) && defined(CONFIG_TCP_AO)
	/* Has to be checked late, after setting daddr/saddr/ops.
	 * Return error if the peer has both a md5 and a tcp-ao key
	 * configured as this is ambiguous.
	 */
	if (unlikely(rcu_dereference_protected(tp->md5sig_info,
					       lockdep_sock_is_held(sk)))) {
		bool needs_ao = !!tp->af_specific->ao_lookup(sk, sk, -1, -1);
		bool needs_md5 = !!tp->af_specific->md5_lookup(sk, sk);
		struct tcp_ao_info *ao_info;

		ao_info = rcu_dereference_check(tp->ao_info,
						lockdep_sock_is_held(sk));
		if (ao_info) {
			/* This is an extra check: tcp_ao_required() in
			 * tcp_v{4,6}_parse_md5_keys() should prevent adding
			 * md5 keys on ao_required socket.
			 */
			needs_ao |= ao_info->ao_required;
			WARN_ON_ONCE(ao_info->ao_required && needs_md5);
		}
		if (needs_md5 && needs_ao)
			return -EKEYREJECTED;

		/* If we have a matching md5 key and no matching tcp-ao key
		 * then free up ao_info if allocated.
		 */
		if (needs_md5) {
			tcp_ao_destroy_sock(sk, false);
		} else if (needs_ao) {
			tcp_clear_md5_list(sk);
			kfree(rcu_replace_pointer(tp->md5sig_info, NULL,
						  lockdep_sock_is_held(sk)));
		}
	}
#endif
#ifdef CONFIG_TCP_AO
	if (unlikely(rcu_dereference_protected(tp->ao_info,
					       lockdep_sock_is_held(sk)))) {
		/* Don't allow connecting if ao is configured but no
		 * matching key is found.
		 */
		if (!tp->af_specific->ao_lookup(sk, sk, -1, -1))
			return -EKEYREJECTED;
	}
#endif
	if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
		return -EHOSTUNREACH; /* Routing failure or similar. */

	tcp_connect_init(sk);

	/* Repair mode: complete the connection without emitting a SYN. */
	if (unlikely(tp->repair)) {
		tcp_finish_connect(sk, NULL);
		return 0;
	}

	buff = tcp_stream_alloc_skb(sk, sk->sk_allocation, true);
	if (unlikely(!buff))
		return -ENOBUFS;

	/* SYN eats a sequence byte, write_seq updated by
	 * tcp_connect_queue_skb().
	 */
	tcp_init_nondata_skb(buff, tp->write_seq, TCPHDR_SYN);
	tcp_mstamp_refresh(tp);
	tp->retrans_stamp = tcp_time_stamp_ts(tp);
	tcp_connect_queue_skb(sk, buff);
	tcp_ecn_send_syn(sk, buff);
	tcp_rbtree_insert(&sk->tcp_rtx_queue, buff);

	/* Send off SYN; include data in Fast Open. */
	err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) :
	      tcp_transmit_skb(sk, buff, 1, sk->sk_allocation);
	if (err == -ECONNREFUSED)
		return err;

	/* We change tp->snd_nxt after the tcp_transmit_skb() call
	 * in order to make this packet get counted in tcpOutSegs.
	 */
	WRITE_ONCE(tp->snd_nxt, tp->write_seq);
	tp->pushed_seq = tp->write_seq;
	/* If something is still waiting at the send head, snd_nxt and
	 * pushed_seq are pulled back to the start of that skb.
	 */
	buff = tcp_send_head(sk);
	if (unlikely(buff)) {
		WRITE_ONCE(tp->snd_nxt, TCP_SKB_CB(buff)->seq);
		tp->pushed_seq	= TCP_SKB_CB(buff)->seq;
	}
	TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS);

	/* Timer for repeating the SYN until an answer. */
	tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
			     inet_csk(sk)->icsk_rto, false);
	return 0;
}
EXPORT_SYMBOL(tcp_connect);

/* Upper bound for the delayed-ACK timeout: the smaller of the socket's
 * icsk_delack_max and (max(tcp_rto_min(sk), 2) - 1).
 */
u32 tcp_delack_max(const struct sock *sk)
{
	u32 delack_from_rto_min = max(tcp_rto_min(sk), 2) - 1;

	return min(READ_ONCE(inet_csk(sk)->icsk_delack_max), delack_from_rto_min);
}

/* Send out a delayed ack, the caller does the policy checking
 * to see if we should even be here.  See tcp_input.c:tcp_ack_snd_check()
 * for details.
 */
void tcp_send_delayed_ack(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	int ato = icsk->icsk_ack.ato;
	unsigned long timeout;

	if (ato > TCP_DELACK_MIN) {
		const struct tcp_sock *tp = tcp_sk(sk);
		int max_ato = HZ / 2;

		if (inet_csk_in_pingpong_mode(sk) ||
		    (icsk->icsk_ack.pending & ICSK_ACK_PUSHED))
			max_ato = TCP_DELACK_MAX;

		/* Slow path, intersegment interval is "high". */

		/* If some rtt estimate is known, use it to bound delayed ack.
		 * Do not use inet_csk(sk)->icsk_rto here, use results of rtt measurements
		 * directly.
		 */
		if (tp->srtt_us) {
			int rtt = max_t(int, usecs_to_jiffies(tp->srtt_us >> 3),
					TCP_DELACK_MIN);

			if (rtt < max_ato)
				max_ato = rtt;
		}

		ato = min(ato, max_ato);
	}

	ato = min_t(u32, ato, tcp_delack_max(sk));

	/* Stay within the limit we were given */
	timeout = jiffies + ato;

	/* Use new timeout only if there wasn't an older one earlier. */
	if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) {
		/* If delack timer is about to expire, send ACK now. */
		if (time_before_eq(icsk_delack_timeout(icsk), jiffies + (ato >> 2))) {
			tcp_send_ack(sk);
			return;
		}

		/* Keep the already-armed expiry when it is not later than the new one. */
		if (!time_before(timeout, icsk_delack_timeout(icsk)))
			timeout = icsk_delack_timeout(icsk);
	}
	smp_store_release(&icsk->icsk_ack.pending,
			  icsk->icsk_ack.pending | ICSK_ACK_SCHED | ICSK_ACK_TIMER);
	sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout);
}

/* This routine sends an ack and also updates the window.
 *
 * Does nothing when the socket is in TCP_CLOSE. When the skb allocation
 * fails, no ACK is sent; instead the delayed-ACK timer is re-armed with a
 * delay of TCP_DELACK_MAX << icsk_ack.retry.
 */
void __tcp_send_ack(struct sock *sk, u32 rcv_nxt, u16 flags)
{
	struct sk_buff *buff;

	/* If we have been reset, we may not send again. */
	if (sk->sk_state == TCP_CLOSE)
		return;

	/* We are not putting this on the write queue, so
	 * tcp_transmit_skb() will set the ownership to this
	 * sock.
	 */
	buff = alloc_skb(MAX_TCP_HEADER,
			 sk_gfp_mask(sk, GFP_ATOMIC | __GFP_NOWARN));
	if (unlikely(!buff)) {
		struct inet_connection_sock *icsk = inet_csk(sk);
		unsigned long delay;

		/* The retry counter only grows while the delay is below tcp_rto_max(). */
		delay = TCP_DELACK_MAX << icsk->icsk_ack.retry;
		if (delay < tcp_rto_max(sk))
			icsk->icsk_ack.retry++;
		inet_csk_schedule_ack(sk);
		icsk->icsk_ack.ato = TCP_ATO_MIN;
		tcp_reset_xmit_timer(sk, ICSK_TIME_DACK, delay, false);
		return;
	}

	/* Reserve space for headers and prepare control bits. */
	skb_reserve(buff, MAX_TCP_HEADER);
	tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK | flags);

	/* We do not want pure acks influencing TCP Small Queues or fq/pacing
	 * too much.
	 * SKB_TRUESIZE(max(1 .. 66, MAX_TCP_HEADER)) is unfortunately ~784
	 */
	skb_set_tcp_pure_ack(buff);

	/* Send it off, this clears delayed acks for us. */
	__tcp_transmit_skb(sk, buff, 0, (__force gfp_t)0, rcv_nxt);
}
EXPORT_SYMBOL_GPL(__tcp_send_ack);

/* Send a plain ACK for the socket's current rcv_nxt with no extra flags. */
void tcp_send_ack(struct sock *sk)
{
	__tcp_send_ack(sk, tcp_sk(sk)->rcv_nxt, 0);
}

/* This routine sends a packet with an out of date sequence
 * number. It assumes the other end will try to ack it.
 *
 * Question: what should we make while urgent mode?
 * 4.4BSD forces sending single byte of data. We cannot send
 * out of window data, because we have SND.NXT==SND.MAX...
 *
 * Current solution: to send TWO zero-length segments in urgent mode:
 * one is with SEG.SEQ=SND.UNA to deliver urgent pointer, another is
 * out-of-date with SND.UNA-1 to probe window.
 *
 * Returns -1 when the skb cannot be allocated, otherwise the result of
 * tcp_transmit_skb().
 */
static int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;

	/* We don't queue it, tcp_transmit_skb() sets ownership. */
	skb = alloc_skb(MAX_TCP_HEADER,
			sk_gfp_mask(sk, GFP_ATOMIC | __GFP_NOWARN));
	if (!skb)
		return -1;

	/* Reserve space for headers and set control bits. */
	skb_reserve(skb, MAX_TCP_HEADER);
	/* Use a previous sequence.  This should cause the other
	 * end to send an ack.  Don't queue or clone SKB, just
	 * send it.
	 */
	tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK);
	NET_INC_STATS(sock_net(sk), mib);
	return tcp_transmit_skb(sk, skb, 0, (__force gfp_t)0);
}

/* Called from setsockopt( ... TCP_REPAIR ) */
void tcp_send_window_probe(struct sock *sk)
{
	if (sk->sk_state == TCP_ESTABLISHED) {
		tcp_sk(sk)->snd_wl1 = tcp_sk(sk)->rcv_nxt - 1;
		tcp_mstamp_refresh(tcp_sk(sk));
		tcp_xmit_probe_skb(sk, 0, LINUX_MIB_TCPWINPROBE);
	}
}

/* Initiate keepalive or window probe from timer.
 *
 * If the skb at the send head starts inside the send window, (part of) it
 * is transmitted as the probe; otherwise zero-length probe segments are
 * sent via tcp_xmit_probe_skb(). Returns -1 for a closed socket or when
 * fragmenting fails, otherwise the transmit result.
 */
int tcp_write_wakeup(struct sock *sk, int mib)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;

	if (sk->sk_state == TCP_CLOSE)
		return -1;

	skb = tcp_send_head(sk);
	if (skb && before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) {
		int err;
		unsigned int mss = tcp_current_mss(sk);
		unsigned int seg_size = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;

		if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
			tp->pushed_seq = TCP_SKB_CB(skb)->end_seq;

		/* We are probing the opening of a window
		 * but the window size is != 0
		 * must have been a result SWS avoidance ( sender )
		 */
		if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq ||
		    skb->len > mss) {
			seg_size = min(seg_size, mss);
			TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
			if (tcp_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE,
					 skb, seg_size, mss, GFP_ATOMIC))
				return -1;
		} else if (!tcp_skb_pcount(skb))
			tcp_set_skb_tso_segs(skb, mss);

		TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
		err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
		if (!err)
			tcp_event_new_data_sent(sk, skb);
		return err;
	} else {
		/* Urgent pointer within 0xFFFF of snd_una: send the extra
		 * SND.UNA segment first, then the out-of-date probe.
		 */
		if (between(tp->snd_up, tp->snd_una + 1, tp->snd_una + 0xFFFF))
			tcp_xmit_probe_skb(sk, 1, mib);
		return tcp_xmit_probe_skb(sk, 0, mib);
	}
}

/* A window probe timeout has occurred.  If window is not closed send
 * a partial packet else a zero probe.
*/ void tcp_send_probe0(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); unsigned long timeout; int err; err = tcp_write_wakeup(sk, LINUX_MIB_TCPWINPROBE); if (tp->packets_out || tcp_write_queue_empty(sk)) { /* Cancel probe timer, if it is not required. */ icsk->icsk_probes_out = 0; icsk->icsk_backoff = 0; icsk->icsk_probes_tstamp = 0; return; } icsk->icsk_probes_out++; if (err <= 0) { if (icsk->icsk_backoff < READ_ONCE(net->ipv4.sysctl_tcp_retries2)) icsk->icsk_backoff++; timeout = tcp_probe0_when(sk, tcp_rto_max(sk)); } else { /* If packet was not sent due to local congestion, * Let senders fight for local resources conservatively. */ timeout = TCP_RESOURCE_PROBE_INTERVAL; } timeout = tcp_clamp_probe0_to_user_timeout(sk, timeout); tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, timeout, true); } int tcp_rtx_synack(const struct sock *sk, struct request_sock *req) { const struct tcp_request_sock_ops *af_ops = tcp_rsk(req)->af_specific; struct flowi fl; int res; /* Paired with WRITE_ONCE() in sock_setsockopt() */ if (READ_ONCE(sk->sk_txrehash) == SOCK_TXREHASH_ENABLED) WRITE_ONCE(tcp_rsk(req)->txhash, net_tx_rndhash()); res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL, NULL); if (!res) { TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); if (unlikely(tcp_passive_fastopen(sk))) { /* sk has const attribute because listeners are lockless. * However in this case, we are dealing with a passive fastopen * socket thus we can change total_retrans value. */ tcp_sk_rw(sk)->total_retrans++; } trace_tcp_retransmit_synack(sk, req); } return res; } EXPORT_IPV6_MOD(tcp_rtx_synack); |
5 15 187 388 388 638 10 67 589 274 13 53 612 290 407 15 507 507 1012 19 655 506 527 8 12 494 647 502 769 248 251 520 12 27 32 39 658 562 112 289 659 401 759 141 129 12 4 9 133 2 86 103 103 104 15 57 804 794 790 116 114 20 36 36 1 35 35 1 80 79 80 1 80 79 80 57 57 57 57 472 259 277 68 68 3 3 79 62 133 133 33 395 33 425 233 234 79 8 48 106 74 3 107 1 107 110 73 73 47 4 72 14 13 2 13 13 14 13 551 53 624 86 4 87 86 83 82 86 13 13 8 96 7 9 9 7 8 26 21 21 25 25 11 24 17 11 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 
409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 
909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/seq_file.c * * helper functions for making synthetic files from sequences of records. * initial implementation -- AV, Oct 2001. 
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/cache.h>
#include <linux/fs.h>
#include <linux/export.h>
#include <linux/seq_file.h>
#include <linux/vmalloc.h>
#include <linux/slab.h>
#include <linux/cred.h>
#include <linux/mm.h>
#include <linux/printk.h>
#include <linux/string_helpers.h>
#include <linux/uio.h>
#include <linux/uaccess.h>

#include <asm/page.h>

/* Slab cache that seq_open() allocates struct seq_file from. */
static struct kmem_cache *seq_file_cache __ro_after_init;

/* Put the buffer into the "full" state by making count equal to size. */
static void seq_set_overflow(struct seq_file *m)
{
	m->count = m->size;
}

/*
 * Allocate a @size byte output buffer with kvmalloc(). Requests larger
 * than MAX_RW_COUNT are refused and yield NULL, as does allocation failure.
 */
static void *seq_buf_alloc(unsigned long size)
{
	if (unlikely(size > MAX_RW_COUNT))
		return NULL;

	return kvmalloc(size, GFP_KERNEL_ACCOUNT);
}

/**
 *	seq_open -	initialize sequential file
 *	@file: file we initialize
 *	@op: method table describing the sequence
 *
 *	seq_open() sets @file, associating it with a sequence described
 *	by @op.  @op->start() sets the iterator up and returns the first
 *	element of sequence. @op->stop() shuts it down.  @op->next()
 *	returns the next element of sequence.  @op->show() prints element
 *	into the buffer.  In case of error ->start() and ->next() return
 *	ERR_PTR(error).  In the end of sequence they return %NULL. ->show()
 *	returns 0 in case of success and negative number in case of error.
 *	Returning SEQ_SKIP means "discard this element and move on".
 *	Note: seq_open() will allocate a struct seq_file and store its
 *	pointer in @file->private_data. This pointer should not be modified.
 *
 *	Return: 0 on success, -ENOMEM if the seq_file cannot be allocated.
 */
int seq_open(struct file *file, const struct seq_operations *op)
{
	struct seq_file *p;

	/* private_data is about to be overwritten; warn if it was in use. */
	WARN_ON(file->private_data);

	p = kmem_cache_zalloc(seq_file_cache, GFP_KERNEL);
	if (!p)
		return -ENOMEM;

	file->private_data = p;

	mutex_init(&p->lock);
	p->op = op;

	// No refcounting: the lifetime of 'p' is constrained
	// to the lifetime of the file.
	p->file = file;

	/*
	 * seq_files support lseek() and pread().  They do not implement
	 * write() at all, but we clear FMODE_PWRITE here for historical
	 * reasons.
	 *
	 * If a client of seq_files a) implements file.write() and b) wishes to
	 * support pwrite() then that client will need to implement its own
	 * file.open() which calls seq_open() and then sets FMODE_PWRITE.
	 */
	file->f_mode &= ~FMODE_PWRITE;
	return 0;
}
EXPORT_SYMBOL(seq_open);

/*
 * Re-run the iterator from index 0, discarding output, until the
 * accumulated output length reaches @offset. On success the buffer holds
 * whatever part of the record straddling @offset lies at or after it
 * (m->from / m->count), or nothing if @offset fell on a record boundary.
 *
 * Returns 0 on success, a negative error from ->start()/->show(),
 * -ENOMEM on allocation failure, or -EAGAIN after the buffer overflowed
 * and was reallocated at twice the size; callers loop on -EAGAIN.
 */
static int traverse(struct seq_file *m, loff_t offset)
{
	loff_t pos = 0;
	int error = 0;
	void *p;

	m->index = 0;
	m->count = m->from = 0;
	if (!offset)
		return 0;

	if (!m->buf) {
		m->buf = seq_buf_alloc(m->size = PAGE_SIZE);
		if (!m->buf)
			return -ENOMEM;
	}
	p = m->op->start(m, &m->index);
	while (p) {
		error = PTR_ERR(p);
		if (IS_ERR(p))
			break;
		error = m->op->show(m, p);
		if (error < 0)
			break;
		/* positive return from ->show(): drop this record's output */
		if (unlikely(error)) {
			error = 0;
			m->count = 0;
		}
		if (seq_has_overflowed(m))
			goto Eoverflow;
		p = m->op->next(m, p, &m->index);
		/* this record crosses @offset: keep only the tail past it */
		if (pos + m->count > offset) {
			m->from = offset - pos;
			m->count -= m->from;
			break;
		}
		pos += m->count;
		m->count = 0;
		if (pos == offset)
			break;
	}
	m->op->stop(m, p);
	return error;

Eoverflow:
	/* record did not fit: double the buffer and ask the caller to retry */
	m->op->stop(m, p);
	kvfree(m->buf);
	m->count = 0;
	m->buf = seq_buf_alloc(m->size <<= 1);
	return !m->buf ? -ENOMEM : -EAGAIN;
}

/**
 *	seq_read -	->read() method for sequential files.
*	@file: the file to read from
 *	@buf: the buffer to read to
 *	@size: the maximum number of bytes to read
 *	@ppos: the current position in the file
 *
 *	Ready-made ->f_op->read()
 *
 *	Wraps the user buffer in a single-segment iov_iter and a sync kiocb,
 *	delegates to seq_read_iter(), then writes the resulting position
 *	back to @ppos.
 */
ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
{
	struct iovec iov = { .iov_base = buf, .iov_len = size};
	struct kiocb kiocb;
	struct iov_iter iter;
	ssize_t ret;

	init_sync_kiocb(&kiocb, file);
	iov_iter_init(&iter, ITER_DEST, &iov, 1, size);

	kiocb.ki_pos = *ppos;
	ret = seq_read_iter(&kiocb, &iter);
	*ppos = kiocb.ki_pos;
	return ret;
}
EXPORT_SYMBOL(seq_read);

/*
 * Ready-made ->f_op->read_iter()
 *
 * Runs under m->lock. Returns the number of bytes copied to @iter; when
 * nothing was copied it returns -EFAULT if buffered data remained, or else
 * the last error / 0 recorded in err.
 */
ssize_t seq_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
	struct seq_file *m = iocb->ki_filp->private_data;
	size_t copied = 0;
	size_t n;
	void *p;
	int err = 0;

	if (!iov_iter_count(iter))
		return 0;

	mutex_lock(&m->lock);

	/*
	 * if request is to read from zero offset, reset iterator to first
	 * record as it might have been already advanced by previous requests
	 */
	if (iocb->ki_pos == 0) {
		m->index = 0;
		m->count = 0;
	}

	/* Don't assume ki_pos is where we left it */
	if (unlikely(iocb->ki_pos != m->read_pos)) {
		/* traverse() returns -EAGAIN after growing the buffer; retry */
		while ((err = traverse(m, iocb->ki_pos)) == -EAGAIN)
			;
		if (err) {
			/* With prejudice... */
			m->read_pos = 0;
			m->index = 0;
			m->count = 0;
			goto Done;
		} else {
			m->read_pos = iocb->ki_pos;
		}
	}

	/* grab buffer if we didn't have one */
	if (!m->buf) {
		m->buf = seq_buf_alloc(m->size = PAGE_SIZE);
		if (!m->buf)
			goto Enomem;
	}
	// something left in the buffer - copy it out first
	if (m->count) {
		n = copy_to_iter(m->buf + m->from, m->count, iter);
		m->count -= n;
		m->from += n;
		copied += n;
		if (m->count)	// hadn't managed to copy everything
			goto Done;
	}
	// get a non-empty record in the buffer
	m->from = 0;
	p = m->op->start(m, &m->index);
	while (1) {
		err = PTR_ERR(p);
		if (!p || IS_ERR(p))	// EOF or an error
			break;
		err = m->op->show(m, p);
		if (err < 0)		// hard error
			break;
		if (unlikely(err))	// ->show() says "skip it"
			m->count = 0;
		if (unlikely(!m->count)) { // empty record
			p = m->op->next(m, p, &m->index);
			continue;
		}
		if (!seq_has_overflowed(m)) // got it
			goto Fill;
		// need a bigger buffer
		m->op->stop(m, p);
		kvfree(m->buf);
		m->count = 0;
		m->buf = seq_buf_alloc(m->size <<= 1);
		if (!m->buf)
			goto Enomem;
		p = m->op->start(m, &m->index);
	}
	// EOF or an error
	m->op->stop(m, p);
	m->count = 0;
	goto Done;
Fill:
	// one non-empty record is in the buffer; if they want more,
	// try to fit more in, but in any case we need to advance
	// the iterator once for every record shown.
	while (1) {
		size_t offs = m->count;
		loff_t pos = m->index;

		p = m->op->next(m, p, &m->index);
		if (pos == m->index) {
			pr_info_ratelimited("buggy .next function %ps did not update position index\n",
					    m->op->next);
			m->index++;
		}
		if (!p || IS_ERR(p))	// no next record for us
			break;
		if (m->count >= iov_iter_count(iter))
			break;
		err = m->op->show(m, p);
		if (err > 0) {		// ->show() says "skip it"
			m->count = offs;
		} else if (err || seq_has_overflowed(m)) {
			// roll back to the end of the last complete record
			m->count = offs;
			break;
		}
	}
	m->op->stop(m, p);
	n = copy_to_iter(m->buf, m->count, iter);
	copied += n;
	m->count -= n;
	m->from = n;
Done:
	if (unlikely(!copied)) {
		copied = m->count ? -EFAULT : err;
	} else {
		iocb->ki_pos += copied;
		m->read_pos += copied;
	}
	mutex_unlock(&m->lock);
	return copied;
Enomem:
	err = -ENOMEM;
	goto Done;
}
EXPORT_SYMBOL(seq_read_iter);

/**
 *	seq_lseek -	->llseek() method for sequential files.
 *	@file: the file in question
 *	@offset: new position
 *	@whence: 0 for absolute, 1 for relative position
 *
 *	Ready-made ->f_op->llseek()
 *
 *	Return: the new position, -EINVAL for a negative target or an
 *	unsupported @whence, or the error from traverse() (in which case the
 *	file position and iterator state are reset to zero).
 */
loff_t seq_lseek(struct file *file, loff_t offset, int whence)
{
	struct seq_file *m = file->private_data;
	loff_t retval = -EINVAL;

	mutex_lock(&m->lock);
	switch (whence) {
	case SEEK_CUR:
		offset += file->f_pos;
		fallthrough;
	case SEEK_SET:
		if (offset < 0)
			break;
		retval = offset;
		if (offset != m->read_pos) {
			while ((retval = traverse(m, offset)) == -EAGAIN)
				;
			if (retval) {
				/* with extreme prejudice... */
				file->f_pos = 0;
				m->read_pos = 0;
				m->index = 0;
				m->count = 0;
			} else {
				m->read_pos = offset;
				retval = file->f_pos = offset;
			}
		} else {
			file->f_pos = offset;
		}
	}
	mutex_unlock(&m->lock);
	return retval;
}
EXPORT_SYMBOL(seq_lseek);

/**
 *	seq_release -	free the structures associated with sequential file.
 *	@inode: its inode
 *	@file: file in question
 *
 *	Frees the structures associated with sequential file; can be used
 *	as ->f_op->release() if you don't have private data to destroy.
 *
 *	Return: always 0.
 */
int seq_release(struct inode *inode, struct file *file)
{
	struct seq_file *m = file->private_data;
	kvfree(m->buf);
	kmem_cache_free(seq_file_cache, m);
	return 0;
}
EXPORT_SYMBOL(seq_release);

/**
 * seq_escape_mem - print data into buffer, escaping some characters
 * @m: target buffer
 * @src: source buffer
 * @len: size of source buffer
 * @flags: flags to pass to string_escape_mem()
 * @esc: set of characters that need escaping
 *
 * Puts data into buffer, replacing each occurrence of character from
 * given class (defined by @flags and @esc) with printable escaped sequence.
 *
 * Use seq_has_overflowed() to check for errors.
*/ void seq_escape_mem(struct seq_file *m, const char *src, size_t len, unsigned int flags, const char *esc) { char *buf; size_t size = seq_get_buf(m, &buf); int ret; ret = string_escape_mem(src, len, buf, size, flags, esc); seq_commit(m, ret < size ? ret : -1); } EXPORT_SYMBOL(seq_escape_mem); void seq_vprintf(struct seq_file *m, const char *f, va_list args) { int len; if (m->count < m->size) { len = vsnprintf(m->buf + m->count, m->size - m->count, f, args); if (m->count + len < m->size) { m->count += len; return; } } seq_set_overflow(m); } EXPORT_SYMBOL(seq_vprintf); void seq_printf(struct seq_file *m, const char *f, ...) { va_list args; va_start(args, f); seq_vprintf(m, f, args); va_end(args); } EXPORT_SYMBOL(seq_printf); #ifdef CONFIG_BINARY_PRINTF void seq_bprintf(struct seq_file *m, const char *f, const u32 *binary) { int len; if (m->count < m->size) { len = bstr_printf(m->buf + m->count, m->size - m->count, f, binary); if (m->count + len < m->size) { m->count += len; return; } } seq_set_overflow(m); } EXPORT_SYMBOL(seq_bprintf); #endif /* CONFIG_BINARY_PRINTF */ /** * mangle_path - mangle and copy path to buffer beginning * @s: buffer start * @p: beginning of path in above buffer * @esc: set of characters that need escaping * * Copy the path from @p to @s, replacing each occurrence of character from * @esc with usual octal escape. * Returns pointer past last written character in @s, or NULL in case of * failure. 
*/ char *mangle_path(char *s, const char *p, const char *esc) { while (s <= p) { char c = *p++; if (!c) { return s; } else if (!strchr(esc, c)) { *s++ = c; } else if (s + 4 > p) { break; } else { *s++ = '\\'; *s++ = '0' + ((c & 0300) >> 6); *s++ = '0' + ((c & 070) >> 3); *s++ = '0' + (c & 07); } } return NULL; } EXPORT_SYMBOL(mangle_path); /** * seq_path - seq_file interface to print a pathname * @m: the seq_file handle * @path: the struct path to print * @esc: set of characters to escape in the output * * return the absolute path of 'path', as represented by the * dentry / mnt pair in the path parameter. */ int seq_path(struct seq_file *m, const struct path *path, const char *esc) { char *buf; size_t size = seq_get_buf(m, &buf); int res = -1; if (size) { char *p = d_path(path, buf, size); if (!IS_ERR(p)) { char *end = mangle_path(buf, p, esc); if (end) res = end - buf; } } seq_commit(m, res); return res; } EXPORT_SYMBOL(seq_path); /** * seq_file_path - seq_file interface to print a pathname of a file * @m: the seq_file handle * @file: the struct file to print * @esc: set of characters to escape in the output * * return the absolute path to the file. */ int seq_file_path(struct seq_file *m, struct file *file, const char *esc) { return seq_path(m, &file->f_path, esc); } EXPORT_SYMBOL(seq_file_path); /* * Same as seq_path, but relative to supplied root. */ int seq_path_root(struct seq_file *m, const struct path *path, const struct path *root, const char *esc) { char *buf; size_t size = seq_get_buf(m, &buf); int res = -ENAMETOOLONG; if (size) { char *p; p = __d_path(path, root, buf, size); if (!p) return SEQ_SKIP; res = PTR_ERR(p); if (!IS_ERR(p)) { char *end = mangle_path(buf, p, esc); if (end) res = end - buf; else res = -ENAMETOOLONG; } } seq_commit(m, res); return res < 0 && res != -ENAMETOOLONG ? res : 0; } /* * returns the path of the 'dentry' from the root of its filesystem. 
*/ int seq_dentry(struct seq_file *m, struct dentry *dentry, const char *esc) { char *buf; size_t size = seq_get_buf(m, &buf); int res = -1; if (size) { char *p = dentry_path(dentry, buf, size); if (!IS_ERR(p)) { char *end = mangle_path(buf, p, esc); if (end) res = end - buf; } } seq_commit(m, res); return res; } EXPORT_SYMBOL(seq_dentry); void *single_start(struct seq_file *p, loff_t *pos) { return *pos ? NULL : SEQ_START_TOKEN; } static void *single_next(struct seq_file *p, void *v, loff_t *pos) { ++*pos; return NULL; } static void single_stop(struct seq_file *p, void *v) { } int single_open(struct file *file, int (*show)(struct seq_file *, void *), void *data) { struct seq_operations *op = kmalloc(sizeof(*op), GFP_KERNEL_ACCOUNT); int res = -ENOMEM; if (op) { op->start = single_start; op->next = single_next; op->stop = single_stop; op->show = show; res = seq_open(file, op); if (!res) ((struct seq_file *)file->private_data)->private = data; else kfree(op); } return res; } EXPORT_SYMBOL(single_open); int single_open_size(struct file *file, int (*show)(struct seq_file *, void *), void *data, size_t size) { char *buf = seq_buf_alloc(size); int ret; if (!buf) return -ENOMEM; ret = single_open(file, show, data); if (ret) { kvfree(buf); return ret; } ((struct seq_file *)file->private_data)->buf = buf; ((struct seq_file *)file->private_data)->size = size; return 0; } EXPORT_SYMBOL(single_open_size); int single_release(struct inode *inode, struct file *file) { const struct seq_operations *op = ((struct seq_file *)file->private_data)->op; int res = seq_release(inode, file); kfree(op); return res; } EXPORT_SYMBOL(single_release); int seq_release_private(struct inode *inode, struct file *file) { struct seq_file *seq = file->private_data; kfree(seq->private); seq->private = NULL; return seq_release(inode, file); } EXPORT_SYMBOL(seq_release_private); void *__seq_open_private(struct file *f, const struct seq_operations *ops, int psize) { int rc; void *private; struct seq_file 
*seq; private = kzalloc(psize, GFP_KERNEL_ACCOUNT); if (private == NULL) goto out; rc = seq_open(f, ops); if (rc < 0) goto out_free; seq = f->private_data; seq->private = private; return private; out_free: kfree(private); out: return NULL; } EXPORT_SYMBOL(__seq_open_private); int seq_open_private(struct file *filp, const struct seq_operations *ops, int psize) { return __seq_open_private(filp, ops, psize) ? 0 : -ENOMEM; } EXPORT_SYMBOL(seq_open_private); void seq_putc(struct seq_file *m, char c) { if (m->count >= m->size) return; m->buf[m->count++] = c; } EXPORT_SYMBOL(seq_putc); void __seq_puts(struct seq_file *m, const char *s) { seq_write(m, s, strlen(s)); } EXPORT_SYMBOL(__seq_puts); /** * seq_put_decimal_ull_width - A helper routine for putting decimal numbers * without rich format of printf(). * only 'unsigned long long' is supported. * @m: seq_file identifying the buffer to which data should be written * @delimiter: a string which is printed before the number * @num: the number * @width: a minimum field width * * This routine will put strlen(delimiter) + number into seq_filed. * This routine is very quick when you show lots of numbers. * In usual cases, it will be better to use seq_printf(). It's easier to read. 
*/ void seq_put_decimal_ull_width(struct seq_file *m, const char *delimiter, unsigned long long num, unsigned int width) { int len; if (m->count + 2 >= m->size) /* we'll write 2 bytes at least */ goto overflow; if (delimiter && delimiter[0]) { if (delimiter[1] == 0) seq_putc(m, delimiter[0]); else seq_puts(m, delimiter); } if (!width) width = 1; if (m->count + width >= m->size) goto overflow; len = num_to_str(m->buf + m->count, m->size - m->count, num, width); if (!len) goto overflow; m->count += len; return; overflow: seq_set_overflow(m); } void seq_put_decimal_ull(struct seq_file *m, const char *delimiter, unsigned long long num) { return seq_put_decimal_ull_width(m, delimiter, num, 0); } EXPORT_SYMBOL(seq_put_decimal_ull); /** * seq_put_hex_ll - put a number in hexadecimal notation * @m: seq_file identifying the buffer to which data should be written * @delimiter: a string which is printed before the number * @v: the number * @width: a minimum field width * * seq_put_hex_ll(m, "", v, 8) is equal to seq_printf(m, "%08llx", v) * * This routine is very quick when you show lots of numbers. * In usual cases, it will be better to use seq_printf(). It's easier to read. 
*/ void seq_put_hex_ll(struct seq_file *m, const char *delimiter, unsigned long long v, unsigned int width) { unsigned int len; int i; if (delimiter && delimiter[0]) { if (delimiter[1] == 0) seq_putc(m, delimiter[0]); else seq_puts(m, delimiter); } /* If x is 0, the result of __builtin_clzll is undefined */ if (v == 0) len = 1; else len = (sizeof(v) * 8 - __builtin_clzll(v) + 3) / 4; if (len < width) len = width; if (m->count + len > m->size) { seq_set_overflow(m); return; } for (i = len - 1; i >= 0; i--) { m->buf[m->count + i] = hex_asc[0xf & v]; v = v >> 4; } m->count += len; } void seq_put_decimal_ll(struct seq_file *m, const char *delimiter, long long num) { int len; if (m->count + 3 >= m->size) /* we'll write 2 bytes at least */ goto overflow; if (delimiter && delimiter[0]) { if (delimiter[1] == 0) seq_putc(m, delimiter[0]); else seq_puts(m, delimiter); } if (m->count + 2 >= m->size) goto overflow; if (num < 0) { m->buf[m->count++] = '-'; num = -num; } if (num < 10) { m->buf[m->count++] = num + '0'; return; } len = num_to_str(m->buf + m->count, m->size - m->count, num, 0); if (!len) goto overflow; m->count += len; return; overflow: seq_set_overflow(m); } EXPORT_SYMBOL(seq_put_decimal_ll); /** * seq_write - write arbitrary data to buffer * @seq: seq_file identifying the buffer to which data should be written * @data: data address * @len: number of bytes * * Return 0 on success, non-zero otherwise. 
*/ int seq_write(struct seq_file *seq, const void *data, size_t len) { if (seq->count + len < seq->size) { memcpy(seq->buf + seq->count, data, len); seq->count += len; return 0; } seq_set_overflow(seq); return -1; } EXPORT_SYMBOL(seq_write); /** * seq_pad - write padding spaces to buffer * @m: seq_file identifying the buffer to which data should be written * @c: the byte to append after padding if non-zero */ void seq_pad(struct seq_file *m, char c) { int size = m->pad_until - m->count; if (size > 0) { if (size + m->count > m->size) { seq_set_overflow(m); return; } memset(m->buf + m->count, ' ', size); m->count += size; } if (c) seq_putc(m, c); } EXPORT_SYMBOL(seq_pad); /* A complete analogue of print_hex_dump() */ void seq_hex_dump(struct seq_file *m, const char *prefix_str, int prefix_type, int rowsize, int groupsize, const void *buf, size_t len, bool ascii) { const u8 *ptr = buf; int i, linelen, remaining = len; char *buffer; size_t size; int ret; if (rowsize != 16 && rowsize != 32) rowsize = 16; for (i = 0; i < len && !seq_has_overflowed(m); i += rowsize) { linelen = min(remaining, rowsize); remaining -= rowsize; switch (prefix_type) { case DUMP_PREFIX_ADDRESS: seq_printf(m, "%s%p: ", prefix_str, ptr + i); break; case DUMP_PREFIX_OFFSET: seq_printf(m, "%s%.8x: ", prefix_str, i); break; default: seq_printf(m, "%s", prefix_str); break; } size = seq_get_buf(m, &buffer); ret = hex_dump_to_buffer(ptr + i, linelen, rowsize, groupsize, buffer, size, ascii); seq_commit(m, ret < size ? 
ret : -1); seq_putc(m, '\n'); } } EXPORT_SYMBOL(seq_hex_dump); struct list_head *seq_list_start(struct list_head *head, loff_t pos) { struct list_head *lh; list_for_each(lh, head) if (pos-- == 0) return lh; return NULL; } EXPORT_SYMBOL(seq_list_start); struct list_head *seq_list_start_head(struct list_head *head, loff_t pos) { if (!pos) return head; return seq_list_start(head, pos - 1); } EXPORT_SYMBOL(seq_list_start_head); struct list_head *seq_list_next(void *v, struct list_head *head, loff_t *ppos) { struct list_head *lh; lh = ((struct list_head *)v)->next; ++*ppos; return lh == head ? NULL : lh; } EXPORT_SYMBOL(seq_list_next); struct list_head *seq_list_start_rcu(struct list_head *head, loff_t pos) { struct list_head *lh; list_for_each_rcu(lh, head) if (pos-- == 0) return lh; return NULL; } EXPORT_SYMBOL(seq_list_start_rcu); struct list_head *seq_list_start_head_rcu(struct list_head *head, loff_t pos) { if (!pos) return head; return seq_list_start_rcu(head, pos - 1); } EXPORT_SYMBOL(seq_list_start_head_rcu); struct list_head *seq_list_next_rcu(void *v, struct list_head *head, loff_t *ppos) { struct list_head *lh; lh = list_next_rcu((struct list_head *)v); ++*ppos; return lh == head ? NULL : lh; } EXPORT_SYMBOL(seq_list_next_rcu); /** * seq_hlist_start - start an iteration of a hlist * @head: the head of the hlist * @pos: the start position of the sequence * * Called at seq_file->op->start(). */ struct hlist_node *seq_hlist_start(struct hlist_head *head, loff_t pos) { struct hlist_node *node; hlist_for_each(node, head) if (pos-- == 0) return node; return NULL; } EXPORT_SYMBOL(seq_hlist_start); /** * seq_hlist_start_head - start an iteration of a hlist * @head: the head of the hlist * @pos: the start position of the sequence * * Called at seq_file->op->start(). Call this function if you want to * print a header at the top of the output. 
*/ struct hlist_node *seq_hlist_start_head(struct hlist_head *head, loff_t pos) { if (!pos) return SEQ_START_TOKEN; return seq_hlist_start(head, pos - 1); } EXPORT_SYMBOL(seq_hlist_start_head); /** * seq_hlist_next - move to the next position of the hlist * @v: the current iterator * @head: the head of the hlist * @ppos: the current position * * Called at seq_file->op->next(). */ struct hlist_node *seq_hlist_next(void *v, struct hlist_head *head, loff_t *ppos) { struct hlist_node *node = v; ++*ppos; if (v == SEQ_START_TOKEN) return head->first; else return node->next; } EXPORT_SYMBOL(seq_hlist_next); /** * seq_hlist_start_rcu - start an iteration of a hlist protected by RCU * @head: the head of the hlist * @pos: the start position of the sequence * * Called at seq_file->op->start(). * * This list-traversal primitive may safely run concurrently with * the _rcu list-mutation primitives such as hlist_add_head_rcu() * as long as the traversal is guarded by rcu_read_lock(). */ struct hlist_node *seq_hlist_start_rcu(struct hlist_head *head, loff_t pos) { struct hlist_node *node; __hlist_for_each_rcu(node, head) if (pos-- == 0) return node; return NULL; } EXPORT_SYMBOL(seq_hlist_start_rcu); /** * seq_hlist_start_head_rcu - start an iteration of a hlist protected by RCU * @head: the head of the hlist * @pos: the start position of the sequence * * Called at seq_file->op->start(). Call this function if you want to * print a header at the top of the output. * * This list-traversal primitive may safely run concurrently with * the _rcu list-mutation primitives such as hlist_add_head_rcu() * as long as the traversal is guarded by rcu_read_lock(). 
*/ struct hlist_node *seq_hlist_start_head_rcu(struct hlist_head *head, loff_t pos) { if (!pos) return SEQ_START_TOKEN; return seq_hlist_start_rcu(head, pos - 1); } EXPORT_SYMBOL(seq_hlist_start_head_rcu); /** * seq_hlist_next_rcu - move to the next position of the hlist protected by RCU * @v: the current iterator * @head: the head of the hlist * @ppos: the current position * * Called at seq_file->op->next(). * * This list-traversal primitive may safely run concurrently with * the _rcu list-mutation primitives such as hlist_add_head_rcu() * as long as the traversal is guarded by rcu_read_lock(). */ struct hlist_node *seq_hlist_next_rcu(void *v, struct hlist_head *head, loff_t *ppos) { struct hlist_node *node = v; ++*ppos; if (v == SEQ_START_TOKEN) return rcu_dereference(head->first); else return rcu_dereference(node->next); } EXPORT_SYMBOL(seq_hlist_next_rcu); /** * seq_hlist_start_percpu - start an iteration of a percpu hlist array * @head: pointer to percpu array of struct hlist_heads * @cpu: pointer to cpu "cursor" * @pos: start position of sequence * * Called at seq_file->op->start(). */ struct hlist_node * seq_hlist_start_percpu(struct hlist_head __percpu *head, int *cpu, loff_t pos) { struct hlist_node *node; for_each_possible_cpu(*cpu) { hlist_for_each(node, per_cpu_ptr(head, *cpu)) { if (pos-- == 0) return node; } } return NULL; } EXPORT_SYMBOL(seq_hlist_start_percpu); /** * seq_hlist_next_percpu - move to the next position of the percpu hlist array * @v: pointer to current hlist_node * @head: pointer to percpu array of struct hlist_heads * @cpu: pointer to cpu "cursor" * @pos: start position of sequence * * Called at seq_file->op->next(). 
*/
struct hlist_node *
seq_hlist_next_percpu(void *v, struct hlist_head __percpu *head,
		      int *cpu, loff_t *pos)
{
	struct hlist_node *cur = v;

	++*pos;

	/* Still inside the current cpu's list. */
	if (cur->next)
		return cur->next;

	/* Otherwise continue with the first non-empty list of a later cpu. */
	*cpu = cpumask_next(*cpu, cpu_possible_mask);
	while (*cpu < nr_cpu_ids) {
		struct hlist_head *bucket = per_cpu_ptr(head, *cpu);

		if (!hlist_empty(bucket))
			return bucket->first;

		*cpu = cpumask_next(*cpu, cpu_possible_mask);
	}

	return NULL;
}
EXPORT_SYMBOL(seq_hlist_next_percpu);

/* Create the slab cache that backs struct seq_file allocations. */
void __init seq_file_init(void)
{
	seq_file_cache = KMEM_CACHE(seq_file, SLAB_ACCOUNT | SLAB_PANIC);
}
|
1 1 1 1 1 1 1 1 1 1 4 4 1 1 32 32 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 
519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 | /* * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/rculist.h> #include <linux/llist.h> #include "rds_single_path.h" #include "ib_mr.h" #include "rds.h" struct workqueue_struct *rds_ib_mr_wq; static void rds_ib_odp_mr_worker(struct work_struct *work); static struct rds_ib_device *rds_ib_get_device(__be32 ipaddr) { struct rds_ib_device *rds_ibdev; struct rds_ib_ipaddr *i_ipaddr; rcu_read_lock(); list_for_each_entry_rcu(rds_ibdev, &rds_ib_devices, list) { list_for_each_entry_rcu(i_ipaddr, &rds_ibdev->ipaddr_list, list) { if (i_ipaddr->ipaddr == ipaddr) { refcount_inc(&rds_ibdev->refcount); rcu_read_unlock(); return rds_ibdev; } } } rcu_read_unlock(); return NULL; } static int rds_ib_add_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) { struct rds_ib_ipaddr *i_ipaddr; i_ipaddr = kmalloc(sizeof *i_ipaddr, GFP_KERNEL); if (!i_ipaddr) return -ENOMEM; i_ipaddr->ipaddr = ipaddr; spin_lock_irq(&rds_ibdev->spinlock); list_add_tail_rcu(&i_ipaddr->list, &rds_ibdev->ipaddr_list); spin_unlock_irq(&rds_ibdev->spinlock); return 0; } static void rds_ib_remove_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) { struct rds_ib_ipaddr *i_ipaddr; struct rds_ib_ipaddr *to_free = NULL; spin_lock_irq(&rds_ibdev->spinlock); list_for_each_entry_rcu(i_ipaddr, &rds_ibdev->ipaddr_list, list) { if (i_ipaddr->ipaddr == ipaddr) { list_del_rcu(&i_ipaddr->list); to_free = i_ipaddr; break; } } spin_unlock_irq(&rds_ibdev->spinlock); if (to_free) kfree_rcu(to_free, rcu); } int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, struct in6_addr *ipaddr) { struct rds_ib_device *rds_ibdev_old; rds_ibdev_old = rds_ib_get_device(ipaddr->s6_addr32[3]); if (!rds_ibdev_old) return rds_ib_add_ipaddr(rds_ibdev, 
ipaddr->s6_addr32[3]); if (rds_ibdev_old != rds_ibdev) { rds_ib_remove_ipaddr(rds_ibdev_old, ipaddr->s6_addr32[3]); rds_ib_dev_put(rds_ibdev_old); return rds_ib_add_ipaddr(rds_ibdev, ipaddr->s6_addr32[3]); } rds_ib_dev_put(rds_ibdev_old); return 0; } void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn) { struct rds_ib_connection *ic = conn->c_transport_data; /* conn was previously on the nodev_conns_list */ spin_lock_irq(&ib_nodev_conns_lock); BUG_ON(list_empty(&ib_nodev_conns)); BUG_ON(list_empty(&ic->ib_node)); list_del(&ic->ib_node); spin_lock(&rds_ibdev->spinlock); list_add_tail(&ic->ib_node, &rds_ibdev->conn_list); spin_unlock(&rds_ibdev->spinlock); spin_unlock_irq(&ib_nodev_conns_lock); ic->rds_ibdev = rds_ibdev; refcount_inc(&rds_ibdev->refcount); } void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn) { struct rds_ib_connection *ic = conn->c_transport_data; /* place conn on nodev_conns_list */ spin_lock(&ib_nodev_conns_lock); spin_lock_irq(&rds_ibdev->spinlock); BUG_ON(list_empty(&ic->ib_node)); list_del(&ic->ib_node); spin_unlock_irq(&rds_ibdev->spinlock); list_add_tail(&ic->ib_node, &ib_nodev_conns); spin_unlock(&ib_nodev_conns_lock); ic->rds_ibdev = NULL; rds_ib_dev_put(rds_ibdev); } void rds_ib_destroy_nodev_conns(void) { struct rds_ib_connection *ic, *_ic; LIST_HEAD(tmp_list); /* avoid calling conn_destroy with irqs off */ spin_lock_irq(&ib_nodev_conns_lock); list_splice(&ib_nodev_conns, &tmp_list); spin_unlock_irq(&ib_nodev_conns_lock); list_for_each_entry_safe(ic, _ic, &tmp_list, ib_node) rds_conn_destroy(ic->conn); } void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo) { struct rds_ib_mr_pool *pool_1m = rds_ibdev->mr_1m_pool; iinfo->rdma_mr_max = pool_1m->max_items; iinfo->rdma_mr_size = pool_1m->max_pages; } #if IS_ENABLED(CONFIG_IPV6) void rds6_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds6_info_rdma_connection *iinfo6) { struct 
rds_ib_mr_pool *pool_1m = rds_ibdev->mr_1m_pool; iinfo6->rdma_mr_max = pool_1m->max_items; iinfo6->rdma_mr_size = pool_1m->max_pages; } #endif struct rds_ib_mr *rds_ib_reuse_mr(struct rds_ib_mr_pool *pool) { struct rds_ib_mr *ibmr = NULL; struct llist_node *ret; unsigned long flags; spin_lock_irqsave(&pool->clean_lock, flags); ret = llist_del_first(&pool->clean_list); spin_unlock_irqrestore(&pool->clean_lock, flags); if (ret) { ibmr = llist_entry(ret, struct rds_ib_mr, llnode); if (pool->pool_type == RDS_IB_MR_8K_POOL) rds_ib_stats_inc(s_ib_rdma_mr_8k_reused); else rds_ib_stats_inc(s_ib_rdma_mr_1m_reused); } return ibmr; } void rds_ib_sync_mr(void *trans_private, int direction) { struct rds_ib_mr *ibmr = trans_private; struct rds_ib_device *rds_ibdev = ibmr->device; if (ibmr->odp) return; switch (direction) { case DMA_FROM_DEVICE: ib_dma_sync_sg_for_cpu(rds_ibdev->dev, ibmr->sg, ibmr->sg_dma_len, DMA_BIDIRECTIONAL); break; case DMA_TO_DEVICE: ib_dma_sync_sg_for_device(rds_ibdev->dev, ibmr->sg, ibmr->sg_dma_len, DMA_BIDIRECTIONAL); break; } } void __rds_ib_teardown_mr(struct rds_ib_mr *ibmr) { struct rds_ib_device *rds_ibdev = ibmr->device; if (ibmr->sg_dma_len) { ib_dma_unmap_sg(rds_ibdev->dev, ibmr->sg, ibmr->sg_len, DMA_BIDIRECTIONAL); ibmr->sg_dma_len = 0; } /* Release the s/g list */ if (ibmr->sg_len) { unsigned int i; for (i = 0; i < ibmr->sg_len; ++i) { struct page *page = sg_page(&ibmr->sg[i]); /* FIXME we need a way to tell a r/w MR * from a r/o MR */ WARN_ON(!page->mapping && irqs_disabled()); set_page_dirty(page); put_page(page); } kfree(ibmr->sg); ibmr->sg = NULL; ibmr->sg_len = 0; } } void rds_ib_teardown_mr(struct rds_ib_mr *ibmr) { unsigned int pinned = ibmr->sg_len; __rds_ib_teardown_mr(ibmr); if (pinned) { struct rds_ib_mr_pool *pool = ibmr->pool; atomic_sub(pinned, &pool->free_pinned); } } static inline unsigned int rds_ib_flush_goal(struct rds_ib_mr_pool *pool, int free_all) { unsigned int item_count; item_count = atomic_read(&pool->item_count); 
if (free_all) return item_count; return 0; } /* * given an llist of mrs, put them all into the list_head for more processing */ static unsigned int llist_append_to_list(struct llist_head *llist, struct list_head *list) { struct rds_ib_mr *ibmr; struct llist_node *node; struct llist_node *next; unsigned int count = 0; node = llist_del_all(llist); while (node) { next = node->next; ibmr = llist_entry(node, struct rds_ib_mr, llnode); list_add_tail(&ibmr->unmap_list, list); node = next; count++; } return count; } /* * this takes a list head of mrs and turns it into linked llist nodes * of clusters. Each cluster has linked llist nodes of * MR_CLUSTER_SIZE mrs that are ready for reuse. */ static void list_to_llist_nodes(struct list_head *list, struct llist_node **nodes_head, struct llist_node **nodes_tail) { struct rds_ib_mr *ibmr; struct llist_node *cur = NULL; struct llist_node **next = nodes_head; list_for_each_entry(ibmr, list, unmap_list) { cur = &ibmr->llnode; *next = cur; next = &cur->next; } *next = NULL; *nodes_tail = cur; } /* * Flush our pool of MRs. * At a minimum, all currently unused MRs are unmapped. * If the number of MRs allocated exceeds the limit, we also try * to free as many MRs as needed to get back to this limit. 
*/ int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all, struct rds_ib_mr **ibmr_ret) { struct rds_ib_mr *ibmr; struct llist_node *clean_nodes; struct llist_node *clean_tail; LIST_HEAD(unmap_list); unsigned long unpinned = 0; unsigned int nfreed = 0, dirty_to_clean = 0, free_goal; if (pool->pool_type == RDS_IB_MR_8K_POOL) rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_flush); else rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_flush); if (ibmr_ret) { DEFINE_WAIT(wait); while (!mutex_trylock(&pool->flush_lock)) { ibmr = rds_ib_reuse_mr(pool); if (ibmr) { *ibmr_ret = ibmr; finish_wait(&pool->flush_wait, &wait); goto out_nolock; } prepare_to_wait(&pool->flush_wait, &wait, TASK_UNINTERRUPTIBLE); if (llist_empty(&pool->clean_list)) schedule(); ibmr = rds_ib_reuse_mr(pool); if (ibmr) { *ibmr_ret = ibmr; finish_wait(&pool->flush_wait, &wait); goto out_nolock; } } finish_wait(&pool->flush_wait, &wait); } else mutex_lock(&pool->flush_lock); if (ibmr_ret) { ibmr = rds_ib_reuse_mr(pool); if (ibmr) { *ibmr_ret = ibmr; goto out; } } /* Get the list of all MRs to be dropped. Ordering matters - * we want to put drop_list ahead of free_list. 
*/ dirty_to_clean = llist_append_to_list(&pool->drop_list, &unmap_list); dirty_to_clean += llist_append_to_list(&pool->free_list, &unmap_list); if (free_all) { unsigned long flags; spin_lock_irqsave(&pool->clean_lock, flags); llist_append_to_list(&pool->clean_list, &unmap_list); spin_unlock_irqrestore(&pool->clean_lock, flags); } free_goal = rds_ib_flush_goal(pool, free_all); if (list_empty(&unmap_list)) goto out; rds_ib_unreg_frmr(&unmap_list, &nfreed, &unpinned, free_goal); if (!list_empty(&unmap_list)) { unsigned long flags; list_to_llist_nodes(&unmap_list, &clean_nodes, &clean_tail); if (ibmr_ret) { *ibmr_ret = llist_entry(clean_nodes, struct rds_ib_mr, llnode); clean_nodes = clean_nodes->next; } /* more than one entry in llist nodes */ if (clean_nodes) { spin_lock_irqsave(&pool->clean_lock, flags); llist_add_batch(clean_nodes, clean_tail, &pool->clean_list); spin_unlock_irqrestore(&pool->clean_lock, flags); } } atomic_sub(unpinned, &pool->free_pinned); atomic_sub(dirty_to_clean, &pool->dirty_count); atomic_sub(nfreed, &pool->item_count); out: mutex_unlock(&pool->flush_lock); if (waitqueue_active(&pool->flush_wait)) wake_up(&pool->flush_wait); out_nolock: return 0; } struct rds_ib_mr *rds_ib_try_reuse_ibmr(struct rds_ib_mr_pool *pool) { struct rds_ib_mr *ibmr = NULL; int iter = 0; while (1) { ibmr = rds_ib_reuse_mr(pool); if (ibmr) return ibmr; if (atomic_inc_return(&pool->item_count) <= pool->max_items) break; atomic_dec(&pool->item_count); if (++iter > 2) { if (pool->pool_type == RDS_IB_MR_8K_POOL) rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_depleted); else rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_depleted); break; } /* We do have some empty MRs. Flush them out. 
*/ if (pool->pool_type == RDS_IB_MR_8K_POOL) rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_wait); else rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_wait); rds_ib_flush_mr_pool(pool, 0, &ibmr); if (ibmr) return ibmr; } return NULL; } static void rds_ib_mr_pool_flush_worker(struct work_struct *work) { struct rds_ib_mr_pool *pool = container_of(work, struct rds_ib_mr_pool, flush_worker.work); rds_ib_flush_mr_pool(pool, 0, NULL); } void rds_ib_free_mr(void *trans_private, int invalidate) { struct rds_ib_mr *ibmr = trans_private; struct rds_ib_mr_pool *pool = ibmr->pool; struct rds_ib_device *rds_ibdev = ibmr->device; rdsdebug("RDS/IB: free_mr nents %u\n", ibmr->sg_len); if (ibmr->odp) { /* A MR created and marked as use_once. We use delayed work, * because there is a change that we are in interrupt and can't * call to ib_dereg_mr() directly. */ INIT_DELAYED_WORK(&ibmr->work, rds_ib_odp_mr_worker); queue_delayed_work(rds_ib_mr_wq, &ibmr->work, 0); return; } /* Return it to the pool's free list */ rds_ib_free_frmr_list(ibmr); atomic_add(ibmr->sg_len, &pool->free_pinned); atomic_inc(&pool->dirty_count); /* If we've pinned too many pages, request a flush */ if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned || atomic_read(&pool->dirty_count) >= pool->max_items / 5) queue_delayed_work(rds_ib_mr_wq, &pool->flush_worker, 10); if (invalidate) { if (likely(!in_interrupt())) { rds_ib_flush_mr_pool(pool, 0, NULL); } else { /* We get here if the user created a MR marked * as use_once and invalidate at the same time. 
*/ queue_delayed_work(rds_ib_mr_wq, &pool->flush_worker, 10); } } rds_ib_dev_put(rds_ibdev); } void rds_ib_flush_mrs(void) { struct rds_ib_device *rds_ibdev; down_read(&rds_ib_devices_lock); list_for_each_entry(rds_ibdev, &rds_ib_devices, list) { if (rds_ibdev->mr_8k_pool) rds_ib_flush_mr_pool(rds_ibdev->mr_8k_pool, 0, NULL); if (rds_ibdev->mr_1m_pool) rds_ib_flush_mr_pool(rds_ibdev->mr_1m_pool, 0, NULL); } up_read(&rds_ib_devices_lock); } u32 rds_ib_get_lkey(void *trans_private) { struct rds_ib_mr *ibmr = trans_private; return ibmr->u.mr->lkey; } void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, struct rds_sock *rs, u32 *key_ret, struct rds_connection *conn, u64 start, u64 length, int need_odp) { struct rds_ib_device *rds_ibdev; struct rds_ib_mr *ibmr = NULL; struct rds_ib_connection *ic = NULL; int ret; rds_ibdev = rds_ib_get_device(rs->rs_bound_addr.s6_addr32[3]); if (!rds_ibdev) { ret = -ENODEV; goto out; } if (need_odp == ODP_ZEROBASED || need_odp == ODP_VIRTUAL) { u64 virt_addr = need_odp == ODP_ZEROBASED ? 
0 : start; int access_flags = (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ | IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_ATOMIC | IB_ACCESS_ON_DEMAND); struct ib_sge sge = {}; struct ib_mr *ib_mr; if (!rds_ibdev->odp_capable) { ret = -EOPNOTSUPP; goto out; } ib_mr = ib_reg_user_mr(rds_ibdev->pd, start, length, virt_addr, access_flags); if (IS_ERR(ib_mr)) { rdsdebug("rds_ib_get_user_mr returned %d\n", IS_ERR(ib_mr)); ret = PTR_ERR(ib_mr); goto out; } if (key_ret) *key_ret = ib_mr->rkey; ibmr = kzalloc(sizeof(*ibmr), GFP_KERNEL); if (!ibmr) { ib_dereg_mr(ib_mr); ret = -ENOMEM; goto out; } ibmr->u.mr = ib_mr; ibmr->odp = 1; sge.addr = virt_addr; sge.length = length; sge.lkey = ib_mr->lkey; ib_advise_mr(rds_ibdev->pd, IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE, IB_UVERBS_ADVISE_MR_FLAG_FLUSH, &sge, 1); return ibmr; } if (conn) ic = conn->c_transport_data; if (!rds_ibdev->mr_8k_pool || !rds_ibdev->mr_1m_pool) { ret = -ENODEV; goto out; } ibmr = rds_ib_reg_frmr(rds_ibdev, ic, sg, nents, key_ret); if (IS_ERR(ibmr)) { ret = PTR_ERR(ibmr); pr_warn("RDS/IB: rds_ib_get_mr failed (errno=%d)\n", ret); } else { return ibmr; } out: if (rds_ibdev) rds_ib_dev_put(rds_ibdev); return ERR_PTR(ret); } void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool) { cancel_delayed_work_sync(&pool->flush_worker); rds_ib_flush_mr_pool(pool, 1, NULL); WARN_ON(atomic_read(&pool->item_count)); WARN_ON(atomic_read(&pool->free_pinned)); kfree(pool); } struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev, int pool_type) { struct rds_ib_mr_pool *pool; pool = kzalloc(sizeof(*pool), GFP_KERNEL); if (!pool) return ERR_PTR(-ENOMEM); pool->pool_type = pool_type; init_llist_head(&pool->free_list); init_llist_head(&pool->drop_list); init_llist_head(&pool->clean_list); spin_lock_init(&pool->clean_lock); mutex_init(&pool->flush_lock); init_waitqueue_head(&pool->flush_wait); INIT_DELAYED_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker); if (pool_type == RDS_IB_MR_1M_POOL) { /* +1 
allows for unaligned MRs */ pool->max_pages = RDS_MR_1M_MSG_SIZE + 1; pool->max_items = rds_ibdev->max_1m_mrs; } else { /* pool_type == RDS_IB_MR_8K_POOL */ pool->max_pages = RDS_MR_8K_MSG_SIZE + 1; pool->max_items = rds_ibdev->max_8k_mrs; } pool->max_free_pinned = pool->max_items * pool->max_pages / 4; pool->max_items_soft = rds_ibdev->max_mrs * 3 / 4; return pool; } int rds_ib_mr_init(void) { rds_ib_mr_wq = alloc_workqueue("rds_mr_flushd", WQ_MEM_RECLAIM, 0); if (!rds_ib_mr_wq) return -ENOMEM; return 0; } /* By the time this is called all the IB devices should have been torn down and * had their pools freed. As each pool is freed its work struct is waited on, * so the pool flushing work queue should be idle by the time we get here. */ void rds_ib_mr_exit(void) { destroy_workqueue(rds_ib_mr_wq); } static void rds_ib_odp_mr_worker(struct work_struct *work) { struct rds_ib_mr *ibmr; ibmr = container_of(work, struct rds_ib_mr, work.work); ib_dereg_mr(ibmr->u.mr); kfree(ibmr); } |
555 229 308 231 170 73 164 46 331 218 250 194 307 310 1 170 401 12 29 39 396 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 | // SPDX-License-Identifier: GPL-2.0-only #include <net/tcp.h> /* The bandwidth estimator estimates the rate at which the network * can currently deliver outbound data packets for this flow. At a high * level, it operates by taking a delivery rate sample for each ACK. * * A rate sample records the rate at which the network delivered packets * for this flow, calculated over the time interval between the transmission * of a data packet and the acknowledgment of that packet. * * Specifically, over the interval between each transmit and corresponding ACK, * the estimator generates a delivery rate sample. Typically it uses the rate * at which packets were acknowledged. However, the approach of using only the * acknowledgment rate faces a challenge under the prevalent ACK decimation or * compression: packets can temporarily appear to be delivered much quicker * than the bottleneck rate. 
Since it is physically impossible to do that in a * sustained fashion, when the estimator notices that the ACK rate is faster * than the transmit rate, it uses the latter: * * send_rate = #pkts_delivered/(last_snd_time - first_snd_time) * ack_rate = #pkts_delivered/(last_ack_time - first_ack_time) * bw = min(send_rate, ack_rate) * * Notice the estimator essentially estimates the goodput, not always the * network bottleneck link rate when the sending or receiving is limited by * other factors like applications or receiver window limits. The estimator * deliberately avoids using the inter-packet spacing approach because that * approach requires a large number of samples and sophisticated filtering. * * TCP flows can often be application-limited in request/response workloads. * The estimator marks a bandwidth sample as application-limited if there * was some moment during the sampled window of packets when there was no data * ready to send in the write queue. */ /* Snapshot the current delivery information in the skb, to generate * a rate sample later when the skb is (s)acked in tcp_rate_skb_delivered(). */ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); /* In general we need to start delivery rate samples from the * time we received the most recent ACK, to ensure we include * the full time the network needs to deliver all in-flight * packets. If there are no packets in flight yet, then we * know that any ACKs after now indicate that the network was * able to deliver those packets completely in the sampling * interval between now and the next ACK. * * Note that we use packets_out instead of tcp_packets_in_flight(tp) * because the latter is a guess based on RTO and loss-marking * heuristics. We don't want spurious RTOs or loss markings to cause * a spuriously small time interval, causing a spuriously high * bandwidth estimate. 
*/
	if (!tp->packets_out) {
		u64 tstamp_us = tcp_skb_timestamp_us(skb);

		tp->first_tx_mstamp  = tstamp_us;
		tp->delivered_mstamp = tstamp_us;
	}

	/* Snapshot the connection's current delivery state into the skb. */
	TCP_SKB_CB(skb)->tx.first_tx_mstamp	= tp->first_tx_mstamp;
	TCP_SKB_CB(skb)->tx.delivered_mstamp	= tp->delivered_mstamp;
	TCP_SKB_CB(skb)->tx.delivered		= tp->delivered;
	TCP_SKB_CB(skb)->tx.delivered_ce	= tp->delivered_ce;
	TCP_SKB_CB(skb)->tx.is_app_limited	= tp->app_limited ? 1 : 0;
}

/* When an skb is sacked or acked, we fill in the rate sample with the (prior)
 * delivery information when the skb was last transmitted.
 *
 * If an ACK (s)acks multiple skbs (e.g., stretched-acks), this function is
 * called multiple times. We favor the information from the most recently
 * sent skb, i.e., the skb with the most recently sent time and the highest
 * sequence.
 */
void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb,
			    struct rate_sample *rs)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
	u64 tx_tstamp;

	/* A zero delivered_mstamp marks an skb that must not be sampled;
	 * it is cleared at the bottom of this function once an skb is SACKed.
	 */
	if (!scb->tx.delivered_mstamp)
		return;

	tx_tstamp = tcp_skb_timestamp_us(skb);
	if (!rs->prior_delivered ||
	    tcp_skb_sent_after(tx_tstamp, tp->first_tx_mstamp,
			       scb->end_seq, rs->last_end_seq)) {
		rs->prior_delivered_ce  = scb->tx.delivered_ce;
		rs->prior_delivered  = scb->tx.delivered;
		rs->prior_mstamp     = scb->tx.delivered_mstamp;
		rs->is_app_limited   = scb->tx.is_app_limited;
		rs->is_retrans	     = scb->sacked & TCPCB_RETRANS;
		rs->last_end_seq     = scb->end_seq;

		/* Record send time of most recently ACKed packet: */
		tp->first_tx_mstamp  = tx_tstamp;
		/* Find the duration of the "send phase" of this window: */
		rs->interval_us = tcp_stamp_us_delta(tp->first_tx_mstamp,
						     scb->tx.first_tx_mstamp);

	}
	/* Mark off the skb delivered once it's sacked to avoid being
	 * used again when it's cumulatively acked. For acked packets
	 * we don't need to reset since it'll be freed soon.
	 */
	if (scb->sacked & TCPCB_SACKED_ACKED)
		scb->tx.delivered_mstamp = 0;
}

/* Update the connection delivery information and generate a rate sample.
 *
 * @delivered is stored as rs->acked_sacked and @lost as rs->losses. When
 * @is_sack_reneg is set, or no prior timestamp was recorded in @rs, the
 * sample is marked invalid (delivered and interval_us set to -1).
 */
void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
		  bool is_sack_reneg, struct rate_sample *rs)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u32 snd_us, ack_us;

	/* Clear app limited if bubble is acked and gone. */
	if (tp->app_limited && after(tp->delivered, tp->app_limited))
		tp->app_limited = 0;

	/* TODO: there are multiple places throughout tcp_ack() to get
	 * current time. Refactor the code using a new "tcp_acktag_state"
	 * to carry current time, flags, stats like "tcp_sacktag_state".
	 */
	if (delivered)
		tp->delivered_mstamp = tp->tcp_mstamp;

	rs->acked_sacked = delivered;	/* freshly ACKed or SACKed */
	rs->losses = lost;		/* freshly marked lost */
	/* Return an invalid sample if no timing information is available or
	 * in recovery from loss with SACK reneging. Rate samples taken during
	 * a SACK reneging event may overestimate bw by including packets that
	 * were SACKed before the reneg.
	 */
	if (!rs->prior_mstamp || is_sack_reneg) {
		rs->delivered = -1;
		rs->interval_us = -1;
		return;
	}
	rs->delivered   = tp->delivered - rs->prior_delivered;

	rs->delivered_ce = tp->delivered_ce - rs->prior_delivered_ce;
	/* delivered_ce occupies less than 32 bits in the skb control block */
	rs->delivered_ce &= TCPCB_DELIVERED_CE_MASK;

	/* Model sending data and receiving ACKs as separate pipeline phases
	 * for a window. Usually the ACK phase is longer, but with ACK
	 * compression the send phase can be longer. To be safe we use the
	 * longer phase.
	 */
	snd_us = rs->interval_us;				/* send phase */
	ack_us = tcp_stamp_us_delta(tp->tcp_mstamp,
				    rs->prior_mstamp); /* ack phase */
	rs->interval_us = max(snd_us, ack_us);

	/* Record both segment send and ack receive intervals */
	rs->snd_interval_us = snd_us;
	rs->rcv_interval_us = ack_us;

	/* Normally we expect interval_us >= min-rtt.
	 * Note that rate may still be over-estimated when a spuriously
	 * retransmitted skb was first (s)acked because "interval_us"
	 * is under-estimated (up to an RTT). However continuously
	 * measuring the delivery rate during loss recovery is crucial
	 * for connections that suffer heavy or prolonged losses.
	 */
	if (unlikely(rs->interval_us < tcp_min_rtt(tp))) {
		if (!rs->is_retrans)
			pr_debug("tcp rate: %ld %d %u %u %u\n",
				 rs->interval_us, rs->delivered,
				 inet_csk(sk)->icsk_ca_state,
				 tp->rx_opt.sack_ok, tcp_min_rtt(tp));
		rs->interval_us = -1;
		return;
	}

	/* Record the last non-app-limited or the highest app-limited bw.
	 * The two rates are compared by cross-multiplication in 64 bits,
	 * which avoids a division.
	 */
	if (!rs->is_app_limited ||
	    ((u64)rs->delivered * tp->rate_interval_us >=
	     (u64)tp->rate_delivered * rs->interval_us)) {
		tp->rate_delivered = rs->delivered;
		tp->rate_interval_us = rs->interval_us;
		tp->rate_app_limited = rs->is_app_limited;
	}
}

/* If a gap is detected between sends, mark the socket application-limited.
 *
 * The stored marker is delivered + packets in flight; the "?: 1" keeps it
 * non-zero, because tp->app_limited is tested for truth elsewhere in this
 * file to mean "application-limited".
 */
void tcp_rate_check_app_limited(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (/* We have less than one packet to send. */
	    tp->write_seq - tp->snd_nxt < tp->mss_cache &&
	    /* Nothing in sending host's qdisc queues or NIC tx queue. */
	    sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1) &&
	    /* We are not limited by CWND. */
	    tcp_packets_in_flight(tp) < tcp_snd_cwnd(tp) &&
	    /* All lost packets have been retransmitted. */
	    tp->lost_out <= tp->retrans_out)
		tp->app_limited =
			(tp->delivered + tcp_packets_in_flight(tp)) ? : 1;
}
EXPORT_SYMBOL_GPL(tcp_rate_check_app_limited);
|
4317 4315 17 17 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 
523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 
1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 
1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 | // SPDX-License-Identifier: GPL-2.0-only /* * ACPI device specific properties support. * * Copyright (C) 2014 - 2023, Intel Corporation * All rights reserved. * * Authors: Mika Westerberg <mika.westerberg@linux.intel.com> * Darren Hart <dvhart@linux.intel.com> * Rafael J. 
Wysocki <rafael.j.wysocki@intel.com>
 *          Sakari Ailus <sakari.ailus@linux.intel.com>
 */

#define pr_fmt(fmt) "ACPI: " fmt

#include <linux/acpi.h>
#include <linux/device.h>
#include <linux/export.h>

#include "internal.h"

static int acpi_data_get_property_array(const struct acpi_device_data *data,
					const char *name,
					acpi_object_type type,
					const union acpi_object **obj);

/*
 * The GUIDs here are made equivalent to each other in order to avoid extra
 * complexity in the properties handling code, with the caveat that the
 * kernel will accept certain combinations of GUID and properties that are
 * not defined without a warning. For instance if any of the properties
 * from different GUID appear in a property list of another, it will be
 * accepted by the kernel. Firmware validation tools should catch these.
 *
 * References:
 *
 * [1] UEFI DSD Guide.
 *     https://github.com/UEFI/DSD-Guide/blob/main/src/dsd-guide.adoc
 */
static const guid_t prp_guids[] = {
	/* ACPI _DSD device properties GUID [1]: daffd814-6eba-4d8c-8a91-bc9bbf4aa301 */
	GUID_INIT(0xdaffd814, 0x6eba, 0x4d8c,
		  0x8a, 0x91, 0xbc, 0x9b, 0xbf, 0x4a, 0xa3, 0x01),
	/* Hotplug in D3 GUID: 6211e2c0-58a3-4af3-90e1-927a4e0c55a4 */
	GUID_INIT(0x6211e2c0, 0x58a3, 0x4af3,
		  0x90, 0xe1, 0x92, 0x7a, 0x4e, 0x0c, 0x55, 0xa4),
	/* External facing port GUID: efcc06cc-73ac-4bc3-bff0-76143807c389 */
	GUID_INIT(0xefcc06cc, 0x73ac, 0x4bc3,
		  0xbf, 0xf0, 0x76, 0x14, 0x38, 0x07, 0xc3, 0x89),
	/* Thunderbolt GUID for IMR_VALID: c44d002f-69f9-4e7d-a904-a7baabdf43f7 */
	GUID_INIT(0xc44d002f, 0x69f9, 0x4e7d,
		  0xa9, 0x04, 0xa7, 0xba, 0xab, 0xdf, 0x43, 0xf7),
	/* Thunderbolt GUID for WAKE_SUPPORTED: 6c501103-c189-4296-ba72-9bf5a26ebe5d */
	GUID_INIT(0x6c501103, 0xc189, 0x4296,
		  0xba, 0x72, 0x9b, 0xf5, 0xa2, 0x6e, 0xbe, 0x5d),
	/* Storage device needs D3 GUID: 5025030f-842f-4ab4-a561-99a5189762d0 */
	GUID_INIT(0x5025030f, 0x842f, 0x4ab4,
		  0xa5, 0x61, 0x99, 0xa5, 0x18, 0x97, 0x62, 0xd0),
};

/* ACPI _DSD data subnodes GUID [1]: dbb8e3e6-5886-4ba6-8795-1319f52a966b */
static const guid_t ads_guid =
	GUID_INIT(0xdbb8e3e6, 0x5886, 0x4ba6,
		  0x87, 0x95, 0x13, 0x19, 0xf5, 0x2a, 0x96, 0x6b);

/* ACPI _DSD data buffer GUID [1]: edb12dd0-363d-4085-a3d2-49522ca160c4 */
static const guid_t buffer_prop_guid =
	GUID_INIT(0xedb12dd0, 0x363d, 0x4085,
		  0xa3, 0xd2, 0x49, 0x52, 0x2c, 0xa1, 0x60, 0xc4);

static bool acpi_enumerate_nondev_subnodes(acpi_handle scope,
					   union acpi_object *desc,
					   struct acpi_device_data *data,
					   struct fwnode_handle *parent);
static bool acpi_extract_properties(acpi_handle handle,
				    union acpi_object *desc,
				    struct acpi_device_data *data);

/*
 * Build one data-only subnode from the package @desc and append it to @list.
 *
 * A struct acpi_data_node is allocated, named after the string in the first
 * element of @link, and filled with the properties and nested subnodes found
 * in @desc. The node is linked into @list and true is returned only if at
 * least one of the two extraction steps succeeded; otherwise the node is
 * freed and false is returned. On success the node keeps @handle and a
 * pointer to @desc; @desc itself is never freed here.
 */
static bool acpi_nondev_subnode_extract(union acpi_object *desc,
					acpi_handle handle,
					const union acpi_object *link,
					struct list_head *list,
					struct fwnode_handle *parent)
{
	struct acpi_data_node *dn;
	bool result;

	if (acpi_graph_ignore_port(handle))
		return false;

	dn = kzalloc(sizeof(*dn), GFP_KERNEL);
	if (!dn)
		return false;

	dn->name = link->package.elements[0].string.pointer;
	fwnode_init(&dn->fwnode, &acpi_data_fwnode_ops);
	dn->parent = parent;
	INIT_LIST_HEAD(&dn->data.properties);
	INIT_LIST_HEAD(&dn->data.subnodes);

	result = acpi_extract_properties(handle, desc, &dn->data);

	if (handle) {
		acpi_handle scope;
		acpi_status status;

		/*
		 * The scope for the subnode object lookup is the one of the
		 * namespace node (device) containing the object that has
		 * returned the package.  That is, it's the scope of that
		 * object's parent.
		 */
		status = acpi_get_parent(handle, &scope);
		if (ACPI_SUCCESS(status)
		    && acpi_enumerate_nondev_subnodes(scope, desc, &dn->data,
						      &dn->fwnode))
			result = true;
	} else if (acpi_enumerate_nondev_subnodes(NULL, desc, &dn->data,
						  &dn->fwnode)) {
		/* No handle: nested subnodes are looked up without a scope. */
		result = true;
	}

	if (result) {
		dn->handle = handle;
		dn->data.pointer = desc;
		list_add_tail(&dn->sibling, list);
		return true;
	}

	kfree(dn);
	acpi_handle_debug(handle, "Invalid properties/subnodes data, skipping\n");
	return false;
}

/*
 * Evaluate @handle as a package and turn the result into a data subnode.
 *
 * Returns false if the evaluation fails. If the returned package cannot be
 * turned into a subnode, the evaluation buffer is freed here; on success it
 * stays referenced by the new node (see acpi_nondev_subnode_extract()).
 */
static bool acpi_nondev_subnode_data_ok(acpi_handle handle,
					const union acpi_object *link,
					struct list_head *list,
					struct fwnode_handle *parent)
{
	struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER };
	acpi_status status;

	status = acpi_evaluate_object_typed(handle, NULL, NULL, &buf,
					    ACPI_TYPE_PACKAGE);
	if (ACPI_FAILURE(status))
		return false;

	if (acpi_nondev_subnode_extract(buf.pointer, handle, link, list,
					parent))
		return true;

	ACPI_FREE(buf.pointer);
	return false;
}

/*
 * Resolve the object named by the second (string) element of @link relative
 * to @scope and build a data subnode from it. A NULL @scope, or a name that
 * cannot be resolved, yields false.
 */
static bool acpi_nondev_subnode_ok(acpi_handle scope,
				   const union acpi_object *link,
				   struct list_head *list,
				   struct fwnode_handle *parent)
{
	acpi_handle handle;
	acpi_status status;

	if (!scope)
		return false;

	status = acpi_get_handle(scope, link->package.elements[1].string.pointer,
				 &handle);
	if (ACPI_FAILURE(status))
		return false;

	return acpi_nondev_subnode_data_ok(handle, link, list, parent);
}

static bool acpi_add_nondev_subnodes(acpi_handle scope,
				     union acpi_object *links,
				     struct list_head *list,
				     struct fwnode_handle *parent)
{
	bool ret = false;
	int i;

	for (i = 0; i < links->package.count; i++) {
		union acpi_object *link, *desc;
		acpi_handle handle;
		bool result;

		link = &links->package.elements[i];
		/* Only two elements allowed. */
		if (link->package.count != 2)
			continue;

		/* The first one must be a string. */
		if (link->package.elements[0].type != ACPI_TYPE_STRING)
			continue;

		/* The second one may be a string, a reference or a package.
*/ switch (link->package.elements[1].type) { case ACPI_TYPE_STRING: result = acpi_nondev_subnode_ok(scope, link, list, parent); break; case ACPI_TYPE_LOCAL_REFERENCE: handle = link->package.elements[1].reference.handle; result = acpi_nondev_subnode_data_ok(handle, link, list, parent); break; case ACPI_TYPE_PACKAGE: desc = &link->package.elements[1]; result = acpi_nondev_subnode_extract(desc, NULL, link, list, parent); break; default: result = false; break; } ret = ret || result; } return ret; } static bool acpi_enumerate_nondev_subnodes(acpi_handle scope, union acpi_object *desc, struct acpi_device_data *data, struct fwnode_handle *parent) { int i; /* Look for the ACPI data subnodes GUID. */ for (i = 0; i < desc->package.count; i += 2) { const union acpi_object *guid; union acpi_object *links; guid = &desc->package.elements[i]; links = &desc->package.elements[i + 1]; /* * The first element must be a GUID and the second one must be * a package. */ if (guid->type != ACPI_TYPE_BUFFER || guid->buffer.length != 16 || links->type != ACPI_TYPE_PACKAGE) break; if (!guid_equal((guid_t *)guid->buffer.pointer, &ads_guid)) continue; return acpi_add_nondev_subnodes(scope, links, &data->subnodes, parent); } return false; } static bool acpi_property_value_ok(const union acpi_object *value) { int j; /* * The value must be an integer, a string, a reference, or a package * whose every element must be an integer, a string, or a reference. 
*/ switch (value->type) { case ACPI_TYPE_INTEGER: case ACPI_TYPE_STRING: case ACPI_TYPE_LOCAL_REFERENCE: return true; case ACPI_TYPE_PACKAGE: for (j = 0; j < value->package.count; j++) switch (value->package.elements[j].type) { case ACPI_TYPE_INTEGER: case ACPI_TYPE_STRING: case ACPI_TYPE_LOCAL_REFERENCE: continue; default: return false; } return true; } return false; } static bool acpi_properties_format_valid(const union acpi_object *properties) { int i; for (i = 0; i < properties->package.count; i++) { const union acpi_object *property; property = &properties->package.elements[i]; /* * Only two elements allowed, the first one must be a string and * the second one has to satisfy certain conditions. */ if (property->package.count != 2 || property->package.elements[0].type != ACPI_TYPE_STRING || !acpi_property_value_ok(&property->package.elements[1])) return false; } return true; } static void acpi_init_of_compatible(struct acpi_device *adev) { const union acpi_object *of_compatible; int ret; ret = acpi_data_get_property_array(&adev->data, "compatible", ACPI_TYPE_STRING, &of_compatible); if (ret) { ret = acpi_dev_get_property(adev, "compatible", ACPI_TYPE_STRING, &of_compatible); if (ret) { struct acpi_device *parent; parent = acpi_dev_parent(adev); if (parent && parent->flags.of_compatible_ok) goto out; return; } } adev->data.of_compatible = of_compatible; out: adev->flags.of_compatible_ok = 1; } static bool acpi_is_property_guid(const guid_t *guid) { int i; for (i = 0; i < ARRAY_SIZE(prp_guids); i++) { if (guid_equal(guid, &prp_guids[i])) return true; } return false; } struct acpi_device_properties * acpi_data_add_props(struct acpi_device_data *data, const guid_t *guid, union acpi_object *properties) { struct acpi_device_properties *props; props = kzalloc(sizeof(*props), GFP_KERNEL); if (props) { INIT_LIST_HEAD(&props->list); props->guid = guid; props->properties = properties; list_add_tail(&props->list, &data->properties); } return props; } static void 
acpi_nondev_subnode_tag(acpi_handle handle, void *context) { } static void acpi_untie_nondev_subnodes(struct acpi_device_data *data) { struct acpi_data_node *dn; list_for_each_entry(dn, &data->subnodes, sibling) { acpi_detach_data(dn->handle, acpi_nondev_subnode_tag); acpi_untie_nondev_subnodes(&dn->data); } } static bool acpi_tie_nondev_subnodes(struct acpi_device_data *data) { struct acpi_data_node *dn; list_for_each_entry(dn, &data->subnodes, sibling) { acpi_status status; bool ret; status = acpi_attach_data(dn->handle, acpi_nondev_subnode_tag, dn); if (ACPI_FAILURE(status) && status != AE_ALREADY_EXISTS) { acpi_handle_err(dn->handle, "Can't tag data node\n"); return false; } ret = acpi_tie_nondev_subnodes(&dn->data); if (!ret) return ret; } return true; } static void acpi_data_add_buffer_props(acpi_handle handle, struct acpi_device_data *data, union acpi_object *properties) { struct acpi_device_properties *props; union acpi_object *package; size_t alloc_size; unsigned int i; u32 *count; if (check_mul_overflow((size_t)properties->package.count, sizeof(*package) + sizeof(void *), &alloc_size) || check_add_overflow(sizeof(*props) + sizeof(*package), alloc_size, &alloc_size)) { acpi_handle_warn(handle, "can't allocate memory for %u buffer props", properties->package.count); return; } props = kvzalloc(alloc_size, GFP_KERNEL); if (!props) return; props->guid = &buffer_prop_guid; props->bufs = (void *)(props + 1); props->properties = (void *)(props->bufs + properties->package.count); /* Outer package */ package = props->properties; package->type = ACPI_TYPE_PACKAGE; package->package.elements = package + 1; count = &package->package.count; *count = 0; /* Inner packages */ package++; for (i = 0; i < properties->package.count; i++) { struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER }; union acpi_object *property = &properties->package.elements[i]; union acpi_object *prop, *obj, *buf_obj; acpi_status status; if (property->type != ACPI_TYPE_PACKAGE || 
property->package.count != 2) { acpi_handle_warn(handle, "buffer property %u has %u entries\n", i, property->package.count); continue; } prop = &property->package.elements[0]; obj = &property->package.elements[1]; if (prop->type != ACPI_TYPE_STRING || obj->type != ACPI_TYPE_STRING) { acpi_handle_warn(handle, "wrong object types %u and %u\n", prop->type, obj->type); continue; } status = acpi_evaluate_object_typed(handle, obj->string.pointer, NULL, &buf, ACPI_TYPE_BUFFER); if (ACPI_FAILURE(status)) { acpi_handle_warn(handle, "can't evaluate \"%*pE\" as buffer\n", obj->string.length, obj->string.pointer); continue; } package->type = ACPI_TYPE_PACKAGE; package->package.elements = prop; package->package.count = 2; buf_obj = buf.pointer; /* Replace the string object with a buffer object */ obj->type = ACPI_TYPE_BUFFER; obj->buffer.length = buf_obj->buffer.length; obj->buffer.pointer = buf_obj->buffer.pointer; props->bufs[i] = buf.pointer; package++; (*count)++; } if (*count) list_add(&props->list, &data->properties); else kvfree(props); } static bool acpi_extract_properties(acpi_handle scope, union acpi_object *desc, struct acpi_device_data *data) { int i; if (desc->package.count % 2) return false; /* Look for the device properties GUID. */ for (i = 0; i < desc->package.count; i += 2) { const union acpi_object *guid; union acpi_object *properties; guid = &desc->package.elements[i]; properties = &desc->package.elements[i + 1]; /* * The first element must be a GUID and the second one must be * a package. */ if (guid->type != ACPI_TYPE_BUFFER || guid->buffer.length != 16 || properties->type != ACPI_TYPE_PACKAGE) break; if (guid_equal((guid_t *)guid->buffer.pointer, &buffer_prop_guid)) { acpi_data_add_buffer_props(scope, data, properties); continue; } if (!acpi_is_property_guid((guid_t *)guid->buffer.pointer)) continue; /* * We found the matching GUID. Now validate the format of the * package immediately following it. 
*/
		if (!acpi_properties_format_valid(properties))
			continue;

		acpi_data_add_props(data, (const guid_t *)guid->buffer.pointer,
				    properties);
	}

	return !list_empty(&data->properties);
}

/*
 * Evaluate _DSD for @adev and populate adev->data with the device properties
 * and data subnodes found in it.
 *
 * The evaluation buffer is kept (adev->data.pointer) if either properties or
 * subnodes were extracted from it and freed otherwise. If no _DSD data ends
 * up attached to the device, acpi_extract_apple_properties() is tried.
 */
void acpi_init_properties(struct acpi_device *adev)
{
	struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER };
	struct acpi_hardware_id *hwid;
	acpi_status status;
	bool acpi_of = false;

	INIT_LIST_HEAD(&adev->data.properties);
	INIT_LIST_HEAD(&adev->data.subnodes);

	if (!adev->handle)
		return;

	/*
	 * Check if ACPI_DT_NAMESPACE_HID is present and in that case we fill in
	 * Device Tree compatible properties for this device.
	 */
	list_for_each_entry(hwid, &adev->pnp.ids, list) {
		if (!strcmp(hwid->id, ACPI_DT_NAMESPACE_HID)) {
			acpi_of = true;
			break;
		}
	}

	status = acpi_evaluate_object_typed(adev->handle, "_DSD", NULL, &buf,
					    ACPI_TYPE_PACKAGE);
	if (ACPI_FAILURE(status))
		goto out;

	if (acpi_extract_properties(adev->handle, buf.pointer, &adev->data)) {
		adev->data.pointer = buf.pointer;
		if (acpi_of)
			acpi_init_of_compatible(adev);
	}
	if (acpi_enumerate_nondev_subnodes(adev->handle, buf.pointer,
					&adev->data, acpi_fwnode_handle(adev)))
		adev->data.pointer = buf.pointer;

	if (!adev->data.pointer) {
		acpi_handle_debug(adev->handle, "Invalid _DSD data, skipping\n");
		ACPI_FREE(buf.pointer);
	} else {
		/* Tagging is all-or-nothing: undo it if any node failed. */
		if (!acpi_tie_nondev_subnodes(&adev->data))
			acpi_untie_nondev_subnodes(&adev->data);
	}

 out:
	if (acpi_of && !adev->flags.of_compatible_ok)
		acpi_handle_info(adev->handle,
			 ACPI_DT_NAMESPACE_HID " requires 'compatible' property\n");

	if (!adev->data.pointer)
		acpi_extract_apple_properties(adev);
}

/*
 * Unlink and free every properties record on @list. For records that carry
 * a bufs array, the first props->properties->package.count entries of that
 * array are released with ACPI_FREE() before the record itself is freed.
 */
static void acpi_free_device_properties(struct list_head *list)
{
	struct acpi_device_properties *props, *tmp;

	list_for_each_entry_safe(props, tmp, list, list) {
		u32 i;

		list_del(&props->list);
		/* Buffer data properties were separately allocated */
		if (props->bufs)
			for (i = 0; i < props->properties->package.count; i++)
				ACPI_FREE(props->bufs[i]);
		kvfree(props);
	}
}

/*
 * Tear down all data subnodes on @list, children first and in reverse list
 * order. Each node is only freed after its kobj_done completion has been
 * signalled.
 */
static void acpi_destroy_nondev_subnodes(struct list_head *list)
{
	struct acpi_data_node *dn, *next;

	if (list_empty(list))
		return;

	list_for_each_entry_safe_reverse(dn, next, list, sibling) {
		acpi_destroy_nondev_subnodes(&dn->data.subnodes);
		wait_for_completion(&dn->kobj_done);
		list_del(&dn->sibling);
		ACPI_FREE((void *)dn->data.pointer);
		acpi_free_device_properties(&dn->data.properties);
		kfree(dn);
	}
}

/*
 * Release everything acpi_init_properties() attached to @adev: namespace
 * tags, data subnodes, the _DSD evaluation buffer and the property records.
 */
void acpi_free_properties(struct acpi_device *adev)
{
	acpi_untie_nondev_subnodes(&adev->data);
	acpi_destroy_nondev_subnodes(&adev->data.subnodes);
	ACPI_FREE((void *)adev->data.pointer);
	adev->data.of_compatible = NULL;
	adev->data.pointer = NULL;
	acpi_free_device_properties(&adev->data.properties);
}

/**
 * acpi_data_get_property - return an ACPI property with given name
 * @data: ACPI device data object to get the property from
 * @name: Name of the property
 * @type: Expected property type
 * @obj: Location to store the property value (if not %NULL)
 *
 * Look up a property with @name and store a pointer to the resulting ACPI
 * object at the location pointed to by @obj if found.
 *
 * Callers must not attempt to free the returned objects.  These objects will be
 * freed by the ACPI core automatically during the removal of @data.
 *
 * Return: %0 if property with @name has been found (success),
 *         %-EINVAL if the arguments are invalid,
 *         %-EINVAL if the property doesn't exist,
 *         %-EPROTO if the property value type doesn't match @type.
*/ static int acpi_data_get_property(const struct acpi_device_data *data, const char *name, acpi_object_type type, const union acpi_object **obj) { const struct acpi_device_properties *props; if (!data || !name) return -EINVAL; if (!data->pointer || list_empty(&data->properties)) return -EINVAL; list_for_each_entry(props, &data->properties, list) { const union acpi_object *properties; unsigned int i; properties = props->properties; for (i = 0; i < properties->package.count; i++) { const union acpi_object *propname, *propvalue; const union acpi_object *property; property = &properties->package.elements[i]; propname = &property->package.elements[0]; propvalue = &property->package.elements[1]; if (!strcmp(name, propname->string.pointer)) { if (type != ACPI_TYPE_ANY && propvalue->type != type) return -EPROTO; if (obj) *obj = propvalue; return 0; } } } return -EINVAL; } /** * acpi_dev_get_property - return an ACPI property with given name. * @adev: ACPI device to get the property from. * @name: Name of the property. * @type: Expected property type. * @obj: Location to store the property value (if not %NULL). */ int acpi_dev_get_property(const struct acpi_device *adev, const char *name, acpi_object_type type, const union acpi_object **obj) { return adev ? acpi_data_get_property(&adev->data, name, type, obj) : -EINVAL; } EXPORT_SYMBOL_GPL(acpi_dev_get_property); static const struct acpi_device_data * acpi_device_data_of_node(const struct fwnode_handle *fwnode) { if (is_acpi_device_node(fwnode)) { const struct acpi_device *adev = to_acpi_device_node(fwnode); return &adev->data; } if (is_acpi_data_node(fwnode)) { const struct acpi_data_node *dn = to_acpi_data_node(fwnode); return &dn->data; } return NULL; } /** * acpi_node_prop_get - return an ACPI property with given name. * @fwnode: Firmware node to get the property from. * @propname: Name of the property. * @valptr: Location to store a pointer to the property value (if not %NULL). 
*/ int acpi_node_prop_get(const struct fwnode_handle *fwnode, const char *propname, void **valptr) { return acpi_data_get_property(acpi_device_data_of_node(fwnode), propname, ACPI_TYPE_ANY, (const union acpi_object **)valptr); } /** * acpi_data_get_property_array - return an ACPI array property with given name * @data: ACPI data object to get the property from * @name: Name of the property * @type: Expected type of array elements * @obj: Location to store a pointer to the property value (if not NULL) * * Look up an array property with @name and store a pointer to the resulting * ACPI object at the location pointed to by @obj if found. * * Callers must not attempt to free the returned objects. Those objects will be * freed by the ACPI core automatically during the removal of @data. * * Return: %0 if array property (package) with @name has been found (success), * %-EINVAL if the arguments are invalid, * %-EINVAL if the property doesn't exist, * %-EPROTO if the property is not a package or the type of its elements * doesn't match @type. */ static int acpi_data_get_property_array(const struct acpi_device_data *data, const char *name, acpi_object_type type, const union acpi_object **obj) { const union acpi_object *prop; int ret, i; ret = acpi_data_get_property(data, name, ACPI_TYPE_PACKAGE, &prop); if (ret) return ret; if (type != ACPI_TYPE_ANY) { /* Check that all elements are of correct type. 
*/ for (i = 0; i < prop->package.count; i++) if (prop->package.elements[i].type != type) return -EPROTO; } if (obj) *obj = prop; return 0; } static struct fwnode_handle * acpi_fwnode_get_named_child_node(const struct fwnode_handle *fwnode, const char *childname) { struct fwnode_handle *child; fwnode_for_each_child_node(fwnode, child) { if (is_acpi_data_node(child)) { if (acpi_data_node_match(child, childname)) return child; continue; } if (!strncmp(acpi_device_bid(to_acpi_device_node(child)), childname, ACPI_NAMESEG_SIZE)) return child; } return NULL; } static int acpi_get_ref_args(struct fwnode_reference_args *args, struct fwnode_handle *ref_fwnode, const union acpi_object **element, const union acpi_object *end, size_t num_args) { u32 nargs = 0, i; /* * Assume the following integer elements are all args. Stop counting on * the first reference (possibly represented as a string) or end of the * package arguments. In case of neither reference, nor integer, return * an error, we can't parse it. 
*/ for (i = 0; (*element) + i < end && i < num_args; i++) { acpi_object_type type = (*element)[i].type; if (type == ACPI_TYPE_LOCAL_REFERENCE || type == ACPI_TYPE_STRING) break; if (type == ACPI_TYPE_INTEGER) nargs++; else return -EINVAL; } if (nargs > NR_FWNODE_REFERENCE_ARGS) return -EINVAL; if (args) { args->fwnode = ref_fwnode; args->nargs = nargs; for (i = 0; i < nargs; i++) args->args[i] = (*element)[i].integer.value; } (*element) += nargs; return 0; } static struct fwnode_handle *acpi_parse_string_ref(const struct fwnode_handle *fwnode, const char *refstring) { acpi_handle scope, handle; struct acpi_data_node *dn; struct acpi_device *device; acpi_status status; if (is_acpi_device_node(fwnode)) { scope = to_acpi_device_node(fwnode)->handle; } else if (is_acpi_data_node(fwnode)) { scope = to_acpi_data_node(fwnode)->handle; } else { pr_debug("Bad node type for node %pfw\n", fwnode); return NULL; } status = acpi_get_handle(scope, refstring, &handle); if (ACPI_FAILURE(status)) { acpi_handle_debug(scope, "Unable to get an ACPI handle for %s\n", refstring); return NULL; } device = acpi_fetch_acpi_dev(handle); if (device) return acpi_fwnode_handle(device); status = acpi_get_data_full(handle, acpi_nondev_subnode_tag, (void **)&dn, NULL); if (ACPI_FAILURE(status) || !dn) { acpi_handle_debug(handle, "Subnode not found\n"); return NULL; } return &dn->fwnode; } /** * __acpi_node_get_property_reference - returns handle to the referenced object * @fwnode: Firmware node to get the property from * @propname: Name of the property * @index: Index of the reference to return * @num_args: Maximum number of arguments after each reference * @args: Location to store the returned reference with optional arguments * (may be NULL) * * Find property with @name, verifify that it is a package containing at least * one object reference and if so, store the ACPI device object pointer to the * target object in @args->adev. 
If the reference includes arguments, store * them in the @args->args[] array. * * If there's more than one reference in the property value package, @index is * used to select the one to return. * * It is possible to leave holes in the property value set like in the * example below: * * Package () { * "cs-gpios", * Package () { * ^GPIO, 19, 0, 0, * ^GPIO, 20, 0, 0, * 0, * ^GPIO, 21, 0, 0, * } * } * * Calling this function with index %2 or index %3 return %-ENOENT. If the * property does not contain any more values %-ENOENT is returned. The NULL * entry must be single integer and preferably contain value %0. * * Return: %0 on success, negative error code on failure. */ int __acpi_node_get_property_reference(const struct fwnode_handle *fwnode, const char *propname, size_t index, size_t num_args, struct fwnode_reference_args *args) { const union acpi_object *element, *end; const union acpi_object *obj; const struct acpi_device_data *data; struct fwnode_handle *ref_fwnode; struct acpi_device *device; int ret, idx = 0; data = acpi_device_data_of_node(fwnode); if (!data) return -ENOENT; ret = acpi_data_get_property(data, propname, ACPI_TYPE_ANY, &obj); if (ret) return ret == -EINVAL ? -ENOENT : -EINVAL; switch (obj->type) { case ACPI_TYPE_LOCAL_REFERENCE: /* Plain single reference without arguments. */ if (index) return -ENOENT; device = acpi_fetch_acpi_dev(obj->reference.handle); if (!device) return -EINVAL; if (!args) return 0; args->fwnode = acpi_fwnode_handle(device); args->nargs = 0; return 0; case ACPI_TYPE_STRING: if (index) return -ENOENT; ref_fwnode = acpi_parse_string_ref(fwnode, obj->string.pointer); if (!ref_fwnode) return -EINVAL; args->fwnode = ref_fwnode; args->nargs = 0; return 0; case ACPI_TYPE_PACKAGE: /* * If it is not a single reference, then it is a package of * references, followed by number of ints as follows: * * Package () { REF, INT, REF, INT, INT } * * Here, REF may be either a local reference or a string. 
The * index argument is then used to determine which reference the * caller wants (along with the arguments). */ break; default: return -EINVAL; } if (index >= obj->package.count) return -ENOENT; element = obj->package.elements; end = element + obj->package.count; while (element < end) { switch (element->type) { case ACPI_TYPE_LOCAL_REFERENCE: device = acpi_fetch_acpi_dev(element->reference.handle); if (!device) return -EINVAL; element++; ret = acpi_get_ref_args(idx == index ? args : NULL, acpi_fwnode_handle(device), &element, end, num_args); if (ret < 0) return ret; if (idx == index) return 0; break; case ACPI_TYPE_STRING: ref_fwnode = acpi_parse_string_ref(fwnode, element->string.pointer); if (!ref_fwnode) return -EINVAL; element++; ret = acpi_get_ref_args(idx == index ? args : NULL, ref_fwnode, &element, end, num_args); if (ret < 0) return ret; if (idx == index) return 0; break; case ACPI_TYPE_INTEGER: if (idx == index) return -ENOENT; element++; break; default: return -EINVAL; } idx++; } return -ENOENT; } EXPORT_SYMBOL_GPL(__acpi_node_get_property_reference); static int acpi_data_prop_read_single(const struct acpi_device_data *data, const char *propname, enum dev_prop_type proptype, void *val) { const union acpi_object *obj; int ret = 0; if (proptype >= DEV_PROP_U8 && proptype <= DEV_PROP_U64) ret = acpi_data_get_property(data, propname, ACPI_TYPE_INTEGER, &obj); else if (proptype == DEV_PROP_STRING) ret = acpi_data_get_property(data, propname, ACPI_TYPE_STRING, &obj); if (ret) return ret; switch (proptype) { case DEV_PROP_U8: if (obj->integer.value > U8_MAX) return -EOVERFLOW; if (val) *(u8 *)val = obj->integer.value; break; case DEV_PROP_U16: if (obj->integer.value > U16_MAX) return -EOVERFLOW; if (val) *(u16 *)val = obj->integer.value; break; case DEV_PROP_U32: if (obj->integer.value > U32_MAX) return -EOVERFLOW; if (val) *(u32 *)val = obj->integer.value; break; case DEV_PROP_U64: if (val) *(u64 *)val = obj->integer.value; break; case DEV_PROP_STRING: if 
(val) *(char **)val = obj->string.pointer; return 1; default: return -EINVAL; } /* When no storage provided return number of available values */ return val ? 0 : 1; } #define acpi_copy_property_array_uint(items, val, nval) \ ({ \ typeof(items) __items = items; \ typeof(val) __val = val; \ typeof(nval) __nval = nval; \ size_t i; \ int ret = 0; \ \ for (i = 0; i < __nval; i++) { \ if (__items->type == ACPI_TYPE_BUFFER) { \ __val[i] = __items->buffer.pointer[i]; \ continue; \ } \ if (__items[i].type != ACPI_TYPE_INTEGER) { \ ret = -EPROTO; \ break; \ } \ if (__items[i].integer.value > _Generic(__val, \ u8 *: U8_MAX, \ u16 *: U16_MAX, \ u32 *: U32_MAX, \ u64 *: U64_MAX)) { \ ret = -EOVERFLOW; \ break; \ } \ \ __val[i] = __items[i].integer.value; \ } \ ret; \ }) static int acpi_copy_property_array_string(const union acpi_object *items, char **val, size_t nval) { int i; for (i = 0; i < nval; i++) { if (items[i].type != ACPI_TYPE_STRING) return -EPROTO; val[i] = items[i].string.pointer; } return nval; } static int acpi_data_prop_read(const struct acpi_device_data *data, const char *propname, enum dev_prop_type proptype, void *val, size_t nval) { const union acpi_object *obj; const union acpi_object *items; int ret; if (nval == 1 || !val) { ret = acpi_data_prop_read_single(data, propname, proptype, val); /* * The overflow error means that the property is there and it is * single-value, but its type does not match, so return. */ if (ret >= 0 || ret == -EOVERFLOW) return ret; /* * Reading this property as a single-value one failed, but its * value may still be represented as one-element array, so * continue. 
*/ } ret = acpi_data_get_property_array(data, propname, ACPI_TYPE_ANY, &obj); if (ret && proptype >= DEV_PROP_U8 && proptype <= DEV_PROP_U64) ret = acpi_data_get_property(data, propname, ACPI_TYPE_BUFFER, &obj); if (ret) return ret; if (!val) { if (obj->type == ACPI_TYPE_BUFFER) return obj->buffer.length; return obj->package.count; } switch (proptype) { case DEV_PROP_STRING: break; default: if (obj->type == ACPI_TYPE_BUFFER) { if (nval > obj->buffer.length) return -EOVERFLOW; } else { if (nval > obj->package.count) return -EOVERFLOW; } break; } if (obj->type == ACPI_TYPE_BUFFER) { if (proptype != DEV_PROP_U8) return -EPROTO; items = obj; } else { items = obj->package.elements; } switch (proptype) { case DEV_PROP_U8: ret = acpi_copy_property_array_uint(items, (u8 *)val, nval); break; case DEV_PROP_U16: ret = acpi_copy_property_array_uint(items, (u16 *)val, nval); break; case DEV_PROP_U32: ret = acpi_copy_property_array_uint(items, (u32 *)val, nval); break; case DEV_PROP_U64: ret = acpi_copy_property_array_uint(items, (u64 *)val, nval); break; case DEV_PROP_STRING: nval = min_t(u32, nval, obj->package.count); if (nval == 0) return -ENODATA; ret = acpi_copy_property_array_string(items, (char **)val, nval); break; default: ret = -EINVAL; break; } return ret; } /** * acpi_node_prop_read - retrieve the value of an ACPI property with given name. * @fwnode: Firmware node to get the property from. * @propname: Name of the property. * @proptype: Expected property type. * @val: Location to store the property value (if not %NULL). * @nval: Size of the array pointed to by @val. * * If @val is %NULL, return the number of array elements comprising the value * of the property. Otherwise, read at most @nval values to the array at the * location pointed to by @val. 
/*
 * Child-iteration callback that finds the device following a given one.
 *
 * @data points to a device pointer used as a cursor. While the cursor is
 * non-NULL, children are skipped until the one equal to the cursor is seen,
 * at which point the cursor is cleared. The first child visited with a NULL
 * cursor is stored into it and 1 is returned; in every other case the
 * function returns 0.
 */
static int stop_on_next(struct acpi_device *adev, void *data)
{
	struct acpi_device **cursor = data;
	struct acpi_device *previous = *cursor;

	if (previous) {
		/* Still skipping: clear the cursor once "previous" is reached. */
		if (previous == adev)
			*cursor = NULL;

		return 0;
	}

	*cursor = adev;
	return 1;
}
*/ adev = to_acpi_device_node(fwnode); if (adev) head = &adev->data.subnodes; else if (data) head = &data->data.subnodes; else return NULL; if (list_empty(head)) return NULL; if (child) { dn = to_acpi_data_node(child); next = dn->sibling.next; if (next == head) return NULL; dn = list_entry(next, struct acpi_data_node, sibling); } else { dn = list_first_entry(head, struct acpi_data_node, sibling); } return &dn->fwnode; } return NULL; } /** * acpi_node_get_parent - Return parent fwnode of this fwnode * @fwnode: Firmware node whose parent to get * * Returns parent node of an ACPI device or data firmware node or %NULL if * not available. */ static struct fwnode_handle * acpi_node_get_parent(const struct fwnode_handle *fwnode) { if (is_acpi_data_node(fwnode)) { /* All data nodes have parent pointer so just return that */ return to_acpi_data_node(fwnode)->parent; } if (is_acpi_device_node(fwnode)) { struct acpi_device *parent; parent = acpi_dev_parent(to_acpi_device_node(fwnode)); if (parent) return acpi_fwnode_handle(parent); } return NULL; } /* * Return true if the node is an ACPI graph node. Called on either ports * or endpoints. */ static bool is_acpi_graph_node(struct fwnode_handle *fwnode, const char *str) { unsigned int len = strlen(str); const char *name; if (!len || !is_acpi_data_node(fwnode)) return false; name = to_acpi_data_node(fwnode)->name; return (fwnode_property_present(fwnode, "reg") && !strncmp(name, str, len) && name[len] == '@') || fwnode_property_present(fwnode, str); } /** * acpi_graph_get_next_endpoint - Get next endpoint ACPI firmware node * @fwnode: Pointer to the parent firmware node * @prev: Previous endpoint node or %NULL to get the first * * Looks up next endpoint ACPI firmware node below a given @fwnode. Returns * %NULL if there is no next endpoint or in case of error. In case of success * the next endpoint is returned. 
/**
 * acpi_graph_get_next_endpoint - Get next endpoint ACPI firmware node
 * @fwnode: Pointer to the parent firmware node
 * @prev: Previous endpoint node or %NULL to get the first
 *
 * Walks the port nodes below @fwnode and the children of each port to find
 * the endpoint following @prev. Returns the next endpoint, or %NULL if there
 * is none or the candidate found is not an endpoint node.
 */
static struct fwnode_handle *acpi_graph_get_next_endpoint(
	const struct fwnode_handle *fwnode, struct fwnode_handle *prev)
{
	struct fwnode_handle *port = NULL;
	struct fwnode_handle *endpoint;

	if (prev) {
		/* Resume inside the port that owns the previous endpoint. */
		port = fwnode_get_parent(prev);
	} else {
		/*
		 * The names of the port nodes begin with "port@" followed by
		 * the number of the port node and they also have a "reg"
		 * property that also has the number of the port node. For
		 * compatibility reasons a node is also recognised as a port
		 * node from the "port" property.
		 */
		for (;;) {
			port = fwnode_get_next_child_node(fwnode, port);
			if (!port || is_acpi_graph_node(port, "port"))
				break;
		}
	}

	if (!port)
		return NULL;

	/* Move on to following ports once the current one is exhausted. */
	for (endpoint = fwnode_get_next_child_node(port, prev); !endpoint; ) {
		port = fwnode_get_next_child_node(fwnode, port);
		if (!port)
			break;

		if (is_acpi_graph_node(port, "port"))
			endpoint = fwnode_get_next_child_node(port, NULL);
	}

	/*
	 * The names of the endpoint nodes begin with "endpoint@" followed by
	 * the number of the endpoint node and they also have a "reg" property
	 * that also has the number of the endpoint node. For compatibility
	 * reasons a node is also recognised as an endpoint node from the
	 * "endpoint" property.
	 */
	return is_acpi_graph_node(endpoint, "endpoint") ? endpoint : NULL;
}
*/ static struct fwnode_handle *acpi_graph_get_child_prop_value( const struct fwnode_handle *fwnode, const char *prop_name, unsigned int val) { struct fwnode_handle *child; fwnode_for_each_child_node(fwnode, child) { u32 nr; if (fwnode_property_read_u32(child, prop_name, &nr)) continue; if (val == nr) return child; } return NULL; } /** * acpi_graph_get_remote_endpoint - Parses and returns remote end of an endpoint * @__fwnode: Endpoint firmware node pointing to a remote device * * Returns the remote endpoint corresponding to @__fwnode. NULL on error. */ static struct fwnode_handle * acpi_graph_get_remote_endpoint(const struct fwnode_handle *__fwnode) { struct fwnode_handle *fwnode; unsigned int port_nr, endpoint_nr; struct fwnode_reference_args args; int ret; memset(&args, 0, sizeof(args)); ret = acpi_node_get_property_reference(__fwnode, "remote-endpoint", 0, &args); if (ret) return NULL; /* Direct endpoint reference? */ if (!is_acpi_device_node(args.fwnode)) return args.nargs ? NULL : args.fwnode; /* * Always require two arguments with the reference: port and * endpoint indices. 
*/ if (args.nargs != 2) return NULL; fwnode = args.fwnode; port_nr = args.args[0]; endpoint_nr = args.args[1]; fwnode = acpi_graph_get_child_prop_value(fwnode, "port", port_nr); return acpi_graph_get_child_prop_value(fwnode, "endpoint", endpoint_nr); } static bool acpi_fwnode_device_is_available(const struct fwnode_handle *fwnode) { if (!is_acpi_device_node(fwnode)) return true; return acpi_device_is_present(to_acpi_device_node(fwnode)); } static const void * acpi_fwnode_device_get_match_data(const struct fwnode_handle *fwnode, const struct device *dev) { return acpi_device_get_match_data(dev); } static bool acpi_fwnode_device_dma_supported(const struct fwnode_handle *fwnode) { return acpi_dma_supported(to_acpi_device_node(fwnode)); } static enum dev_dma_attr acpi_fwnode_device_get_dma_attr(const struct fwnode_handle *fwnode) { return acpi_get_dma_attr(to_acpi_device_node(fwnode)); } static bool acpi_fwnode_property_present(const struct fwnode_handle *fwnode, const char *propname) { return !acpi_node_prop_get(fwnode, propname, NULL); } static int acpi_fwnode_property_read_int_array(const struct fwnode_handle *fwnode, const char *propname, unsigned int elem_size, void *val, size_t nval) { enum dev_prop_type type; switch (elem_size) { case sizeof(u8): type = DEV_PROP_U8; break; case sizeof(u16): type = DEV_PROP_U16; break; case sizeof(u32): type = DEV_PROP_U32; break; case sizeof(u64): type = DEV_PROP_U64; break; default: return -ENXIO; } return acpi_node_prop_read(fwnode, propname, type, val, nval); } static int acpi_fwnode_property_read_string_array(const struct fwnode_handle *fwnode, const char *propname, const char **val, size_t nval) { return acpi_node_prop_read(fwnode, propname, DEV_PROP_STRING, val, nval); } static int acpi_fwnode_get_reference_args(const struct fwnode_handle *fwnode, const char *prop, const char *nargs_prop, unsigned int args_count, unsigned int index, struct fwnode_reference_args *args) { return __acpi_node_get_property_reference(fwnode, 
prop, index, args_count, args); } static const char *acpi_fwnode_get_name(const struct fwnode_handle *fwnode) { const struct acpi_device *adev; struct fwnode_handle *parent; /* Is this the root node? */ parent = fwnode_get_parent(fwnode); if (!parent) return "\\"; fwnode_handle_put(parent); if (is_acpi_data_node(fwnode)) { const struct acpi_data_node *dn = to_acpi_data_node(fwnode); return dn->name; } adev = to_acpi_device_node(fwnode); if (WARN_ON(!adev)) return NULL; return acpi_device_bid(adev); } static const char * acpi_fwnode_get_name_prefix(const struct fwnode_handle *fwnode) { struct fwnode_handle *parent; /* Is this the root node? */ parent = fwnode_get_parent(fwnode); if (!parent) return ""; /* Is this 2nd node from the root? */ parent = fwnode_get_next_parent(parent); if (!parent) return ""; fwnode_handle_put(parent); /* ACPI device or data node. */ return "."; } static struct fwnode_handle * acpi_fwnode_get_parent(struct fwnode_handle *fwnode) { return acpi_node_get_parent(fwnode); } static int acpi_fwnode_graph_parse_endpoint(const struct fwnode_handle *fwnode, struct fwnode_endpoint *endpoint) { struct fwnode_handle *port_fwnode = fwnode_get_parent(fwnode); endpoint->local_fwnode = fwnode; if (fwnode_property_read_u32(port_fwnode, "reg", &endpoint->port)) fwnode_property_read_u32(port_fwnode, "port", &endpoint->port); if (fwnode_property_read_u32(fwnode, "reg", &endpoint->id)) fwnode_property_read_u32(fwnode, "endpoint", &endpoint->id); return 0; } static int acpi_fwnode_irq_get(const struct fwnode_handle *fwnode, unsigned int index) { struct resource res; int ret; ret = acpi_irq_get(ACPI_HANDLE_FWNODE(fwnode), index, &res); if (ret) return ret; return res.start; } #define DECLARE_ACPI_FWNODE_OPS(ops) \ const struct fwnode_operations ops = { \ .device_is_available = acpi_fwnode_device_is_available, \ .device_get_match_data = acpi_fwnode_device_get_match_data, \ .device_dma_supported = \ acpi_fwnode_device_dma_supported, \ .device_get_dma_attr = 
acpi_fwnode_device_get_dma_attr, \ .property_present = acpi_fwnode_property_present, \ .property_read_bool = acpi_fwnode_property_present, \ .property_read_int_array = \ acpi_fwnode_property_read_int_array, \ .property_read_string_array = \ acpi_fwnode_property_read_string_array, \ .get_parent = acpi_node_get_parent, \ .get_next_child_node = acpi_get_next_subnode, \ .get_named_child_node = acpi_fwnode_get_named_child_node, \ .get_name = acpi_fwnode_get_name, \ .get_name_prefix = acpi_fwnode_get_name_prefix, \ .get_reference_args = acpi_fwnode_get_reference_args, \ .graph_get_next_endpoint = \ acpi_graph_get_next_endpoint, \ .graph_get_remote_endpoint = \ acpi_graph_get_remote_endpoint, \ .graph_get_port_parent = acpi_fwnode_get_parent, \ .graph_parse_endpoint = acpi_fwnode_graph_parse_endpoint, \ .irq_get = acpi_fwnode_irq_get, \ }; \ EXPORT_SYMBOL_GPL(ops) DECLARE_ACPI_FWNODE_OPS(acpi_device_fwnode_ops); DECLARE_ACPI_FWNODE_OPS(acpi_data_fwnode_ops); const struct fwnode_operations acpi_static_fwnode_ops; bool is_acpi_device_node(const struct fwnode_handle *fwnode) { return !IS_ERR_OR_NULL(fwnode) && fwnode->ops == &acpi_device_fwnode_ops; } EXPORT_SYMBOL(is_acpi_device_node); bool is_acpi_data_node(const struct fwnode_handle *fwnode) { return !IS_ERR_OR_NULL(fwnode) && fwnode->ops == &acpi_data_fwnode_ops; } EXPORT_SYMBOL(is_acpi_data_node); |
16 13 16 14 13 97 30 63 46 99 99 99 100 100 98 99 33 74 19 89 3 15 72 71 72 10 72 11 60 61 71 71 73 29 29 23 6 43 47 47 46 47 47 11 11 4 1 3 2 2 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 | // 
SPDX-License-Identifier: GPL-2.0-or-later /* * IPV6 GSO/GRO offload support * Linux INET6 implementation */ #include <linux/kernel.h> #include <linux/socket.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/printk.h> #include <net/protocol.h> #include <net/ipv6.h> #include <net/inet_common.h> #include <net/tcp.h> #include <net/udp.h> #include <net/gro.h> #include <net/gso.h> #include "ip6_offload.h" /* All GRO functions are always builtin, except UDP over ipv6, which lays in * ipv6 module, as it depends on UDPv6 lookup function, so we need special care * when ipv6 is built as a module */ #if IS_BUILTIN(CONFIG_IPV6) #define INDIRECT_CALL_L4(f, f2, f1, ...) INDIRECT_CALL_2(f, f2, f1, __VA_ARGS__) #else #define INDIRECT_CALL_L4(f, f2, f1, ...) INDIRECT_CALL_1(f, f2, __VA_ARGS__) #endif #define indirect_call_gro_receive_l4(f2, f1, cb, head, skb) \ ({ \ unlikely(gro_recursion_inc_test(skb)) ? \ NAPI_GRO_CB(skb)->flush |= 1, NULL : \ INDIRECT_CALL_L4(cb, f2, f1, head, skb); \ }) static int ipv6_gro_pull_exthdrs(struct sk_buff *skb, int off, int proto) { const struct net_offload *ops = NULL; struct ipv6_opt_hdr *opth; for (;;) { int len; ops = rcu_dereference(inet6_offloads[proto]); if (unlikely(!ops)) break; if (!(ops->flags & INET6_PROTO_GSO_EXTHDR)) break; opth = skb_gro_header(skb, off + sizeof(*opth), off); if (unlikely(!opth)) break; len = ipv6_optlen(opth); opth = skb_gro_header(skb, off + len, off); if (unlikely(!opth)) break; proto = opth->nexthdr; off += len; } skb_gro_pull(skb, off - skb_gro_receive_network_offset(skb)); return proto; } static int ipv6_gso_pull_exthdrs(struct sk_buff *skb, int proto) { const struct net_offload *ops = NULL; for (;;) { struct ipv6_opt_hdr *opth; int len; ops = rcu_dereference(inet6_offloads[proto]); if (unlikely(!ops)) break; if (!(ops->flags & INET6_PROTO_GSO_EXTHDR)) break; if (unlikely(!pskb_may_pull(skb, 8))) break; opth = (void *)skb->data; len = ipv6_optlen(opth); if (unlikely(!pskb_may_pull(skb, 
len))) break; opth = (void *)skb->data; proto = opth->nexthdr; __skb_pull(skb, len); } return proto; } static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, netdev_features_t features) { struct sk_buff *segs = ERR_PTR(-EINVAL); struct ipv6hdr *ipv6h; const struct net_offload *ops; int proto, err; struct frag_hdr *fptr; unsigned int payload_len; u8 *prevhdr; int offset = 0; bool encap, udpfrag; int nhoff; bool gso_partial; skb_reset_network_header(skb); err = ipv6_hopopt_jumbo_remove(skb); if (err) return ERR_PTR(err); nhoff = skb_network_header(skb) - skb_mac_header(skb); if (unlikely(!pskb_may_pull(skb, sizeof(*ipv6h)))) goto out; encap = SKB_GSO_CB(skb)->encap_level > 0; if (encap) features &= skb->dev->hw_enc_features; SKB_GSO_CB(skb)->encap_level += sizeof(*ipv6h); ipv6h = ipv6_hdr(skb); __skb_pull(skb, sizeof(*ipv6h)); segs = ERR_PTR(-EPROTONOSUPPORT); proto = ipv6_gso_pull_exthdrs(skb, ipv6h->nexthdr); if (skb->encapsulation && skb_shinfo(skb)->gso_type & (SKB_GSO_IPXIP4 | SKB_GSO_IPXIP6)) udpfrag = proto == IPPROTO_UDP && encap && (skb_shinfo(skb)->gso_type & SKB_GSO_UDP); else udpfrag = proto == IPPROTO_UDP && !skb->encapsulation && (skb_shinfo(skb)->gso_type & SKB_GSO_UDP); ops = rcu_dereference(inet6_offloads[proto]); if (likely(ops && ops->callbacks.gso_segment)) { skb_reset_transport_header(skb); segs = ops->callbacks.gso_segment(skb, features); if (!segs) skb->network_header = skb_mac_header(skb) + nhoff - skb->head; } if (IS_ERR_OR_NULL(segs)) goto out; gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL); for (skb = segs; skb; skb = skb->next) { ipv6h = (struct ipv6hdr *)(skb_mac_header(skb) + nhoff); if (gso_partial && skb_is_gso(skb)) payload_len = skb_shinfo(skb)->gso_size + SKB_GSO_CB(skb)->data_offset + skb->head - (unsigned char *)(ipv6h + 1); else payload_len = skb->len - nhoff - sizeof(*ipv6h); ipv6h->payload_len = htons(payload_len); skb->network_header = (u8 *)ipv6h - skb->head; skb_reset_mac_len(skb); if (udpfrag) { int 
err = ip6_find_1stfragopt(skb, &prevhdr); if (err < 0) { kfree_skb_list(segs); return ERR_PTR(err); } fptr = (struct frag_hdr *)((u8 *)ipv6h + err); fptr->frag_off = htons(offset); if (skb->next) fptr->frag_off |= htons(IP6_MF); offset += (ntohs(ipv6h->payload_len) - sizeof(struct frag_hdr)); } if (encap) skb_reset_inner_headers(skb); } out: return segs; } /* Return the total length of all the extension hdrs, following the same * logic in ipv6_gso_pull_exthdrs() when parsing ext-hdrs. */ static int ipv6_exthdrs_len(struct ipv6hdr *iph, const struct net_offload **opps) { struct ipv6_opt_hdr *opth = (void *)iph; int len = 0, proto, optlen = sizeof(*iph); proto = iph->nexthdr; for (;;) { *opps = rcu_dereference(inet6_offloads[proto]); if (unlikely(!(*opps))) break; if (!((*opps)->flags & INET6_PROTO_GSO_EXTHDR)) break; opth = (void *)opth + optlen; optlen = ipv6_optlen(opth); len += optlen; proto = opth->nexthdr; } return len; } INDIRECT_CALLABLE_SCOPE struct sk_buff *ipv6_gro_receive(struct list_head *head, struct sk_buff *skb) { const struct net_offload *ops; struct sk_buff *pp = NULL; struct sk_buff *p; struct ipv6hdr *iph; unsigned int nlen; unsigned int hlen; unsigned int off; u16 flush = 1; int proto; off = skb_gro_offset(skb); hlen = off + sizeof(*iph); iph = skb_gro_header(skb, hlen, off); if (unlikely(!iph)) goto out; NAPI_GRO_CB(skb)->network_offsets[NAPI_GRO_CB(skb)->encap_mark] = off; flush += ntohs(iph->payload_len) != skb->len - hlen; proto = iph->nexthdr; ops = rcu_dereference(inet6_offloads[proto]); if (!ops || !ops->callbacks.gro_receive) { proto = ipv6_gro_pull_exthdrs(skb, hlen, proto); ops = rcu_dereference(inet6_offloads[proto]); if (!ops || !ops->callbacks.gro_receive) goto out; iph = skb_gro_network_header(skb); } else { skb_gro_pull(skb, sizeof(*iph)); } skb_set_transport_header(skb, skb_gro_offset(skb)); NAPI_GRO_CB(skb)->proto = proto; flush--; nlen = skb_gro_offset(skb) - off; list_for_each_entry(p, head, list) { const struct ipv6hdr *iph2; 
__be32 first_word; /* <Version:4><Traffic_Class:8><Flow_Label:20> */ if (!NAPI_GRO_CB(p)->same_flow) continue; iph2 = (struct ipv6hdr *)(p->data + off); first_word = *(__be32 *)iph ^ *(__be32 *)iph2; /* All fields must match except length and Traffic Class. * XXX skbs on the gro_list have all been parsed and pulled * already so we don't need to compare nlen * (nlen != (sizeof(*iph2) + ipv6_exthdrs_len(iph2, &ops))) * memcmp() alone below is sufficient, right? */ if ((first_word & htonl(0xF00FFFFF)) || !ipv6_addr_equal(&iph->saddr, &iph2->saddr) || !ipv6_addr_equal(&iph->daddr, &iph2->daddr) || iph->nexthdr != iph2->nexthdr) { not_same_flow: NAPI_GRO_CB(p)->same_flow = 0; continue; } if (unlikely(nlen > sizeof(struct ipv6hdr))) { if (memcmp(iph + 1, iph2 + 1, nlen - sizeof(struct ipv6hdr))) goto not_same_flow; } } NAPI_GRO_CB(skb)->flush |= flush; skb_gro_postpull_rcsum(skb, iph, nlen); pp = indirect_call_gro_receive_l4(tcp6_gro_receive, udp6_gro_receive, ops->callbacks.gro_receive, head, skb); out: skb_gro_flush_final(skb, pp, flush); return pp; } static struct sk_buff *sit_ip6ip6_gro_receive(struct list_head *head, struct sk_buff *skb) { /* Common GRO receive for SIT and IP6IP6 */ if (NAPI_GRO_CB(skb)->encap_mark) { NAPI_GRO_CB(skb)->flush = 1; return NULL; } NAPI_GRO_CB(skb)->encap_mark = 1; return ipv6_gro_receive(head, skb); } static struct sk_buff *ip4ip6_gro_receive(struct list_head *head, struct sk_buff *skb) { /* Common GRO receive for SIT and IP6IP6 */ if (NAPI_GRO_CB(skb)->encap_mark) { NAPI_GRO_CB(skb)->flush = 1; return NULL; } NAPI_GRO_CB(skb)->encap_mark = 1; return inet_gro_receive(head, skb); } INDIRECT_CALLABLE_SCOPE int ipv6_gro_complete(struct sk_buff *skb, int nhoff) { const struct net_offload *ops; struct ipv6hdr *iph; int err = -ENOSYS; u32 payload_len; if (skb->encapsulation) { skb_set_inner_protocol(skb, cpu_to_be16(ETH_P_IPV6)); skb_set_inner_network_header(skb, nhoff); } payload_len = skb->len - nhoff - sizeof(*iph); if 
(unlikely(payload_len > IPV6_MAXPLEN)) { struct hop_jumbo_hdr *hop_jumbo; int hoplen = sizeof(*hop_jumbo); /* Move network header left */ memmove(skb_mac_header(skb) - hoplen, skb_mac_header(skb), skb->transport_header - skb->mac_header); skb->data -= hoplen; skb->len += hoplen; skb->mac_header -= hoplen; skb->network_header -= hoplen; iph = (struct ipv6hdr *)(skb->data + nhoff); hop_jumbo = (struct hop_jumbo_hdr *)(iph + 1); /* Build hop-by-hop options */ hop_jumbo->nexthdr = iph->nexthdr; hop_jumbo->hdrlen = 0; hop_jumbo->tlv_type = IPV6_TLV_JUMBO; hop_jumbo->tlv_len = 4; hop_jumbo->jumbo_payload_len = htonl(payload_len + hoplen); iph->nexthdr = NEXTHDR_HOP; iph->payload_len = 0; } else { iph = (struct ipv6hdr *)(skb->data + nhoff); iph->payload_len = htons(payload_len); } nhoff += sizeof(*iph) + ipv6_exthdrs_len(iph, &ops); if (WARN_ON(!ops || !ops->callbacks.gro_complete)) goto out; err = INDIRECT_CALL_L4(ops->callbacks.gro_complete, tcp6_gro_complete, udp6_gro_complete, skb, nhoff); out: return err; } static int sit_gro_complete(struct sk_buff *skb, int nhoff) { skb->encapsulation = 1; skb_shinfo(skb)->gso_type |= SKB_GSO_IPXIP4; return ipv6_gro_complete(skb, nhoff); } static int ip6ip6_gro_complete(struct sk_buff *skb, int nhoff) { skb->encapsulation = 1; skb_shinfo(skb)->gso_type |= SKB_GSO_IPXIP6; return ipv6_gro_complete(skb, nhoff); } static int ip4ip6_gro_complete(struct sk_buff *skb, int nhoff) { skb->encapsulation = 1; skb_shinfo(skb)->gso_type |= SKB_GSO_IPXIP6; return inet_gro_complete(skb, nhoff); } static struct sk_buff *sit_gso_segment(struct sk_buff *skb, netdev_features_t features) { if (!(skb_shinfo(skb)->gso_type & SKB_GSO_IPXIP4)) return ERR_PTR(-EINVAL); return ipv6_gso_segment(skb, features); } static struct sk_buff *ip4ip6_gso_segment(struct sk_buff *skb, netdev_features_t features) { if (!(skb_shinfo(skb)->gso_type & SKB_GSO_IPXIP6)) return ERR_PTR(-EINVAL); return inet_gso_segment(skb, features); } static struct sk_buff 
*ip6ip6_gso_segment(struct sk_buff *skb, netdev_features_t features) { if (!(skb_shinfo(skb)->gso_type & SKB_GSO_IPXIP6)) return ERR_PTR(-EINVAL); return ipv6_gso_segment(skb, features); } static const struct net_offload sit_offload = { .callbacks = { .gso_segment = sit_gso_segment, .gro_receive = sit_ip6ip6_gro_receive, .gro_complete = sit_gro_complete, }, }; static const struct net_offload ip4ip6_offload = { .callbacks = { .gso_segment = ip4ip6_gso_segment, .gro_receive = ip4ip6_gro_receive, .gro_complete = ip4ip6_gro_complete, }, }; static const struct net_offload ip6ip6_offload = { .callbacks = { .gso_segment = ip6ip6_gso_segment, .gro_receive = sit_ip6ip6_gro_receive, .gro_complete = ip6ip6_gro_complete, }, }; static int __init ipv6_offload_init(void) { if (tcpv6_offload_init() < 0) pr_crit("%s: Cannot add TCP protocol offload\n", __func__); if (ipv6_exthdrs_offload_init() < 0) pr_crit("%s: Cannot add EXTHDRS protocol offload\n", __func__); net_hotdata.ipv6_packet_offload = (struct packet_offload) { .type = cpu_to_be16(ETH_P_IPV6), .callbacks = { .gso_segment = ipv6_gso_segment, .gro_receive = ipv6_gro_receive, .gro_complete = ipv6_gro_complete, }, }; dev_add_offload(&net_hotdata.ipv6_packet_offload); inet_add_offload(&sit_offload, IPPROTO_IPV6); inet6_add_offload(&ip6ip6_offload, IPPROTO_IPV6); inet6_add_offload(&ip4ip6_offload, IPPROTO_IPIP); return 0; } fs_initcall(ipv6_offload_init); |
10 6 6 4 4 6 6 6 6 6 4 4 27 4 4 29 29 29 2 27 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 | // SPDX-License-Identifier: GPL-2.0-or-later /* * IPv6 Syncookies implementation for the Linux kernel * * Authors: * Glenn Griffin <ggriffin.kernel@gmail.com> * * Based on IPv4 implementation by Andi Kleen * linux/net/ipv4/syncookies.c */ #include <linux/tcp.h> #include <linux/random.h> #include <linux/siphash.h> #include <linux/kernel.h> #include <net/secure_seq.h> #include <net/ipv6.h> #include <net/tcp.h> #define COOKIEBITS 24 /* Upper bits store count */ #define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1) static siphash_aligned_key_t syncookie6_secret[2]; /* RFC 2460, Section 8.3: * [ipv6 tcp] MSS must be computed as the maximum packet size minus 60 [..] * * Due to IPV6_MIN_MTU=1280 the lowest possible MSS is 1220, which allows * using higher values than ipv4 tcp syncookies. * The other values are chosen based on ethernet (1500 and 9k MTU), plus * one that accounts for common encap (PPPoe) overhead. Table must be sorted. 
*/ static __u16 const msstab[] = { 1280 - 60, /* IPV6_MIN_MTU - 60 */ 1480 - 60, 1500 - 60, 9000 - 60, }; static u32 cookie_hash(const struct in6_addr *saddr, const struct in6_addr *daddr, __be16 sport, __be16 dport, u32 count, int c) { const struct { struct in6_addr saddr; struct in6_addr daddr; u32 count; __be16 sport; __be16 dport; } __aligned(SIPHASH_ALIGNMENT) combined = { .saddr = *saddr, .daddr = *daddr, .count = count, .sport = sport, .dport = dport }; net_get_random_once(syncookie6_secret, sizeof(syncookie6_secret)); return siphash(&combined, offsetofend(typeof(combined), dport), &syncookie6_secret[c]); } static __u32 secure_tcp_syn_cookie(const struct in6_addr *saddr, const struct in6_addr *daddr, __be16 sport, __be16 dport, __u32 sseq, __u32 data) { u32 count = tcp_cookie_time(); return (cookie_hash(saddr, daddr, sport, dport, 0, 0) + sseq + (count << COOKIEBITS) + ((cookie_hash(saddr, daddr, sport, dport, count, 1) + data) & COOKIEMASK)); } static __u32 check_tcp_syn_cookie(__u32 cookie, const struct in6_addr *saddr, const struct in6_addr *daddr, __be16 sport, __be16 dport, __u32 sseq) { __u32 diff, count = tcp_cookie_time(); cookie -= cookie_hash(saddr, daddr, sport, dport, 0, 0) + sseq; diff = (count - (cookie >> COOKIEBITS)) & ((__u32) -1 >> COOKIEBITS); if (diff >= MAX_SYNCOOKIE_AGE) return (__u32)-1; return (cookie - cookie_hash(saddr, daddr, sport, dport, count - diff, 1)) & COOKIEMASK; } u32 __cookie_v6_init_sequence(const struct ipv6hdr *iph, const struct tcphdr *th, __u16 *mssp) { int mssind; const __u16 mss = *mssp; for (mssind = ARRAY_SIZE(msstab) - 1; mssind ; mssind--) if (mss >= msstab[mssind]) break; *mssp = msstab[mssind]; return secure_tcp_syn_cookie(&iph->saddr, &iph->daddr, th->source, th->dest, ntohl(th->seq), mssind); } EXPORT_SYMBOL_GPL(__cookie_v6_init_sequence); __u32 cookie_v6_init_sequence(const struct sk_buff *skb, __u16 *mssp) { const struct ipv6hdr *iph = ipv6_hdr(skb); const struct tcphdr *th = tcp_hdr(skb); return 
__cookie_v6_init_sequence(iph, th, mssp); } int __cookie_v6_check(const struct ipv6hdr *iph, const struct tcphdr *th) { __u32 cookie = ntohl(th->ack_seq) - 1; __u32 seq = ntohl(th->seq) - 1; __u32 mssind; mssind = check_tcp_syn_cookie(cookie, &iph->saddr, &iph->daddr, th->source, th->dest, seq); return mssind < ARRAY_SIZE(msstab) ? msstab[mssind] : 0; } EXPORT_SYMBOL_GPL(__cookie_v6_check); static struct request_sock *cookie_tcp_check(struct net *net, struct sock *sk, struct sk_buff *skb) { struct tcp_options_received tcp_opt; u32 tsoff = 0; int mss; if (tcp_synq_no_recent_overflow(sk)) goto out; mss = __cookie_v6_check(ipv6_hdr(skb), tcp_hdr(skb)); if (!mss) { __NET_INC_STATS(net, LINUX_MIB_SYNCOOKIESFAILED); goto out; } __NET_INC_STATS(net, LINUX_MIB_SYNCOOKIESRECV); /* check for timestamp cookie support */ memset(&tcp_opt, 0, sizeof(tcp_opt)); tcp_parse_options(net, skb, &tcp_opt, 0, NULL); if (tcp_opt.saw_tstamp && tcp_opt.rcv_tsecr) { tsoff = secure_tcpv6_ts_off(net, ipv6_hdr(skb)->daddr.s6_addr32, ipv6_hdr(skb)->saddr.s6_addr32); tcp_opt.rcv_tsecr -= tsoff; } if (!cookie_timestamp_decode(net, &tcp_opt)) goto out; return cookie_tcp_reqsk_alloc(&tcp6_request_sock_ops, sk, skb, &tcp_opt, mss, tsoff); out: return ERR_PTR(-EINVAL); } struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) { const struct tcphdr *th = tcp_hdr(skb); struct ipv6_pinfo *np = inet6_sk(sk); struct tcp_sock *tp = tcp_sk(sk); struct inet_request_sock *ireq; struct net *net = sock_net(sk); struct request_sock *req; struct dst_entry *dst; struct sock *ret = sk; __u8 rcv_wscale; int full_space; SKB_DR(reason); if (!READ_ONCE(net->ipv4.sysctl_tcp_syncookies) || !th->ack || th->rst) goto out; if (cookie_bpf_ok(skb)) { req = cookie_bpf_check(sk, skb); } else { req = cookie_tcp_check(net, sk, skb); if (IS_ERR(req)) goto out; } if (!req) { SKB_DR_SET(reason, NO_SOCKET); goto out_drop; } ireq = inet_rsk(req); ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr; ireq->ir_v6_loc_addr = 
ipv6_hdr(skb)->daddr; if (security_inet_conn_request(sk, skb, req)) { SKB_DR_SET(reason, SECURITY_HOOK); goto out_free; } if (ipv6_opt_accepted(sk, skb, &TCP_SKB_CB(skb)->header.h6) || np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) { refcount_inc(&skb->users); ireq->pktopts = skb; } /* So that link locals have meaning */ if (!sk->sk_bound_dev_if && ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL) ireq->ir_iif = tcp_v6_iif(skb); tcp_ao_syncookie(sk, skb, req, AF_INET6); /* * We need to lookup the dst_entry to get the correct window size. * This is taken from tcp_v6_syn_recv_sock. Somebody please enlighten * me if there is a preferred way. */ { struct in6_addr *final_p, final; struct flowi6 fl6; memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_proto = IPPROTO_TCP; fl6.daddr = ireq->ir_v6_rmt_addr; final_p = fl6_update_dst(&fl6, rcu_dereference(np->opt), &final); fl6.saddr = ireq->ir_v6_loc_addr; fl6.flowi6_oif = ireq->ir_iif; fl6.flowi6_mark = ireq->ir_mark; fl6.fl6_dport = ireq->ir_rmt_port; fl6.fl6_sport = inet_sk(sk)->inet_sport; fl6.flowi6_uid = sk->sk_uid; security_req_classify_flow(req, flowi6_to_flowi_common(&fl6)); dst = ip6_dst_lookup_flow(net, sk, &fl6, final_p); if (IS_ERR(dst)) { SKB_DR_SET(reason, IP_OUTNOROUTES); goto out_free; } } req->rsk_window_clamp = READ_ONCE(tp->window_clamp) ? :dst_metric(dst, RTAX_WINDOW); /* limit the window selection if the user enforce a smaller rx buffer */ full_space = tcp_full_space(sk); if (sk->sk_userlocks & SOCK_RCVBUF_LOCK && (req->rsk_window_clamp > full_space || req->rsk_window_clamp == 0)) req->rsk_window_clamp = full_space; tcp_select_initial_window(sk, full_space, req->mss, &req->rsk_rcv_wnd, &req->rsk_window_clamp, ireq->wscale_ok, &rcv_wscale, dst_metric(dst, RTAX_INITRWND)); /* req->syncookie is set true only if ACK is validated * by BPF kfunc, then, rcv_wscale is already configured. 
*/ if (!req->syncookie) ireq->rcv_wscale = rcv_wscale; ireq->ecn_ok &= cookie_ecn_ok(net, dst); ret = tcp_get_cookie_sock(sk, skb, req, dst); if (!ret) { SKB_DR_SET(reason, NO_SOCKET); goto out_drop; } out: return ret; out_free: reqsk_free(req); out_drop: sk_skb_reason_drop(sk, skb, reason); return NULL; } |
35 1 35 35 35 35 1 1 1 1 1 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 | // SPDX-License-Identifier: GPL-2.0-only /* 
* Copyright (c) 2015 HGST, a Western Digital Company. */ #include <linux/err.h> #include <linux/slab.h> #include <rdma/ib_verbs.h> #include "core_priv.h" #include <trace/events/rdma_core.h> /* Max size for shared CQ, may require tuning */ #define IB_MAX_SHARED_CQ_SZ 4096U /* # of WCs to poll for with a single call to ib_poll_cq */ #define IB_POLL_BATCH 16 #define IB_POLL_BATCH_DIRECT 8 /* # of WCs to iterate over before yielding */ #define IB_POLL_BUDGET_IRQ 256 #define IB_POLL_BUDGET_WORKQUEUE 65536 #define IB_POLL_FLAGS \ (IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS) static const struct dim_cq_moder rdma_dim_prof[RDMA_DIM_PARAMS_NUM_PROFILES] = { {1, 0, 1, 0}, {1, 0, 4, 0}, {2, 0, 4, 0}, {2, 0, 8, 0}, {4, 0, 8, 0}, {16, 0, 8, 0}, {16, 0, 16, 0}, {32, 0, 16, 0}, {32, 0, 32, 0}, }; static void ib_cq_rdma_dim_work(struct work_struct *w) { struct dim *dim = container_of(w, struct dim, work); struct ib_cq *cq = dim->priv; u16 usec = rdma_dim_prof[dim->profile_ix].usec; u16 comps = rdma_dim_prof[dim->profile_ix].comps; dim->state = DIM_START_MEASURE; trace_cq_modify(cq, comps, usec); cq->device->ops.modify_cq(cq, comps, usec); } static void rdma_dim_init(struct ib_cq *cq) { struct dim *dim; if (!cq->device->ops.modify_cq || !cq->device->use_cq_dim || cq->poll_ctx == IB_POLL_DIRECT) return; dim = kzalloc(sizeof(struct dim), GFP_KERNEL); if (!dim) return; dim->state = DIM_START_MEASURE; dim->tune_state = DIM_GOING_RIGHT; dim->profile_ix = RDMA_DIM_START_PROFILE; dim->priv = cq; cq->dim = dim; INIT_WORK(&dim->work, ib_cq_rdma_dim_work); } static void rdma_dim_destroy(struct ib_cq *cq) { if (!cq->dim) return; cancel_work_sync(&cq->dim->work); kfree(cq->dim); } static int __poll_cq(struct ib_cq *cq, int num_entries, struct ib_wc *wc) { int rc; rc = ib_poll_cq(cq, num_entries, wc); trace_cq_poll(cq, num_entries, rc); return rc; } static int __ib_process_cq(struct ib_cq *cq, int budget, struct ib_wc *wcs, int batch) { int i, n, completed = 0; trace_cq_process(cq); /* * 
budget might be (-1) if the caller does not * want to bound this call, thus we need unsigned * minimum here. */ while ((n = __poll_cq(cq, min_t(u32, batch, budget - completed), wcs)) > 0) { for (i = 0; i < n; i++) { struct ib_wc *wc = &wcs[i]; if (wc->wr_cqe) wc->wr_cqe->done(cq, wc); else WARN_ON_ONCE(wc->status == IB_WC_SUCCESS); } completed += n; if (n != batch || (budget != -1 && completed >= budget)) break; } return completed; } /** * ib_process_cq_direct - process a CQ in caller context * @cq: CQ to process * @budget: number of CQEs to poll for * * This function is used to process all outstanding CQ entries. * It does not offload CQ processing to a different context and does * not ask for completion interrupts from the HCA. * Using direct processing on CQ with non IB_POLL_DIRECT type may trigger * concurrent processing. * * Note: do not pass -1 as %budget unless it is guaranteed that the number * of completions that will be processed is small. */ int ib_process_cq_direct(struct ib_cq *cq, int budget) { struct ib_wc wcs[IB_POLL_BATCH_DIRECT]; return __ib_process_cq(cq, budget, wcs, IB_POLL_BATCH_DIRECT); } EXPORT_SYMBOL(ib_process_cq_direct); static void ib_cq_completion_direct(struct ib_cq *cq, void *private) { WARN_ONCE(1, "got unsolicited completion for CQ 0x%p\n", cq); } static int ib_poll_handler(struct irq_poll *iop, int budget) { struct ib_cq *cq = container_of(iop, struct ib_cq, iop); struct dim *dim = cq->dim; int completed; completed = __ib_process_cq(cq, budget, cq->wc, IB_POLL_BATCH); if (completed < budget) { irq_poll_complete(&cq->iop); if (ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0) { trace_cq_reschedule(cq); irq_poll_sched(&cq->iop); } } if (dim) rdma_dim(dim, completed); return completed; } static void ib_cq_completion_softirq(struct ib_cq *cq, void *private) { trace_cq_schedule(cq); irq_poll_sched(&cq->iop); } static void ib_cq_poll_work(struct work_struct *work) { struct ib_cq *cq = container_of(work, struct ib_cq, work); int completed; 
completed = __ib_process_cq(cq, IB_POLL_BUDGET_WORKQUEUE, cq->wc, IB_POLL_BATCH); if (completed >= IB_POLL_BUDGET_WORKQUEUE || ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0) queue_work(cq->comp_wq, &cq->work); else if (cq->dim) rdma_dim(cq->dim, completed); } static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private) { trace_cq_schedule(cq); queue_work(cq->comp_wq, &cq->work); } /** * __ib_alloc_cq - allocate a completion queue * @dev: device to allocate the CQ for * @private: driver private data, accessible from cq->cq_context * @nr_cqe: number of CQEs to allocate * @comp_vector: HCA completion vectors for this CQ * @poll_ctx: context to poll the CQ from. * @caller: module owner name. * * This is the proper interface to allocate a CQ for in-kernel users. A * CQ allocated with this interface will automatically be polled from the * specified context. The ULP must use wr->wr_cqe instead of wr->wr_id * to use this CQ abstraction. */ struct ib_cq *__ib_alloc_cq(struct ib_device *dev, void *private, int nr_cqe, int comp_vector, enum ib_poll_context poll_ctx, const char *caller) { struct ib_cq_init_attr cq_attr = { .cqe = nr_cqe, .comp_vector = comp_vector, }; struct ib_cq *cq; int ret = -ENOMEM; cq = rdma_zalloc_drv_obj(dev, ib_cq); if (!cq) return ERR_PTR(ret); cq->device = dev; cq->cq_context = private; cq->poll_ctx = poll_ctx; atomic_set(&cq->usecnt, 0); cq->comp_vector = comp_vector; cq->wc = kmalloc_array(IB_POLL_BATCH, sizeof(*cq->wc), GFP_KERNEL); if (!cq->wc) goto out_free_cq; rdma_restrack_new(&cq->res, RDMA_RESTRACK_CQ); rdma_restrack_set_name(&cq->res, caller); ret = dev->ops.create_cq(cq, &cq_attr, NULL); if (ret) goto out_free_wc; rdma_dim_init(cq); switch (cq->poll_ctx) { case IB_POLL_DIRECT: cq->comp_handler = ib_cq_completion_direct; break; case IB_POLL_SOFTIRQ: cq->comp_handler = ib_cq_completion_softirq; irq_poll_init(&cq->iop, IB_POLL_BUDGET_IRQ, ib_poll_handler); ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); break; case IB_POLL_WORKQUEUE: case 
IB_POLL_UNBOUND_WORKQUEUE: cq->comp_handler = ib_cq_completion_workqueue; INIT_WORK(&cq->work, ib_cq_poll_work); ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); cq->comp_wq = (cq->poll_ctx == IB_POLL_WORKQUEUE) ? ib_comp_wq : ib_comp_unbound_wq; break; default: ret = -EINVAL; goto out_destroy_cq; } rdma_restrack_add(&cq->res); trace_cq_alloc(cq, nr_cqe, comp_vector, poll_ctx); return cq; out_destroy_cq: rdma_dim_destroy(cq); cq->device->ops.destroy_cq(cq, NULL); out_free_wc: rdma_restrack_put(&cq->res); kfree(cq->wc); out_free_cq: kfree(cq); trace_cq_alloc_error(nr_cqe, comp_vector, poll_ctx, ret); return ERR_PTR(ret); } EXPORT_SYMBOL(__ib_alloc_cq); /** * __ib_alloc_cq_any - allocate a completion queue * @dev: device to allocate the CQ for * @private: driver private data, accessible from cq->cq_context * @nr_cqe: number of CQEs to allocate * @poll_ctx: context to poll the CQ from * @caller: module owner name * * Attempt to spread ULP Completion Queues over each device's interrupt * vectors. A simple best-effort mechanism is used. */ struct ib_cq *__ib_alloc_cq_any(struct ib_device *dev, void *private, int nr_cqe, enum ib_poll_context poll_ctx, const char *caller) { static atomic_t counter; int comp_vector = 0; if (dev->num_comp_vectors > 1) comp_vector = atomic_inc_return(&counter) % min_t(int, dev->num_comp_vectors, num_online_cpus()); return __ib_alloc_cq(dev, private, nr_cqe, comp_vector, poll_ctx, caller); } EXPORT_SYMBOL(__ib_alloc_cq_any); /** * ib_free_cq - free a completion queue * @cq: completion queue to free. 
*/ void ib_free_cq(struct ib_cq *cq) { int ret; if (WARN_ON_ONCE(atomic_read(&cq->usecnt))) return; if (WARN_ON_ONCE(cq->cqe_used)) return; switch (cq->poll_ctx) { case IB_POLL_DIRECT: break; case IB_POLL_SOFTIRQ: irq_poll_disable(&cq->iop); break; case IB_POLL_WORKQUEUE: case IB_POLL_UNBOUND_WORKQUEUE: cancel_work_sync(&cq->work); break; default: WARN_ON_ONCE(1); } rdma_dim_destroy(cq); trace_cq_free(cq); ret = cq->device->ops.destroy_cq(cq, NULL); WARN_ONCE(ret, "Destroy of kernel CQ shouldn't fail"); rdma_restrack_del(&cq->res); kfree(cq->wc); kfree(cq); } EXPORT_SYMBOL(ib_free_cq); void ib_cq_pool_cleanup(struct ib_device *dev) { struct ib_cq *cq, *n; unsigned int i; for (i = 0; i < ARRAY_SIZE(dev->cq_pools); i++) { list_for_each_entry_safe(cq, n, &dev->cq_pools[i], pool_entry) { WARN_ON(cq->cqe_used); list_del(&cq->pool_entry); cq->shared = false; ib_free_cq(cq); } } } static int ib_alloc_cqs(struct ib_device *dev, unsigned int nr_cqes, enum ib_poll_context poll_ctx) { LIST_HEAD(tmp_list); unsigned int nr_cqs, i; struct ib_cq *cq, *n; int ret; if (poll_ctx > IB_POLL_LAST_POOL_TYPE) { WARN_ON_ONCE(poll_ctx > IB_POLL_LAST_POOL_TYPE); return -EINVAL; } /* * Allocate at least as many CQEs as requested, and otherwise * a reasonable batch size so that we can share CQs between * multiple users instead of allocating a larger number of CQs. 
*/ nr_cqes = min_t(unsigned int, dev->attrs.max_cqe, max(nr_cqes, IB_MAX_SHARED_CQ_SZ)); nr_cqs = min_t(unsigned int, dev->num_comp_vectors, num_online_cpus()); for (i = 0; i < nr_cqs; i++) { cq = ib_alloc_cq(dev, NULL, nr_cqes, i, poll_ctx); if (IS_ERR(cq)) { ret = PTR_ERR(cq); goto out_free_cqs; } cq->shared = true; list_add_tail(&cq->pool_entry, &tmp_list); } spin_lock_irq(&dev->cq_pools_lock); list_splice(&tmp_list, &dev->cq_pools[poll_ctx]); spin_unlock_irq(&dev->cq_pools_lock); return 0; out_free_cqs: list_for_each_entry_safe(cq, n, &tmp_list, pool_entry) { cq->shared = false; ib_free_cq(cq); } return ret; } /** * ib_cq_pool_get() - Find the least used completion queue that matches * a given cpu hint (or least used for wild card affinity) and fits * nr_cqe. * @dev: rdma device * @nr_cqe: number of needed cqe entries * @comp_vector_hint: completion vector hint (-1) for the driver to assign * a comp vector based on internal counter * @poll_ctx: cq polling context * * Finds a cq that satisfies @comp_vector_hint and @nr_cqe requirements and * claim entries in it for us. In case there is no available cq, allocate * a new cq with the requirements and add it to the device pool. * IB_POLL_DIRECT cannot be used for shared cqs so it is not a valid value * for @poll_ctx. 
*/ struct ib_cq *ib_cq_pool_get(struct ib_device *dev, unsigned int nr_cqe, int comp_vector_hint, enum ib_poll_context poll_ctx) { static unsigned int default_comp_vector; unsigned int vector, num_comp_vectors; struct ib_cq *cq, *found = NULL; int ret; if (poll_ctx > IB_POLL_LAST_POOL_TYPE) { WARN_ON_ONCE(poll_ctx > IB_POLL_LAST_POOL_TYPE); return ERR_PTR(-EINVAL); } num_comp_vectors = min_t(unsigned int, dev->num_comp_vectors, num_online_cpus()); /* Project the affinty to the device completion vector range */ if (comp_vector_hint < 0) { comp_vector_hint = (READ_ONCE(default_comp_vector) + 1) % num_comp_vectors; WRITE_ONCE(default_comp_vector, comp_vector_hint); } vector = comp_vector_hint % num_comp_vectors; /* * Find the least used CQ with correct affinity and * enough free CQ entries */ while (!found) { spin_lock_irq(&dev->cq_pools_lock); list_for_each_entry(cq, &dev->cq_pools[poll_ctx], pool_entry) { /* * Check to see if we have found a CQ with the * correct completion vector */ if (vector != cq->comp_vector) continue; if (cq->cqe_used + nr_cqe > cq->cqe) continue; found = cq; break; } if (found) { found->cqe_used += nr_cqe; spin_unlock_irq(&dev->cq_pools_lock); return found; } spin_unlock_irq(&dev->cq_pools_lock); /* * Didn't find a match or ran out of CQs in the device * pool, allocate a new array of CQs. */ ret = ib_alloc_cqs(dev, nr_cqe, poll_ctx); if (ret) return ERR_PTR(ret); } return found; } EXPORT_SYMBOL(ib_cq_pool_get); /** * ib_cq_pool_put - Return a CQ taken from a shared pool. * @cq: The CQ to return. * @nr_cqe: The max number of cqes that the user had requested. */ void ib_cq_pool_put(struct ib_cq *cq, unsigned int nr_cqe) { if (WARN_ON_ONCE(nr_cqe > cq->cqe_used)) return; spin_lock_irq(&cq->device->cq_pools_lock); cq->cqe_used -= nr_cqe; spin_unlock_irq(&cq->device->cq_pools_lock); } EXPORT_SYMBOL(ib_cq_pool_put); |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 | // SPDX-License-Identifier: GPL-2.0-only #ifndef _NFT_SET_PIPAPO_H #include <linux/log2.h> #include <net/ipv6.h> /* For the maximum length of a field */ /* Count of concatenated fields depends on count of 32-bit nftables registers */ #define NFT_PIPAPO_MAX_FIELDS NFT_REG32_COUNT /* Restrict usage to multiple fields, make sure rbtree is used otherwise */ #define NFT_PIPAPO_MIN_FIELDS 2 /* Largest supported field size */ #define NFT_PIPAPO_MAX_BYTES (sizeof(struct in6_addr)) #define NFT_PIPAPO_MAX_BITS (NFT_PIPAPO_MAX_BYTES * BITS_PER_BYTE) /* Bits to be grouped together in table buckets depending on set size */ #define NFT_PIPAPO_GROUP_BITS_INIT NFT_PIPAPO_GROUP_BITS_SMALL_SET #define NFT_PIPAPO_GROUP_BITS_SMALL_SET 8 #define NFT_PIPAPO_GROUP_BITS_LARGE_SET 4 #define NFT_PIPAPO_GROUP_BITS_ARE_8_OR_4 \ BUILD_BUG_ON((NFT_PIPAPO_GROUP_BITS_SMALL_SET != 8) || \ 
(NFT_PIPAPO_GROUP_BITS_LARGE_SET != 4)) #define NFT_PIPAPO_GROUPS_PER_BYTE(f) (BITS_PER_BYTE / (f)->bb) /* If a lookup table gets bigger than NFT_PIPAPO_LT_SIZE_HIGH, switch to the * small group width, and switch to the big group width if the table gets * smaller than NFT_PIPAPO_LT_SIZE_LOW. * * Picking 2MiB as threshold (for a single table) avoids as much as possible * crossing page boundaries on most architectures (x86-64 and MIPS huge pages, * ARMv7 supersections, POWER "large" pages, SPARC Level 1 regions, etc.), which * keeps performance nice in case kvmalloc() gives us non-contiguous areas. */ #define NFT_PIPAPO_LT_SIZE_THRESHOLD (1 << 21) #define NFT_PIPAPO_LT_SIZE_HYSTERESIS (1 << 16) #define NFT_PIPAPO_LT_SIZE_HIGH NFT_PIPAPO_LT_SIZE_THRESHOLD #define NFT_PIPAPO_LT_SIZE_LOW NFT_PIPAPO_LT_SIZE_THRESHOLD - \ NFT_PIPAPO_LT_SIZE_HYSTERESIS /* Fields are padded to 32 bits in input registers */ #define NFT_PIPAPO_GROUPS_PADDED_SIZE(f) \ (round_up((f)->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f), sizeof(u32))) #define NFT_PIPAPO_GROUPS_PADDING(f) \ (NFT_PIPAPO_GROUPS_PADDED_SIZE(f) - (f)->groups / \ NFT_PIPAPO_GROUPS_PER_BYTE(f)) /* Number of buckets given by 2 ^ n, with n bucket bits */ #define NFT_PIPAPO_BUCKETS(bb) (1 << (bb)) /* Each n-bit range maps to up to n * 2 rules */ #define NFT_PIPAPO_MAP_NBITS (const_ilog2(NFT_PIPAPO_MAX_BITS * 2)) /* Use the rest of mapping table buckets for rule indices, but it makes no sense * to exceed 32 bits */ #if BITS_PER_LONG == 64 #define NFT_PIPAPO_MAP_TOBITS 32 #else #define NFT_PIPAPO_MAP_TOBITS (BITS_PER_LONG - NFT_PIPAPO_MAP_NBITS) #endif /* ...which gives us the highest allowed index for a rule */ #define NFT_PIPAPO_RULE0_MAX ((1UL << (NFT_PIPAPO_MAP_TOBITS - 1)) \ - (1UL << NFT_PIPAPO_MAP_NBITS)) /* Definitions for vectorised implementations */ #ifdef NFT_PIPAPO_ALIGN #define NFT_PIPAPO_ALIGN_HEADROOM \ (NFT_PIPAPO_ALIGN - ARCH_KMALLOC_MINALIGN) #define NFT_PIPAPO_LT_ALIGN(lt) (PTR_ALIGN((lt), NFT_PIPAPO_ALIGN)) #else 
#define NFT_PIPAPO_ALIGN_HEADROOM 0 #define NFT_PIPAPO_LT_ALIGN(lt) (lt) #endif /* NFT_PIPAPO_ALIGN */ #define nft_pipapo_for_each_field(field, index, match) \ for ((field) = (match)->f, (index) = 0; \ (index) < (match)->field_count; \ (index)++, (field)++) /** * union nft_pipapo_map_bucket - Bucket of mapping table * @to: First rule number (in next field) this rule maps to * @n: Number of rules (in next field) this rule maps to * @e: If there's no next field, pointer to element this rule maps to */ union nft_pipapo_map_bucket { struct { #if BITS_PER_LONG == 64 static_assert(NFT_PIPAPO_MAP_TOBITS <= 32); u32 to; static_assert(NFT_PIPAPO_MAP_NBITS <= 32); u32 n; #else unsigned long to:NFT_PIPAPO_MAP_TOBITS; unsigned long n:NFT_PIPAPO_MAP_NBITS; #endif }; struct nft_pipapo_elem *e; }; /** * struct nft_pipapo_field - Lookup, mapping tables and related data for a field * @rules: Number of inserted rules * @bsize: Size of each bucket in lookup table, in longs * @rules_alloc: Number of allocated rules, always >= rules * @groups: Amount of bit groups * @bb: Number of bits grouped together in lookup table buckets * @lt: Lookup table: 'groups' rows of buckets * @mt: Mapping table: one bucket per rule */ struct nft_pipapo_field { unsigned int rules; unsigned int bsize; unsigned int rules_alloc; u8 groups; u8 bb; unsigned long *lt; union nft_pipapo_map_bucket *mt; }; /** * struct nft_pipapo_scratch - percpu data used for lookup and matching * @map_index: Current working bitmap index, toggled between field matches * @align_off: Offset to get the originally allocated address * @map: store partial matching results during lookup */ struct nft_pipapo_scratch { u8 map_index; u32 align_off; unsigned long map[]; }; /** * struct nft_pipapo_match - Data used for lookup and matching * @field_count: Amount of fields in set * @bsize_max: Maximum lookup table bucket size of all fields, in longs * @scratch: Preallocated per-CPU maps for partial matching results * @rcu: Matching data is 
swapped on commits * @f: Fields, with lookup and mapping tables */ struct nft_pipapo_match { u8 field_count; unsigned int bsize_max; struct nft_pipapo_scratch * __percpu *scratch; struct rcu_head rcu; struct nft_pipapo_field f[] __counted_by(field_count); }; /** * struct nft_pipapo - Representation of a set * @match: Currently in-use matching data * @clone: Copy where pending insertions and deletions are kept * @width: Total bytes to be matched for one packet, including padding * @last_gc: Timestamp of last garbage collection run, jiffies */ struct nft_pipapo { struct nft_pipapo_match __rcu *match; struct nft_pipapo_match *clone; int width; unsigned long last_gc; }; struct nft_pipapo_elem; /** * struct nft_pipapo_elem - API-facing representation of single set element * @priv: element placeholder * @ext: nftables API extensions */ struct nft_pipapo_elem { struct nft_elem_priv priv; struct nft_set_ext ext; }; int pipapo_refill(unsigned long *map, unsigned int len, unsigned int rules, unsigned long *dst, const union nft_pipapo_map_bucket *mt, bool match_only); /** * pipapo_and_field_buckets_4bit() - Intersect 4-bit buckets * @f: Field including lookup table * @dst: Area to store result * @data: Input data selecting table buckets */ static inline void pipapo_and_field_buckets_4bit(const struct nft_pipapo_field *f, unsigned long *dst, const u8 *data) { unsigned long *lt = NFT_PIPAPO_LT_ALIGN(f->lt); int group; for (group = 0; group < f->groups; group += BITS_PER_BYTE / 4, data++) { u8 v; v = *data >> 4; __bitmap_and(dst, dst, lt + v * f->bsize, f->bsize * BITS_PER_LONG); lt += f->bsize * NFT_PIPAPO_BUCKETS(4); v = *data & 0x0f; __bitmap_and(dst, dst, lt + v * f->bsize, f->bsize * BITS_PER_LONG); lt += f->bsize * NFT_PIPAPO_BUCKETS(4); } } /** * pipapo_and_field_buckets_8bit() - Intersect 8-bit buckets * @f: Field including lookup table * @dst: Area to store result * @data: Input data selecting table buckets */ static inline void pipapo_and_field_buckets_8bit(const 
struct nft_pipapo_field *f, unsigned long *dst, const u8 *data) { unsigned long *lt = NFT_PIPAPO_LT_ALIGN(f->lt); int group; for (group = 0; group < f->groups; group++, data++) { __bitmap_and(dst, dst, lt + *data * f->bsize, f->bsize * BITS_PER_LONG); lt += f->bsize * NFT_PIPAPO_BUCKETS(8); } } /** * pipapo_estimate_size() - Estimate worst-case for set size * @desc: Set description, element count and field description used here * * The size for this set type can vary dramatically, as it depends on the number * of rules (composing netmasks) the entries expand to. We compute the worst * case here. * * In general, for a non-ranged entry or a single composing netmask, we need * one bit in each of the sixteen NFT_PIPAPO_BUCKETS, for each 4-bit group (that * is, each input bit needs four bits of matching data), plus a bucket in the * mapping table for each field. * * Return: worst-case set size in bytes, 0 on any overflow */ static u64 pipapo_estimate_size(const struct nft_set_desc *desc) { unsigned long entry_size; u64 size; int i; for (i = 0, entry_size = 0; i < desc->field_count; i++) { unsigned long rules; if (desc->field_len[i] > NFT_PIPAPO_MAX_BYTES) return 0; /* Worst-case ranges for each concatenated field: each n-bit * field can expand to up to n * 2 rules in each bucket, and * each rule also needs a mapping bucket. 
*/ rules = ilog2(desc->field_len[i] * BITS_PER_BYTE) * 2; entry_size += rules * NFT_PIPAPO_BUCKETS(NFT_PIPAPO_GROUP_BITS_INIT) / BITS_PER_BYTE; entry_size += rules * sizeof(union nft_pipapo_map_bucket); } /* Rules in lookup and mapping tables are needed for each entry */ size = desc->size * entry_size; if (size && div_u64(size, desc->size) != entry_size) return 0; size += sizeof(struct nft_pipapo) + sizeof(struct nft_pipapo_match) * 2; size += sizeof(struct nft_pipapo_field) * desc->field_count; return size; } /** * pipapo_resmap_init() - Initialise result map before first use * @m: Matching data, including mapping table * @res_map: Result map * * Initialize all bits covered by the first field to one, so that after * the first step, only the matching bits of the first bit group remain. * * If other fields have a large bitmap, set remainder of res_map to 0. */ static inline void pipapo_resmap_init(const struct nft_pipapo_match *m, unsigned long *res_map) { const struct nft_pipapo_field *f = m->f; int i; for (i = 0; i < f->bsize; i++) res_map[i] = ULONG_MAX; for (i = f->bsize; i < m->bsize_max; i++) res_map[i] = 0ul; } #endif /* _NFT_SET_PIPAPO_H */ |
114 113 2154 19 5 2133 1454 2138 1505 1507 1507 3 1505 1487 1433 133 1467 61 61 61 1307 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 
505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 
1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 
1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 
1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 
2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 
2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 
3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 | /* * Performance events x86 architecture code * * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar * Copyright (C) 2009 Jaswinder Singh Rajput * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra * Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com> * Copyright (C) 2009 Google, Inc., Stephane Eranian * * For licencing details see kernel-base/COPYING */ #include <linux/perf_event.h> #include <linux/capability.h> #include <linux/notifier.h> #include <linux/hardirq.h> #include <linux/kprobes.h> #include <linux/export.h> #include <linux/init.h> #include <linux/kdebug.h> #include <linux/sched/mm.h> #include <linux/sched/clock.h> #include <linux/uaccess.h> #include <linux/slab.h> #include <linux/cpu.h> #include <linux/bitops.h> #include <linux/device.h> #include <linux/nospec.h> #include <linux/static_call.h> #include <asm/apic.h> #include <asm/stacktrace.h> #include <asm/nmi.h> #include <asm/smp.h> #include <asm/alternative.h> #include <asm/mmu_context.h> #include <asm/tlbflush.h> #include <asm/timer.h> #include <asm/desc.h> #include <asm/ldt.h> #include <asm/unwind.h> #include <asm/uprobes.h> #include <asm/ibt.h> #include "perf_event.h" struct x86_pmu x86_pmu __read_mostly; static struct pmu pmu; DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { .enabled = 1, .pmu = &pmu, }; DEFINE_STATIC_KEY_FALSE(rdpmc_never_available_key); 
DEFINE_STATIC_KEY_FALSE(rdpmc_always_available_key);

DEFINE_STATIC_KEY_FALSE(perf_is_hybrid);

/*
 * This here uses DEFINE_STATIC_CALL_NULL() to get a static_call defined
 * from just a typename, as opposed to an actual function.
 *
 * One static call is defined per x86_pmu callback listed below.
 */
DEFINE_STATIC_CALL_NULL(x86_pmu_handle_irq,  *x86_pmu.handle_irq);
DEFINE_STATIC_CALL_NULL(x86_pmu_disable_all, *x86_pmu.disable_all);
DEFINE_STATIC_CALL_NULL(x86_pmu_enable_all,  *x86_pmu.enable_all);
DEFINE_STATIC_CALL_NULL(x86_pmu_enable,	     *x86_pmu.enable);
DEFINE_STATIC_CALL_NULL(x86_pmu_disable,     *x86_pmu.disable);

DEFINE_STATIC_CALL_NULL(x86_pmu_assign, *x86_pmu.assign);

DEFINE_STATIC_CALL_NULL(x86_pmu_add,  *x86_pmu.add);
DEFINE_STATIC_CALL_NULL(x86_pmu_del,  *x86_pmu.del);
DEFINE_STATIC_CALL_NULL(x86_pmu_read, *x86_pmu.read);

DEFINE_STATIC_CALL_NULL(x86_pmu_set_period,   *x86_pmu.set_period);
DEFINE_STATIC_CALL_NULL(x86_pmu_update,       *x86_pmu.update);
DEFINE_STATIC_CALL_NULL(x86_pmu_limit_period, *x86_pmu.limit_period);

DEFINE_STATIC_CALL_NULL(x86_pmu_schedule_events,       *x86_pmu.schedule_events);
DEFINE_STATIC_CALL_NULL(x86_pmu_get_event_constraints, *x86_pmu.get_event_constraints);
DEFINE_STATIC_CALL_NULL(x86_pmu_put_event_constraints, *x86_pmu.put_event_constraints);

DEFINE_STATIC_CALL_NULL(x86_pmu_start_scheduling,  *x86_pmu.start_scheduling);
DEFINE_STATIC_CALL_NULL(x86_pmu_commit_scheduling, *x86_pmu.commit_scheduling);
DEFINE_STATIC_CALL_NULL(x86_pmu_stop_scheduling,   *x86_pmu.stop_scheduling);

DEFINE_STATIC_CALL_NULL(x86_pmu_sched_task,    *x86_pmu.sched_task);

DEFINE_STATIC_CALL_NULL(x86_pmu_drain_pebs,   *x86_pmu.drain_pebs);
DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_aliases, *x86_pmu.pebs_aliases);

DEFINE_STATIC_CALL_NULL(x86_pmu_filter, *x86_pmu.filter);

DEFINE_STATIC_CALL_NULL(x86_pmu_late_setup, *x86_pmu.late_setup);

/*
 * This one is magic, it will get called even when PMU init fails (because
 * there is no PMU), in which case it should simply return NULL.
 */
DEFINE_STATIC_CALL_RET0(x86_pmu_guest_get_msrs, *x86_pmu.guest_get_msrs);

/*
 * Cache event tables, indexed by [cache type][cache op][cache result].
 * set_ext_hw_attr() treats an entry of 0 in the event-id table as "no such
 * event" (-ENOENT) and an entry of -1 as "invalid combination" (-EINVAL).
 */
u64 __read_mostly hw_cache_event_ids
				[PERF_COUNT_HW_CACHE_MAX]
				[PERF_COUNT_HW_CACHE_OP_MAX]
				[PERF_COUNT_HW_CACHE_RESULT_MAX];
u64 __read_mostly hw_cache_extra_regs
				[PERF_COUNT_HW_CACHE_MAX]
				[PERF_COUNT_HW_CACHE_OP_MAX]
				[PERF_COUNT_HW_CACHE_RESULT_MAX];

/*
 * Propagate event elapsed time into the generic event.
 * Can only be executed on the CPU where the event is active.
 *
 * Adds the number of events counted since the previous update to
 * event->count and subtracts it from hwc->period_left.
 *
 * Returns the new raw counter value that was read, or 0 without touching
 * anything when the event has no event_base.
 */
u64 x86_perf_event_update(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;
	/* Number of unimplemented high bits in the 64-bit raw value. */
	int shift = 64 - x86_pmu.cntval_bits;
	u64 prev_raw_count, new_raw_count;
	u64 delta;

	if (unlikely(!hwc->event_base))
		return 0;

	/*
	 * Careful: an NMI might modify the previous event value.
	 *
	 * Our tactic to handle this is to first atomically read and
	 * exchange a new raw count - then add that new-prev delta
	 * count to the generic event atomically:
	 */
	prev_raw_count = local64_read(&hwc->prev_count);
	do {
		/* Re-read the counter on every retry of the exchange. */
		rdpmcl(hwc->event_base_rdpmc, new_raw_count);
	} while (!local64_try_cmpxchg(&hwc->prev_count,
				      &prev_raw_count, new_raw_count));

	/*
	 * Now we have the new raw value and have updated the prev
	 * timestamp already. We can now calculate the elapsed delta
	 * (event-)time and add that to the generic event.
	 *
	 * Careful, not all hw sign-extends above the physical width
	 * of the count.
	 *
	 * Shifting both values up by 'shift' discards the bits above the
	 * counter width before subtracting; shifting the difference back
	 * down yields the delta within the counter width.
	 */
	delta = (new_raw_count << shift) - (prev_raw_count << shift);
	delta >>= shift;

	local64_add(delta, &event->count);
	local64_sub(delta, &hwc->period_left);

	return new_raw_count;
}

/*
 * Find and validate any extra registers to set up.
 *
 * Walks the PMU's extra_regs table (terminated by an entry with msr == 0)
 * looking for the first entry whose event code matches @config under the
 * entry's config_mask. On a match, event->hw.extra_reg is filled in from the
 * entry and from attr.config1.
 *
 * Returns 0 when there is no table, no matching entry, or the match was set
 * up; -EINVAL if attr.config1 has bits outside the entry's valid_mask;
 * -ENXIO if the entry's extra_msr_access flag is clear.
 */
static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
{
	struct extra_reg *extra_regs = hybrid(event->pmu, extra_regs);
	struct hw_perf_event_extra *reg;
	struct extra_reg *er;

	reg = &event->hw.extra_reg;

	if (!extra_regs)
		return 0;

	for (er = extra_regs; er->msr; er++) {
		if (er->event != (config & er->config_mask))
			continue;
		if (event->attr.config1 & ~er->valid_mask)
			return -EINVAL;
		/* Check if the extra msrs can be safely accessed */
		if (!er->extra_msr_access)
			return -ENXIO;

		reg->idx = er->idx;
		reg->config = event->attr.config1;
		reg->reg = er->msr;
		break;
	}
	return 0;
}

/* Count of live events; incremented on event init, decremented on destroy. */
static atomic_t active_events;
/* Reference count on the PMC hardware reservation, see x86_reserve_hardware(). */
static atomic_t pmc_refcount;
/* Serializes the first reservation / last release and the exclusive checks. */
static DEFINE_MUTEX(pmc_reserve_mutex);

#ifdef CONFIG_X86_LOCAL_APIC

/*
 * Return the counter mask of x86_pmu, OR-ed with the counter mask of every
 * hybrid PMU when the system is hybrid.
 */
static inline u64 get_possible_counter_mask(void)
{
	u64 cntr_mask = x86_pmu.cntr_mask64;
	int i;

	if (!is_hybrid())
		return cntr_mask;

	for (i = 0; i < x86_pmu.num_hybrid_pmus; i++)
		cntr_mask |= x86_pmu.hybrid_pmu[i].cntr_mask64;

	return cntr_mask;
}

/*
 * Reserve the perfctr and event-select MSRs of every possible counter.
 *
 * All perfctr MSRs are reserved first, then all event-select MSRs. On
 * failure everything reserved so far is released again and false is
 * returned; true means every reservation succeeded.
 */
static bool reserve_pmc_hardware(void)
{
	u64 cntr_mask = get_possible_counter_mask();
	int i, end;

	for_each_set_bit(i, (unsigned long *)&cntr_mask, X86_PMC_IDX_MAX) {
		if (!reserve_perfctr_nmi(x86_pmu_event_addr(i)))
			goto perfctr_fail;
	}

	for_each_set_bit(i, (unsigned long *)&cntr_mask, X86_PMC_IDX_MAX) {
		if (!reserve_evntsel_nmi(x86_pmu_config_addr(i)))
			goto eventsel_fail;
	}

	return true;

eventsel_fail:
	/* Release only the event selects below the index that failed. */
	end = i;
	for_each_set_bit(i, (unsigned long *)&cntr_mask, end)
		release_evntsel_nmi(x86_pmu_config_addr(i));
	/* All perfctrs were reserved; make the fall-through release them all. */
	i = X86_PMC_IDX_MAX;

perfctr_fail:
	/* Release the perfctrs below 'i' (the failing index, or all of them). */
	end = i;
	for_each_set_bit(i, (unsigned long *)&cntr_mask, end)
		release_perfctr_nmi(x86_pmu_event_addr(i));

	return false;
}

/* Release the perfctr and event-select MSRs of every possible counter. */
static void release_pmc_hardware(void)
{
	u64 cntr_mask = get_possible_counter_mask();
	int i;

	for_each_set_bit(i, (unsigned long *)&cntr_mask, X86_PMC_IDX_MAX) {
		release_perfctr_nmi(x86_pmu_event_addr(i));
		release_evntsel_nmi(x86_pmu_config_addr(i));
	}
}

#else

/* Without CONFIG_X86_LOCAL_APIC: reservation always succeeds, release is a no-op. */
static bool reserve_pmc_hardware(void) { return true; }
static void release_pmc_hardware(void) {}

#endif
/*
 * Probe whether the PMU counters described by @cntr_mask and
 * @fixed_cntr_mask are usable.
 *
 * Returns true if the MSRs could be read and a write/read-back test on one
 * counter succeeded (a BIOS-enabled counter only triggers a warning).
 * Returns false, after logging the reason, if an MSR access failed, if no
 * general-purpose counter was found disabled, or if the read-back test did
 * not return the value written.
 */
bool check_hw_exists(struct pmu *pmu, unsigned long *cntr_mask,
		     unsigned long *fixed_cntr_mask)
{
	u64 val, val_fail = -1, val_new= ~0;
	int i, reg, reg_fail = -1, ret = 0;
	int bios_fail = 0;
	int reg_safe = -1;

	/*
	 * Check to see if the BIOS enabled any of the counters; if so,
	 * remember the offending MSR so that we can complain about it below.
	 * The last counter found disabled is remembered in reg_safe.
	 */
	for_each_set_bit(i, cntr_mask, X86_PMC_IDX_MAX) {
		reg = x86_pmu_config_addr(i);
		ret = rdmsrl_safe(reg, &val);
		if (ret)
			goto msr_fail;
		if (val & ARCH_PERFMON_EVENTSEL_ENABLE) {
			bios_fail = 1;
			val_fail = val;
			reg_fail = reg;
		} else {
			reg_safe = i;
		}
	}

	if (*(u64 *)fixed_cntr_mask) {
		reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
		ret = rdmsrl_safe(reg, &val);
		if (ret)
			goto msr_fail;
		for_each_set_bit(i, fixed_cntr_mask, X86_PMC_IDX_MAX) {
			if (fixed_counter_disabled(i, pmu))
				continue;
			/* Test the low two bits of fixed counter i's 4-bit field. */
			if (val & (0x03ULL << i*4)) {
				bios_fail = 1;
				val_fail = val;
				reg_fail = reg;
			}
		}
	}

	/*
	 * If all the counters are enabled, the below test will always
	 * fail.  The tools will also become useless in this scenario.
	 * Just fail and disable the hardware counters.
	 */

	if (reg_safe == -1) {
		reg = reg_safe;
		goto msr_fail;
	}

	/*
	 * Read the current value, change it and read it back to see if it
	 * matches, this is needed to detect certain hardware emulators
	 * (qemu/kvm) that don't trap on the MSR access and always return 0s.
	 */
	reg = x86_pmu_event_addr(reg_safe);
	if (rdmsrl_safe(reg, &val))
		goto msr_fail;
	val ^= 0xffffUL;
	ret = wrmsrl_safe(reg, val);
	ret |= rdmsrl_safe(reg, &val_new);
	if (ret || val != val_new)
		goto msr_fail;

	/*
	 * We still allow the PMU driver to operate:
	 */
	if (bios_fail) {
		pr_cont("Broken BIOS detected, complain to your hardware vendor.\n");
		pr_err(FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n",
			      reg_fail, val_fail);
	}

	return true;

msr_fail:
	if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
		pr_cont("PMU not available due to virtualization, using software events only.\n");
	} else {
		pr_cont("Broken PMU hardware detected, using software events only.\n");
		pr_err("Failed to access perfctr msr (MSR %x is %Lx)\n",
		       reg, val_new);
	}

	return false;
}

/*
 * Event destroy callback installed by __x86_pmu_event_init(): drops the
 * hardware reference and the active_events count taken at init time.
 */
static void hw_perf_event_destroy(struct perf_event *event)
{
	x86_release_hardware();
	atomic_dec(&active_events);
}

/* Destroy an LBR event: regular teardown plus dropping the LBR exclusivity. */
void hw_perf_lbr_event_destroy(struct perf_event *event)
{
	hw_perf_event_destroy(event);

	/* undo the lbr/bts event accounting */
	x86_del_exclusive(x86_lbr_exclusive_lbr);
}

/* Non-zero once x86_pmu.handle_irq has been set. */
static inline int x86_pmu_initialized(void)
{
	return x86_pmu.handle_irq != NULL;
}

/*
 * Set up a PERF_TYPE_HW_CACHE event.
 *
 * attr->config is decoded as: bits 0-7 cache type, bits 8-15 cache op,
 * bits 16-23 cache result. Each index is range-checked and then clamped with
 * array_index_nospec() before it is used to index the cache tables.
 *
 * Returns -EINVAL for an out-of-range index or a table entry of -1, -ENOENT
 * for a table entry of 0, otherwise the result of x86_pmu_extra_regs().
 */
static inline int
set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event)
{
	struct perf_event_attr *attr = &event->attr;
	unsigned int cache_type, cache_op, cache_result;
	u64 config, val;

	config = attr->config;

	cache_type = (config >> 0) & 0xff;
	if (cache_type >= PERF_COUNT_HW_CACHE_MAX)
		return -EINVAL;
	cache_type = array_index_nospec(cache_type, PERF_COUNT_HW_CACHE_MAX);

	cache_op = (config >>  8) & 0xff;
	if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX)
		return -EINVAL;
	cache_op = array_index_nospec(cache_op, PERF_COUNT_HW_CACHE_OP_MAX);

	cache_result = (config >> 16) & 0xff;
	if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
		return -EINVAL;
	cache_result = array_index_nospec(cache_result, PERF_COUNT_HW_CACHE_RESULT_MAX);

	val = hybrid_var(event->pmu, hw_cache_event_ids)[cache_type][cache_op][cache_result];
	if (val == 0)
		return -ENOENT;

	if (val == -1)
		return -EINVAL;

	hwc->config |= val;
	/* The matching extra-register value is handed over through config1. */
	attr->config1 = hybrid_var(event->pmu, hw_cache_extra_regs)[cache_type][cache_op][cache_result];
	return x86_pmu_extra_regs(val, event);
}

/*
 * Take a reference on the PMC hardware.
 *
 * If the reference count is already non-zero it is simply incremented.
 * Otherwise pmc_reserve_mutex is taken and, if the count is still zero, the
 * counters are reserved and the DS and LBR buffers are set up before the
 * count is incremented.
 *
 * Returns 0 on success, -EBUSY if reserve_pmc_hardware() failed (in which
 * case no reference is taken).
 */
int x86_reserve_hardware(void)
{
	int err = 0;

	if (!atomic_inc_not_zero(&pmc_refcount)) {
		mutex_lock(&pmc_reserve_mutex);
		/* Re-check under the mutex: another caller may have reserved already. */
		if (atomic_read(&pmc_refcount) == 0) {
			if (!reserve_pmc_hardware()) {
				err = -EBUSY;
			} else {
				reserve_ds_buffers();
				reserve_lbr_buffers();
			}
		}
		if (!err)
			atomic_inc(&pmc_refcount);
		mutex_unlock(&pmc_reserve_mutex);
	}

	return err;
}

/*
 * Drop a reference on the PMC hardware; the counters and the DS/LBR buffers
 * are released inside the branch taken when atomic_dec_and_mutex_lock()
 * returns true (presumably when the count reached zero, with
 * pmc_reserve_mutex held - confirm against the helper's documentation).
 */
void x86_release_hardware(void)
{
	if (atomic_dec_and_mutex_lock(&pmc_refcount, &pmc_reserve_mutex)) {
		release_pmc_hardware();
		release_ds_buffers();
		release_lbr_buffers();
		mutex_unlock(&pmc_reserve_mutex);
	}
}

/*
 * Check if we can create event of a certain type (that no conflicting events
 * are present).
 *
 * Returns 0 and increments active_events on success, -EBUSY if a counter for
 * a different exclusive type is currently non-zero.
 */
int x86_add_exclusive(unsigned int what)
{
	int i;

	/*
	 * When lbr_pt_coexist we allow PT to coexist with either LBR or BTS.
	 * LBR and BTS are still mutually exclusive.
	 */
	if (x86_pmu.lbr_pt_coexist && what == x86_lbr_exclusive_pt)
		goto out;

	if (!atomic_inc_not_zero(&x86_pmu.lbr_exclusive[what])) {
		mutex_lock(&pmc_reserve_mutex);
		for (i = 0; i < ARRAY_SIZE(x86_pmu.lbr_exclusive); i++) {
			if (i != what && atomic_read(&x86_pmu.lbr_exclusive[i]))
				goto fail_unlock;
		}
		atomic_inc(&x86_pmu.lbr_exclusive[what]);
		mutex_unlock(&pmc_reserve_mutex);
	}

out:
	atomic_inc(&active_events);
	return 0;

fail_unlock:
	mutex_unlock(&pmc_reserve_mutex);
	return -EBUSY;
}

/* Undo x86_add_exclusive(): drop active_events and the per-type count. */
void x86_del_exclusive(unsigned int what)
{
	atomic_dec(&active_events);

	/*
	 * See the comment in x86_add_exclusive().
	 */
	if (x86_pmu.lbr_pt_coexist && what == x86_lbr_exclusive_pt)
		return;

	atomic_dec(&x86_pmu.lbr_exclusive[what]);
}

/*
 * Fill in the counter configuration for @event.
 *
 * Non-sampling events get the maximum period. Events of the PMU's own type
 * only need their extra registers resolved; cache events go through
 * set_ext_hw_attr(); anything else is looked up in the generic event map.
 *
 * Returns 0 on success, -EINVAL for an out-of-range config or a map entry of
 * -1, -ENOENT for a map entry of 0, or the error of the helper that was used.
 */
int x86_setup_perfctr(struct perf_event *event)
{
	struct perf_event_attr *attr = &event->attr;
	struct hw_perf_event *hwc = &event->hw;
	u64 config;

	if (!is_sampling_event(event)) {
		hwc->sample_period = x86_pmu.max_period;
		hwc->last_period = hwc->sample_period;
		local64_set(&hwc->period_left, hwc->sample_period);
	}

	if (attr->type == event->pmu->type)
		return x86_pmu_extra_regs(event->attr.config, event);

	if (attr->type == PERF_TYPE_HW_CACHE)
		return set_ext_hw_attr(hwc, event);

	if (attr->config >= x86_pmu.max_events)
		return -EINVAL;

	attr->config = array_index_nospec((unsigned long)attr->config, x86_pmu.max_events);

	/*
	 * The generic map:
	 */
	config = x86_pmu.event_map(attr->config);

	if (config == 0)
		return -ENOENT;

	if (config == -1LL)
		return -EINVAL;

	hwc->config |= config;

	return 0;
}

/*
 * check that branch_sample_type is compatible with
 * settings needed for precise_ip > 1 which implies
 * using the LBR to capture ALL taken branches at the
 * priv levels of the measurement
 *
 * Returns non-zero when PERF_SAMPLE_BRANCH_ANY is requested and the
 * requested USER/KERNEL branch levels equal the levels the event counts.
 */
static inline int precise_br_compat(struct perf_event *event)
{
	u64 m = event->attr.branch_sample_type;
	u64 b = 0;

	/* must capture all branches */
	if (!(m & PERF_SAMPLE_BRANCH_ANY))
		return 0;

	m &= PERF_SAMPLE_BRANCH_KERNEL | PERF_SAMPLE_BRANCH_USER;

	if (!event->attr.exclude_user)
		b |= PERF_SAMPLE_BRANCH_USER;

	if (!event->attr.exclude_kernel)
		b |= PERF_SAMPLE_BRANCH_KERNEL;

	/*
	 * ignore PERF_SAMPLE_BRANCH_HV, not supported on x86
	 */

	return m == b;
}

/*
 * Highest attr.precise_ip level supported: 0 without usable PEBS, otherwise
 * 1, plus one more if an LBR or PEBS format >= 2 is available, plus one more
 * if pebs_prec_dist is set.
 */
int x86_pmu_max_precise(void)
{
	int precise = 0;

	/* Support for constant skid */
	if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) {
		precise++;

		/* Support for IP fixup */
		if (x86_pmu.lbr_nr || x86_pmu.intel_cap.pebs_format >= 2)
			precise++;

		if (x86_pmu.pebs_prec_dist)
			precise++;
	}
	return precise;
}

/*
 * Validate the attributes of @event and build the base hw.config value,
 * then hand over to x86_setup_perfctr().
 *
 * Returns -EOPNOTSUPP for an unsupported precise_ip level or an incompatible
 * branch_sample_type, -EINVAL for the other rejected attribute combinations,
 * otherwise the result of x86_setup_perfctr().
 */
int x86_pmu_hw_config(struct perf_event *event)
{
	if (event->attr.precise_ip) {
		int precise = x86_pmu_max_precise();

		if (event->attr.precise_ip > precise)
			return -EOPNOTSUPP;

		/* There's no sense in having PEBS for non sampling events: */
		if (!is_sampling_event(event))
			return -EINVAL;
	}
	/*
	 * check that PEBS LBR correction does not conflict with
	 * whatever the user is asking with attr->branch_sample_type
	 */
	if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format < 2) {
		u64 *br_type = &event->attr.branch_sample_type;

		if (has_branch_stack(event)) {
			if (!precise_br_compat(event))
				return -EOPNOTSUPP;

			/* branch_sample_type is compatible */

		} else {
			/*
			 * user did not specify branch_sample_type
			 *
			 * For PEBS fixups, we capture all
			 * the branches at the priv level of the
			 * event.
			 */
			*br_type = PERF_SAMPLE_BRANCH_ANY;

			if (!event->attr.exclude_user)
				*br_type |= PERF_SAMPLE_BRANCH_USER;

			if (!event->attr.exclude_kernel)
				*br_type |= PERF_SAMPLE_BRANCH_KERNEL;
		}
	}

	if (branch_sample_call_stack(event))
		event->attach_state |= PERF_ATTACH_TASK_DATA;

	/*
	 * Generate PMC IRQs:
	 * (keep 'enabled' bit clear for now)
	 */
	event->hw.config = ARCH_PERFMON_EVENTSEL_INT;

	/*
	 * Count user and OS events unless requested not to
	 */
	if (!event->attr.exclude_user)
		event->hw.config |= ARCH_PERFMON_EVENTSEL_USR;
	if (!event->attr.exclude_kernel)
		event->hw.config |= ARCH_PERFMON_EVENTSEL_OS;

	if (event->attr.type == event->pmu->type)
		event->hw.config |= x86_pmu_get_event_config(event);

	/*
	 * For fixed-period sampling, reject a period that the PMU's
	 * limit_period callback would have to raise.
	 */
	if (is_sampling_event(event) && !event->attr.freq && x86_pmu.limit_period) {
		s64 left = event->attr.sample_period;
		x86_pmu.limit_period(event, &left);
		if (left > event->attr.sample_period)
			return -EINVAL;
	}

	/* sample_regs_user never support XMM registers */
	if (unlikely(event->attr.sample_regs_user & PERF_REG_EXTENDED_MASK))
		return -EINVAL;
	/*
	 * Besides the general purpose registers, XMM registers may
	 * be collected in PEBS on some platforms, e.g. Icelake
	 */
	if (unlikely(event->attr.sample_regs_intr & PERF_REG_EXTENDED_MASK)) {
		if (!(event->pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS))
			return -EINVAL;

		if (!event->attr.precise_ip)
			return -EINVAL;
	}

	return x86_setup_perfctr(event);
}

/*
 * Setup the hardware configuration for a given attr_type
 *
 * Returns -ENODEV if the PMU is not initialized, the error from
 * x86_reserve_hardware() if the reservation failed, otherwise the result of
 * the x86_pmu.hw_config() callback. Once the hardware is reserved the
 * destroy callback is installed, so the reference is dropped when the event
 * is destroyed.
 */
static int __x86_pmu_event_init(struct perf_event *event)
{
	int err;

	if (!x86_pmu_initialized())
		return -ENODEV;

	err = x86_reserve_hardware();
	if (err)
		return err;

	atomic_inc(&active_events);
	event->destroy = hw_perf_event_destroy;

	event->hw.idx = -1;
	event->hw.last_cpu = -1;
	event->hw.last_tag = ~0ULL;

	/* mark unused */
	event->hw.extra_reg.idx = EXTRA_REG_NONE;
	event->hw.branch_reg.idx = EXTRA_REG_NONE;

	return x86_pmu.hw_config(event);
}

/*
 * Clear the enable bit in the event-select MSR of every counter that is
 * marked active on this CPU and currently enabled in hardware.
 */
void x86_pmu_disable_all(void)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	int idx;

	for_each_set_bit(idx, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) {
		struct hw_perf_event *hwc = &cpuc->events[idx]->hw;
		u64 val;

		if (!test_bit(idx, cpuc->active_mask))
			continue;
		rdmsrl(x86_pmu_config_addr(idx), val);
		if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE))
			continue;
		val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
		wrmsrl(x86_pmu_config_addr(idx), val);
		/* A paired counter also has the next event select zeroed. */
		if (is_counter_pair(hwc))
			wrmsrl(x86_pmu_config_addr(idx + 1), 0);
	}
}

/* Exported wrapper around the x86_pmu_guest_get_msrs static call. */
struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr, void *data)
{
	return static_call(x86_pmu_guest_get_msrs)(nr, data);
}
EXPORT_SYMBOL_GPL(perf_guest_get_msrs);

/*
 * There may be PMI landing after enabled=0. The PMI hitting could be before or
 * after disable_all.
 *
 * If PMI hits before disable_all, the PMU will be disabled in the NMI handler.
 * It will not be re-enabled in the NMI handler again, because enabled=0. After
 * handling the NMI, disable_all will be called, which will not change the
 * state either. If PMI hits after disable_all, the PMU is already disabled
 * before entering NMI handler. The NMI handler will not change the state
 * either.
 *
 * So either situation is harmless.
 */
static void x86_pmu_disable(struct pmu *pmu)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);

	if (!x86_pmu_initialized())
		return;

	if (!cpuc->enabled)
		return;

	cpuc->n_added = 0;
	cpuc->enabled = 0;
	/* Keep the compiler from moving the stores above past the disable call. */
	barrier();

	static_call(x86_pmu_disable_all)();
}

/*
 * Enable every counter that is marked active on this CPU. The @added
 * argument is not used by this implementation.
 */
void x86_pmu_enable_all(int added)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	int idx;

	for_each_set_bit(idx, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) {
		struct hw_perf_event *hwc = &cpuc->events[idx]->hw;

		if (!test_bit(idx, cpuc->active_mask))
			continue;

		__x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
	}
}

/*
 * Non-zero if @event belongs to the x86 core PMU, or to one of the hybrid
 * PMUs when the system is hybrid.
 */
int is_x86_event(struct perf_event *event)
{
	int i;

	if (!is_hybrid())
		return event->pmu == &pmu;

	for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) {
		if (event->pmu == &x86_pmu.hybrid_pmu[i].pmu)
			return true;
	}

	return false;
}

/* Return the PMU recorded for @cpu in its cpu_hw_events. */
struct pmu *x86_get_pmu(unsigned int cpu)
{
	struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);

	/*
	 * All CPUs of the hybrid type have been offline.
	 * The x86_get_pmu() should not be invoked.
	 * Warn once and fall back to the default pmu in that case.
	 */
	if (WARN_ON_ONCE(!cpuc->pmu))
		return &pmu;

	return cpuc->pmu;
}
/*
 * Event scheduler state:
 *
 * Assign events iterating over all events and counters, beginning
 * with events with least weights first. Keep the current iterator
 * state in struct sched_state.
 */
struct sched_state {
	int	weight;
	int	event;		/* event index */
	int	counter;	/* counter index */
	int	unassigned;	/* number of events to be assigned left */
	int	nr_gp;		/* number of GP counters used */
	u64	used;
};

/* Total max is X86_PMC_IDX_MAX, but we are O(n!) limited */
#define	SCHED_STATES_MAX	2

/*
 * Scheduler context: the limits of the current run, the constraints being
 * scheduled, the live iterator state and up to SCHED_STATES_MAX saved
 * copies of it.
 */
struct perf_sched {
	int			max_weight;
	int			max_events;
	int			max_gp;
	int			saved_states;
	struct event_constraint	**constraints;
	struct sched_state	state;
	struct sched_state	saved[SCHED_STATES_MAX];
};

/*
 * Initialize iterator that runs through all events and counters.
*/ static void perf_sched_init(struct perf_sched *sched, struct event_constraint **constraints, int num, int wmin, int wmax, int gpmax) { int idx; memset(sched, 0, sizeof(*sched)); sched->max_events = num; sched->max_weight = wmax; sched->max_gp = gpmax; sched->constraints = constraints; for (idx = 0; idx < num; idx++) { if (constraints[idx]->weight == wmin) break; } sched->state.event = idx; /* start with min weight */ sched->state.weight = wmin; sched->state.unassigned = num; } static void perf_sched_save_state(struct perf_sched *sched) { if (WARN_ON_ONCE(sched->saved_states >= SCHED_STATES_MAX)) return; sched->saved[sched->saved_states] = sched->state; sched->saved_states++; } static bool perf_sched_restore_state(struct perf_sched *sched) { if (!sched->saved_states) return false; sched->saved_states--; sched->state = sched->saved[sched->saved_states]; /* this assignment didn't work out */ /* XXX broken vs EVENT_PAIR */ sched->state.used &= ~BIT_ULL(sched->state.counter); /* try the next one */ sched->state.counter++; return true; } /* * Select a counter for the current event to schedule. Return true on * success. 
*/ static bool __perf_sched_find_counter(struct perf_sched *sched) { struct event_constraint *c; int idx; if (!sched->state.unassigned) return false; if (sched->state.event >= sched->max_events) return false; c = sched->constraints[sched->state.event]; /* Prefer fixed purpose counters */ if (c->idxmsk64 & (~0ULL << INTEL_PMC_IDX_FIXED)) { idx = INTEL_PMC_IDX_FIXED; for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) { u64 mask = BIT_ULL(idx); if (sched->state.used & mask) continue; sched->state.used |= mask; goto done; } } /* Grab the first unused counter starting with idx */ idx = sched->state.counter; for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) { u64 mask = BIT_ULL(idx); if (c->flags & PERF_X86_EVENT_PAIR) mask |= mask << 1; if (sched->state.used & mask) continue; if (sched->state.nr_gp++ >= sched->max_gp) return false; sched->state.used |= mask; goto done; } return false; done: sched->state.counter = idx; if (c->overlap) perf_sched_save_state(sched); return true; } static bool perf_sched_find_counter(struct perf_sched *sched) { while (!__perf_sched_find_counter(sched)) { if (!perf_sched_restore_state(sched)) return false; } return true; } /* * Go through all unassigned events and find the next one to schedule. * Take events with the least weight first. Return true on success. */ static bool perf_sched_next_event(struct perf_sched *sched) { struct event_constraint *c; if (!sched->state.unassigned || !--sched->state.unassigned) return false; do { /* next event */ sched->state.event++; if (sched->state.event >= sched->max_events) { /* next weight */ sched->state.event = 0; sched->state.weight++; if (sched->state.weight > sched->max_weight) return false; } c = sched->constraints[sched->state.event]; } while (c->weight != sched->state.weight); sched->state.counter = 0; /* start with first counter */ return true; } /* * Assign a counter for each event. 
*/ int perf_assign_events(struct event_constraint **constraints, int n, int wmin, int wmax, int gpmax, int *assign) { struct perf_sched sched; perf_sched_init(&sched, constraints, n, wmin, wmax, gpmax); do { if (!perf_sched_find_counter(&sched)) break; /* failed */ if (assign) assign[sched.state.event] = sched.state.counter; } while (perf_sched_next_event(&sched)); return sched.state.unassigned; } EXPORT_SYMBOL_GPL(perf_assign_events); int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) { struct event_constraint *c; struct perf_event *e; int n0, i, wmin, wmax, unsched = 0; struct hw_perf_event *hwc; u64 used_mask = 0; /* * Compute the number of events already present; see x86_pmu_add(), * validate_group() and x86_pmu_commit_txn(). For the former two * cpuc->n_events hasn't been updated yet, while for the latter * cpuc->n_txn contains the number of events added in the current * transaction. */ n0 = cpuc->n_events; if (cpuc->txn_flags & PERF_PMU_TXN_ADD) n0 -= cpuc->n_txn; static_call_cond(x86_pmu_start_scheduling)(cpuc); for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) { c = cpuc->event_constraint[i]; /* * Previously scheduled events should have a cached constraint, * while new events should not have one. */ WARN_ON_ONCE((c && i >= n0) || (!c && i < n0)); /* * Request constraints for new events; or for those events that * have a dynamic constraint -- for those the constraint can * change due to external factors (sibling state, allow_tfa). 
*/ if (!c || (c->flags & PERF_X86_EVENT_DYNAMIC)) { c = static_call(x86_pmu_get_event_constraints)(cpuc, i, cpuc->event_list[i]); cpuc->event_constraint[i] = c; } wmin = min(wmin, c->weight); wmax = max(wmax, c->weight); } /* * fastpath, try to reuse previous register */ for (i = 0; i < n; i++) { u64 mask; hwc = &cpuc->event_list[i]->hw; c = cpuc->event_constraint[i]; /* never assigned */ if (hwc->idx == -1) break; /* constraint still honored */ if (!test_bit(hwc->idx, c->idxmsk)) break; mask = BIT_ULL(hwc->idx); if (is_counter_pair(hwc)) mask |= mask << 1; /* not already used */ if (used_mask & mask) break; used_mask |= mask; if (assign) assign[i] = hwc->idx; } /* slow path */ if (i != n) { int gpmax = x86_pmu_max_num_counters(cpuc->pmu); /* * Do not allow scheduling of more than half the available * generic counters. * * This helps avoid counter starvation of sibling thread by * ensuring at most half the counters cannot be in exclusive * mode. There is no designated counters for the limits. Any * N/2 counters can be used. This helps with events with * specific counter constraints. */ if (is_ht_workaround_enabled() && !cpuc->is_fake && READ_ONCE(cpuc->excl_cntrs->exclusive_present)) gpmax /= 2; /* * Reduce the amount of available counters to allow fitting * the extra Merge events needed by large increment events. 
*/ if (x86_pmu.flags & PMU_FL_PAIR) { gpmax -= cpuc->n_pair; WARN_ON(gpmax <= 0); } unsched = perf_assign_events(cpuc->event_constraint, n, wmin, wmax, gpmax, assign); } /* * In case of success (unsched = 0), mark events as committed, * so we do not put_constraint() in case new events are added * and fail to be scheduled * * We invoke the lower level commit callback to lock the resource * * We do not need to do all of this in case we are called to * validate an event group (assign == NULL) */ if (!unsched && assign) { for (i = 0; i < n; i++) static_call_cond(x86_pmu_commit_scheduling)(cpuc, i, assign[i]); } else { for (i = n0; i < n; i++) { e = cpuc->event_list[i]; /* * release events that failed scheduling */ static_call_cond(x86_pmu_put_event_constraints)(cpuc, e); cpuc->event_constraint[i] = NULL; } } static_call_cond(x86_pmu_stop_scheduling)(cpuc); return unsched ? -EINVAL : 0; } static int add_nr_metric_event(struct cpu_hw_events *cpuc, struct perf_event *event) { if (is_metric_event(event)) { if (cpuc->n_metric == INTEL_TD_METRIC_NUM) return -EINVAL; cpuc->n_metric++; cpuc->n_txn_metric++; } return 0; } static void del_nr_metric_event(struct cpu_hw_events *cpuc, struct perf_event *event) { if (is_metric_event(event)) cpuc->n_metric--; } static int collect_event(struct cpu_hw_events *cpuc, struct perf_event *event, int max_count, int n) { union perf_capabilities intel_cap = hybrid(cpuc->pmu, intel_cap); if (intel_cap.perf_metrics && add_nr_metric_event(cpuc, event)) return -EINVAL; if (n >= max_count + cpuc->n_metric) return -EINVAL; cpuc->event_list[n] = event; if (is_counter_pair(&event->hw)) { cpuc->n_pair++; cpuc->n_txn_pair++; } return 0; } /* * dogrp: true if must collect siblings events (group) * returns total number of events and error code */ static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp) { struct perf_event *event; int n, max_count; max_count = x86_pmu_num_counters(cpuc->pmu) + 
x86_pmu_num_counters_fixed(cpuc->pmu); /* current number of events already accepted */ n = cpuc->n_events; if (!cpuc->n_events) cpuc->pebs_output = 0; if (!cpuc->is_fake && leader->attr.precise_ip) { /* * For PEBS->PT, if !aux_event, the group leader (PT) went * away, the group was broken down and this singleton event * can't schedule any more. */ if (is_pebs_pt(leader) && !leader->aux_event) return -EINVAL; /* * pebs_output: 0: no PEBS so far, 1: PT, 2: DS */ if (cpuc->pebs_output && cpuc->pebs_output != is_pebs_pt(leader) + 1) return -EINVAL; cpuc->pebs_output = is_pebs_pt(leader) + 1; } if (is_x86_event(leader)) { if (collect_event(cpuc, leader, max_count, n)) return -EINVAL; n++; } if (!dogrp) return n; for_each_sibling_event(event, leader) { if (!is_x86_event(event) || event->state <= PERF_EVENT_STATE_OFF) continue; if (collect_event(cpuc, event, max_count, n)) return -EINVAL; n++; } return n; } static inline void x86_assign_hw_event(struct perf_event *event, struct cpu_hw_events *cpuc, int i) { struct hw_perf_event *hwc = &event->hw; int idx; idx = hwc->idx = cpuc->assign[i]; hwc->last_cpu = smp_processor_id(); hwc->last_tag = ++cpuc->tags[i]; static_call_cond(x86_pmu_assign)(event, idx); switch (hwc->idx) { case INTEL_PMC_IDX_FIXED_BTS: case INTEL_PMC_IDX_FIXED_VLBR: hwc->config_base = 0; hwc->event_base = 0; break; case INTEL_PMC_IDX_METRIC_BASE ... INTEL_PMC_IDX_METRIC_END: /* All the metric events are mapped onto the fixed counter 3. */ idx = INTEL_PMC_IDX_FIXED_SLOTS; fallthrough; case INTEL_PMC_IDX_FIXED ... 
INTEL_PMC_IDX_FIXED_BTS-1: hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; hwc->event_base = x86_pmu_fixed_ctr_addr(idx - INTEL_PMC_IDX_FIXED); hwc->event_base_rdpmc = (idx - INTEL_PMC_IDX_FIXED) | INTEL_PMC_FIXED_RDPMC_BASE; break; default: hwc->config_base = x86_pmu_config_addr(hwc->idx); hwc->event_base = x86_pmu_event_addr(hwc->idx); hwc->event_base_rdpmc = x86_pmu_rdpmc_index(hwc->idx); break; } } /** * x86_perf_rdpmc_index - Return PMC counter used for event * @event: the perf_event to which the PMC counter was assigned * * The counter assigned to this performance event may change if interrupts * are enabled. This counter should thus never be used while interrupts are * enabled. Before this function is used to obtain the assigned counter the * event should be checked for validity using, for example, * perf_event_read_local(), within the same interrupt disabled section in * which this counter is planned to be used. * * Return: The index of the performance monitoring counter assigned to * @perf_event. */ int x86_perf_rdpmc_index(struct perf_event *event) { lockdep_assert_irqs_disabled(); return event->hw.event_base_rdpmc; } static inline int match_prev_assignment(struct hw_perf_event *hwc, struct cpu_hw_events *cpuc, int i) { return hwc->idx == cpuc->assign[i] && hwc->last_cpu == smp_processor_id() && hwc->last_tag == cpuc->tags[i]; } static void x86_pmu_start(struct perf_event *event, int flags); static void x86_pmu_enable(struct pmu *pmu) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct perf_event *event; struct hw_perf_event *hwc; int i, added = cpuc->n_added; if (!x86_pmu_initialized()) return; if (cpuc->enabled) return; if (cpuc->n_added) { int n_running = cpuc->n_events - cpuc->n_added; /* * The late setup (after counters are scheduled) * is required for some cases, e.g., PEBS counters * snapshotting. Because an accurate counter index * is needed. 
*/ static_call_cond(x86_pmu_late_setup)(); /* * apply assignment obtained either from * hw_perf_group_sched_in() or x86_pmu_enable() * * step1: save events moving to new counters */ for (i = 0; i < n_running; i++) { event = cpuc->event_list[i]; hwc = &event->hw; /* * we can avoid reprogramming counter if: * - assigned same counter as last time * - running on same CPU as last time * - no other event has used the counter since */ if (hwc->idx == -1 || match_prev_assignment(hwc, cpuc, i)) continue; /* * Ensure we don't accidentally enable a stopped * counter simply because we rescheduled. */ if (hwc->state & PERF_HES_STOPPED) hwc->state |= PERF_HES_ARCH; x86_pmu_stop(event, PERF_EF_UPDATE); } /* * step2: reprogram moved events into new counters */ for (i = 0; i < cpuc->n_events; i++) { event = cpuc->event_list[i]; hwc = &event->hw; if (!match_prev_assignment(hwc, cpuc, i)) x86_assign_hw_event(event, cpuc, i); else if (i < n_running) continue; if (hwc->state & PERF_HES_ARCH) continue; /* * if cpuc->enabled = 0, then no wrmsr as * per x86_pmu_enable_event() */ x86_pmu_start(event, PERF_EF_RELOAD); } cpuc->n_added = 0; perf_events_lapic_init(); } cpuc->enabled = 1; barrier(); static_call(x86_pmu_enable_all)(added); } DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); /* * Set the next IRQ period, based on the hwc->period_left value. 
* To be called with the event disabled in hw: */ int x86_perf_event_set_period(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; s64 left = local64_read(&hwc->period_left); s64 period = hwc->sample_period; int ret = 0, idx = hwc->idx; if (unlikely(!hwc->event_base)) return 0; /* * If we are way outside a reasonable range then just skip forward: */ if (unlikely(left <= -period)) { left = period; local64_set(&hwc->period_left, left); hwc->last_period = period; ret = 1; } if (unlikely(left <= 0)) { left += period; local64_set(&hwc->period_left, left); hwc->last_period = period; ret = 1; } /* * Quirk: certain CPUs dont like it if just 1 hw_event is left: */ if (unlikely(left < 2)) left = 2; if (left > x86_pmu.max_period) left = x86_pmu.max_period; static_call_cond(x86_pmu_limit_period)(event, &left); this_cpu_write(pmc_prev_left[idx], left); /* * The hw event starts counting from this event offset, * mark it to be able to extra future deltas: */ local64_set(&hwc->prev_count, (u64)-left); wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask); /* * Sign extend the Merge event counter's upper 16 bits since * we currently declare a 48-bit counter width */ if (is_counter_pair(hwc)) wrmsrl(x86_pmu_event_addr(idx + 1), 0xffff); perf_event_update_userpage(event); return ret; } void x86_pmu_enable_event(struct perf_event *event) { if (__this_cpu_read(cpu_hw_events.enabled)) __x86_pmu_enable_event(&event->hw, ARCH_PERFMON_EVENTSEL_ENABLE); } /* * Add a single event to the PMU. * * The event is added to the group of enabled events * but only if it can be scheduled with existing events. 
*/ static int x86_pmu_add(struct perf_event *event, int flags) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct hw_perf_event *hwc; int assign[X86_PMC_IDX_MAX]; int n, n0, ret; hwc = &event->hw; n0 = cpuc->n_events; ret = n = collect_events(cpuc, event, false); if (ret < 0) goto out; hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; if (!(flags & PERF_EF_START)) hwc->state |= PERF_HES_ARCH; /* * If group events scheduling transaction was started, * skip the schedulability test here, it will be performed * at commit time (->commit_txn) as a whole. * * If commit fails, we'll call ->del() on all events * for which ->add() was called. */ if (cpuc->txn_flags & PERF_PMU_TXN_ADD) goto done_collect; ret = static_call(x86_pmu_schedule_events)(cpuc, n, assign); if (ret) goto out; /* * copy new assignment, now we know it is possible * will be used by hw_perf_enable() */ memcpy(cpuc->assign, assign, n*sizeof(int)); done_collect: /* * Commit the collect_events() state. See x86_pmu_del() and * x86_pmu_*_txn(). */ cpuc->n_events = n; cpuc->n_added += n - n0; cpuc->n_txn += n - n0; /* * This is before x86_pmu_enable() will call x86_pmu_start(), * so we enable LBRs before an event needs them etc.. 
*/ static_call_cond(x86_pmu_add)(event); ret = 0; out: return ret; } static void x86_pmu_start(struct perf_event *event, int flags) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int idx = event->hw.idx; if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) return; if (WARN_ON_ONCE(idx == -1)) return; if (flags & PERF_EF_RELOAD) { WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE)); static_call(x86_pmu_set_period)(event); } event->hw.state = 0; cpuc->events[idx] = event; __set_bit(idx, cpuc->active_mask); static_call(x86_pmu_enable)(event); perf_event_update_userpage(event); } void perf_event_print_debug(void) { u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed; unsigned long *cntr_mask, *fixed_cntr_mask; struct event_constraint *pebs_constraints; struct cpu_hw_events *cpuc; u64 pebs, debugctl; int cpu, idx; guard(irqsave)(); cpu = smp_processor_id(); cpuc = &per_cpu(cpu_hw_events, cpu); cntr_mask = hybrid(cpuc->pmu, cntr_mask); fixed_cntr_mask = hybrid(cpuc->pmu, fixed_cntr_mask); pebs_constraints = hybrid(cpuc->pmu, pebs_constraints); if (!*(u64 *)cntr_mask) return; if (x86_pmu.version >= 2) { rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed); pr_info("\n"); pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl); pr_info("CPU#%d: status: %016llx\n", cpu, status); pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow); pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed); if (pebs_constraints) { rdmsrl(MSR_IA32_PEBS_ENABLE, pebs); pr_info("CPU#%d: pebs: %016llx\n", cpu, pebs); } if (x86_pmu.lbr_nr) { rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); pr_info("CPU#%d: debugctl: %016llx\n", cpu, debugctl); } } pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask); for_each_set_bit(idx, cntr_mask, X86_PMC_IDX_MAX) { rdmsrl(x86_pmu_config_addr(idx), pmc_ctrl); rdmsrl(x86_pmu_event_addr(idx), pmc_count); prev_left = 
per_cpu(pmc_prev_left[idx], cpu); pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n", cpu, idx, pmc_ctrl); pr_info("CPU#%d: gen-PMC%d count: %016llx\n", cpu, idx, pmc_count); pr_info("CPU#%d: gen-PMC%d left: %016llx\n", cpu, idx, prev_left); } for_each_set_bit(idx, fixed_cntr_mask, X86_PMC_IDX_MAX) { if (fixed_counter_disabled(idx, cpuc->pmu)) continue; rdmsrl(x86_pmu_fixed_ctr_addr(idx), pmc_count); pr_info("CPU#%d: fixed-PMC%d count: %016llx\n", cpu, idx, pmc_count); } } void x86_pmu_stop(struct perf_event *event, int flags) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct hw_perf_event *hwc = &event->hw; if (test_bit(hwc->idx, cpuc->active_mask)) { static_call(x86_pmu_disable)(event); __clear_bit(hwc->idx, cpuc->active_mask); cpuc->events[hwc->idx] = NULL; WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); hwc->state |= PERF_HES_STOPPED; } if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { /* * Drain the remaining delta count out of a event * that we are disabling: */ static_call(x86_pmu_update)(event); hwc->state |= PERF_HES_UPTODATE; } } static void x86_pmu_del(struct perf_event *event, int flags) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); union perf_capabilities intel_cap = hybrid(cpuc->pmu, intel_cap); int i; /* * If we're called during a txn, we only need to undo x86_pmu.add. * The events never got scheduled and ->cancel_txn will truncate * the event_list. * * XXX assumes any ->del() called during a TXN will only be on * an event added during that same TXN. */ if (cpuc->txn_flags & PERF_PMU_TXN_ADD) goto do_del; __set_bit(event->hw.idx, cpuc->dirty); /* * Not a TXN, therefore cleanup properly. */ x86_pmu_stop(event, PERF_EF_UPDATE); for (i = 0; i < cpuc->n_events; i++) { if (event == cpuc->event_list[i]) break; } if (WARN_ON_ONCE(i == cpuc->n_events)) /* called ->del() without ->add() ? */ return; /* If we have a newly added event; make sure to decrease n_added. 
*/ if (i >= cpuc->n_events - cpuc->n_added) --cpuc->n_added; static_call_cond(x86_pmu_put_event_constraints)(cpuc, event); /* Delete the array entry. */ while (++i < cpuc->n_events) { cpuc->event_list[i-1] = cpuc->event_list[i]; cpuc->event_constraint[i-1] = cpuc->event_constraint[i]; cpuc->assign[i-1] = cpuc->assign[i]; } cpuc->event_constraint[i-1] = NULL; --cpuc->n_events; if (intel_cap.perf_metrics) del_nr_metric_event(cpuc, event); perf_event_update_userpage(event); do_del: /* * This is after x86_pmu_stop(); so we disable LBRs after any * event can need them etc.. */ static_call_cond(x86_pmu_del)(event); } int x86_pmu_handle_irq(struct pt_regs *regs) { struct perf_sample_data data; struct cpu_hw_events *cpuc; struct perf_event *event; int idx, handled = 0; u64 val; cpuc = this_cpu_ptr(&cpu_hw_events); /* * Some chipsets need to unmask the LVTPC in a particular spot * inside the nmi handler. As a result, the unmasking was pushed * into all the nmi handlers. * * This generic handler doesn't seem to have any issues where the * unmasking occurs so it was left at the top. 
*/ apic_write(APIC_LVTPC, APIC_DM_NMI); for_each_set_bit(idx, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) { if (!test_bit(idx, cpuc->active_mask)) continue; event = cpuc->events[idx]; val = static_call(x86_pmu_update)(event); if (val & (1ULL << (x86_pmu.cntval_bits - 1))) continue; /* * event overflow */ handled++; if (!static_call(x86_pmu_set_period)(event)) continue; perf_sample_data_init(&data, 0, event->hw.last_period); perf_sample_save_brstack(&data, event, &cpuc->lbr_stack, NULL); if (perf_event_overflow(event, &data, regs)) x86_pmu_stop(event, 0); } if (handled) inc_irq_stat(apic_perf_irqs); return handled; } void perf_events_lapic_init(void) { if (!x86_pmu.apic || !x86_pmu_initialized()) return; /* * Always use NMI for PMU */ apic_write(APIC_LVTPC, APIC_DM_NMI); } static int perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs) { u64 start_clock; u64 finish_clock; int ret; /* * All PMUs/events that share this PMI handler should make sure to * increment active_events for their events. 
*/ if (!atomic_read(&active_events)) return NMI_DONE; start_clock = sched_clock(); ret = static_call(x86_pmu_handle_irq)(regs); finish_clock = sched_clock(); perf_sample_event_took(finish_clock - start_clock); return ret; } NOKPROBE_SYMBOL(perf_event_nmi_handler); struct event_constraint emptyconstraint; struct event_constraint unconstrained; static int x86_pmu_prepare_cpu(unsigned int cpu) { struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); int i; for (i = 0 ; i < X86_PERF_KFREE_MAX; i++) cpuc->kfree_on_online[i] = NULL; if (x86_pmu.cpu_prepare) return x86_pmu.cpu_prepare(cpu); return 0; } static int x86_pmu_dead_cpu(unsigned int cpu) { if (x86_pmu.cpu_dead) x86_pmu.cpu_dead(cpu); return 0; } static int x86_pmu_online_cpu(unsigned int cpu) { struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); int i; for (i = 0 ; i < X86_PERF_KFREE_MAX; i++) { kfree(cpuc->kfree_on_online[i]); cpuc->kfree_on_online[i] = NULL; } return 0; } static int x86_pmu_starting_cpu(unsigned int cpu) { if (x86_pmu.cpu_starting) x86_pmu.cpu_starting(cpu); return 0; } static int x86_pmu_dying_cpu(unsigned int cpu) { if (x86_pmu.cpu_dying) x86_pmu.cpu_dying(cpu); return 0; } static void __init pmu_check_apic(void) { if (boot_cpu_has(X86_FEATURE_APIC)) return; x86_pmu.apic = 0; pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n"); pr_info("no hardware sampling interrupt available.\n"); /* * If we have a PMU initialized but no APIC * interrupts, we cannot sample hardware * events (user-space has to fall back and * sample via a hrtimer based software event): */ pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT; } static struct attribute_group x86_pmu_format_group __ro_after_init = { .name = "format", .attrs = NULL, }; ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, char *page) { struct perf_pmu_events_attr *pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr); u64 config = 0; if (pmu_attr->id < x86_pmu.max_events) config 
= x86_pmu.event_map(pmu_attr->id); /* string trumps id */ if (pmu_attr->event_str) return sprintf(page, "%s\n", pmu_attr->event_str); return x86_pmu.events_sysfs_show(page, config); } EXPORT_SYMBOL_GPL(events_sysfs_show); ssize_t events_ht_sysfs_show(struct device *dev, struct device_attribute *attr, char *page) { struct perf_pmu_events_ht_attr *pmu_attr = container_of(attr, struct perf_pmu_events_ht_attr, attr); /* * Report conditional events depending on Hyper-Threading. * * This is overly conservative as usually the HT special * handling is not needed if the other CPU thread is idle. * * Note this does not (and cannot) handle the case when thread * siblings are invisible, for example with virtualization * if they are owned by some other guest. The user tool * has to re-read when a thread sibling gets onlined later. */ return sprintf(page, "%s", topology_max_smt_threads() > 1 ? pmu_attr->event_str_ht : pmu_attr->event_str_noht); } ssize_t events_hybrid_sysfs_show(struct device *dev, struct device_attribute *attr, char *page) { struct perf_pmu_events_hybrid_attr *pmu_attr = container_of(attr, struct perf_pmu_events_hybrid_attr, attr); struct x86_hybrid_pmu *pmu; const char *str, *next_str; int i; if (hweight64(pmu_attr->pmu_type) == 1) return sprintf(page, "%s", pmu_attr->event_str); /* * Hybrid PMUs may support the same event name, but with different * event encoding, e.g., the mem-loads event on an Atom PMU has * different event encoding from a Core PMU. * * The event_str includes all event encodings. Each event encoding * is divided by ";". The order of the event encodings must follow * the order of the hybrid PMU index. 
*/ pmu = container_of(dev_get_drvdata(dev), struct x86_hybrid_pmu, pmu); str = pmu_attr->event_str; for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) { if (!(x86_pmu.hybrid_pmu[i].pmu_type & pmu_attr->pmu_type)) continue; if (x86_pmu.hybrid_pmu[i].pmu_type & pmu->pmu_type) { next_str = strchr(str, ';'); if (next_str) return snprintf(page, next_str - str + 1, "%s", str); else return sprintf(page, "%s", str); } str = strchr(str, ';'); str++; } return 0; } EXPORT_SYMBOL_GPL(events_hybrid_sysfs_show); EVENT_ATTR(cpu-cycles, CPU_CYCLES ); EVENT_ATTR(instructions, INSTRUCTIONS ); EVENT_ATTR(cache-references, CACHE_REFERENCES ); EVENT_ATTR(cache-misses, CACHE_MISSES ); EVENT_ATTR(branch-instructions, BRANCH_INSTRUCTIONS ); EVENT_ATTR(branch-misses, BRANCH_MISSES ); EVENT_ATTR(bus-cycles, BUS_CYCLES ); EVENT_ATTR(stalled-cycles-frontend, STALLED_CYCLES_FRONTEND ); EVENT_ATTR(stalled-cycles-backend, STALLED_CYCLES_BACKEND ); EVENT_ATTR(ref-cycles, REF_CPU_CYCLES ); static struct attribute *empty_attrs; static struct attribute *events_attr[] = { EVENT_PTR(CPU_CYCLES), EVENT_PTR(INSTRUCTIONS), EVENT_PTR(CACHE_REFERENCES), EVENT_PTR(CACHE_MISSES), EVENT_PTR(BRANCH_INSTRUCTIONS), EVENT_PTR(BRANCH_MISSES), EVENT_PTR(BUS_CYCLES), EVENT_PTR(STALLED_CYCLES_FRONTEND), EVENT_PTR(STALLED_CYCLES_BACKEND), EVENT_PTR(REF_CPU_CYCLES), NULL, }; /* * Remove all undefined events (x86_pmu.event_map(id) == 0) * out of events_attr attributes. */ static umode_t is_visible(struct kobject *kobj, struct attribute *attr, int idx) { struct perf_pmu_events_attr *pmu_attr; if (idx >= x86_pmu.max_events) return 0; pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr.attr); /* str trumps id */ return pmu_attr->event_str || x86_pmu.event_map(idx) ? 
attr->mode : 0; } static struct attribute_group x86_pmu_events_group __ro_after_init = { .name = "events", .attrs = events_attr, .is_visible = is_visible, }; ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event) { u64 umask = (config & ARCH_PERFMON_EVENTSEL_UMASK) >> 8; u64 cmask = (config & ARCH_PERFMON_EVENTSEL_CMASK) >> 24; bool edge = (config & ARCH_PERFMON_EVENTSEL_EDGE); bool pc = (config & ARCH_PERFMON_EVENTSEL_PIN_CONTROL); bool any = (config & ARCH_PERFMON_EVENTSEL_ANY); bool inv = (config & ARCH_PERFMON_EVENTSEL_INV); ssize_t ret; /* * We have whole page size to spend and just little data * to write, so we can safely use sprintf. */ ret = sprintf(page, "event=0x%02llx", event); if (umask) ret += sprintf(page + ret, ",umask=0x%02llx", umask); if (edge) ret += sprintf(page + ret, ",edge"); if (pc) ret += sprintf(page + ret, ",pc"); if (any) ret += sprintf(page + ret, ",any"); if (inv) ret += sprintf(page + ret, ",inv"); if (cmask) ret += sprintf(page + ret, ",cmask=0x%02llx", cmask); ret += sprintf(page + ret, "\n"); return ret; } static struct attribute_group x86_pmu_attr_group; static struct attribute_group x86_pmu_caps_group; static void x86_pmu_static_call_update(void) { static_call_update(x86_pmu_handle_irq, x86_pmu.handle_irq); static_call_update(x86_pmu_disable_all, x86_pmu.disable_all); static_call_update(x86_pmu_enable_all, x86_pmu.enable_all); static_call_update(x86_pmu_enable, x86_pmu.enable); static_call_update(x86_pmu_disable, x86_pmu.disable); static_call_update(x86_pmu_assign, x86_pmu.assign); static_call_update(x86_pmu_add, x86_pmu.add); static_call_update(x86_pmu_del, x86_pmu.del); static_call_update(x86_pmu_read, x86_pmu.read); static_call_update(x86_pmu_set_period, x86_pmu.set_period); static_call_update(x86_pmu_update, x86_pmu.update); static_call_update(x86_pmu_limit_period, x86_pmu.limit_period); static_call_update(x86_pmu_schedule_events, x86_pmu.schedule_events); static_call_update(x86_pmu_get_event_constraints, 
x86_pmu.get_event_constraints); static_call_update(x86_pmu_put_event_constraints, x86_pmu.put_event_constraints); static_call_update(x86_pmu_start_scheduling, x86_pmu.start_scheduling); static_call_update(x86_pmu_commit_scheduling, x86_pmu.commit_scheduling); static_call_update(x86_pmu_stop_scheduling, x86_pmu.stop_scheduling); static_call_update(x86_pmu_sched_task, x86_pmu.sched_task); static_call_update(x86_pmu_drain_pebs, x86_pmu.drain_pebs); static_call_update(x86_pmu_pebs_aliases, x86_pmu.pebs_aliases); static_call_update(x86_pmu_guest_get_msrs, x86_pmu.guest_get_msrs); static_call_update(x86_pmu_filter, x86_pmu.filter); static_call_update(x86_pmu_late_setup, x86_pmu.late_setup); } static void _x86_pmu_read(struct perf_event *event) { static_call(x86_pmu_update)(event); } void x86_pmu_show_pmu_cap(struct pmu *pmu) { pr_info("... version: %d\n", x86_pmu.version); pr_info("... bit width: %d\n", x86_pmu.cntval_bits); pr_info("... generic registers: %d\n", x86_pmu_num_counters(pmu)); pr_info("... value mask: %016Lx\n", x86_pmu.cntval_mask); pr_info("... max period: %016Lx\n", x86_pmu.max_period); pr_info("... fixed-purpose events: %d\n", x86_pmu_num_counters_fixed(pmu)); pr_info("... 
event mask: %016Lx\n", hybrid(pmu, intel_ctrl)); } static int __init init_hw_perf_events(void) { struct x86_pmu_quirk *quirk; int err; pr_info("Performance Events: "); switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_INTEL: err = intel_pmu_init(); break; case X86_VENDOR_AMD: err = amd_pmu_init(); break; case X86_VENDOR_HYGON: err = amd_pmu_init(); x86_pmu.name = "HYGON"; break; case X86_VENDOR_ZHAOXIN: case X86_VENDOR_CENTAUR: err = zhaoxin_pmu_init(); break; default: err = -ENOTSUPP; } if (err != 0) { pr_cont("no PMU driver, software events only.\n"); err = 0; goto out_bad_pmu; } pmu_check_apic(); /* sanity check that the hardware exists or is emulated */ if (!check_hw_exists(&pmu, x86_pmu.cntr_mask, x86_pmu.fixed_cntr_mask)) goto out_bad_pmu; pr_cont("%s PMU driver.\n", x86_pmu.name); x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */ for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next) quirk->func(); if (!x86_pmu.intel_ctrl) x86_pmu.intel_ctrl = x86_pmu.cntr_mask64; if (!x86_pmu.config_mask) x86_pmu.config_mask = X86_RAW_EVENT_MASK; perf_events_lapic_init(); register_nmi_handler(NMI_LOCAL, perf_event_nmi_handler, 0, "PMI"); unconstrained = (struct event_constraint) __EVENT_CONSTRAINT(0, x86_pmu.cntr_mask64, 0, x86_pmu_num_counters(NULL), 0, 0); x86_pmu_format_group.attrs = x86_pmu.format_attrs; if (!x86_pmu.events_sysfs_show) x86_pmu_events_group.attrs = &empty_attrs; pmu.attr_update = x86_pmu.attr_update; if (!is_hybrid()) x86_pmu_show_pmu_cap(NULL); if (!x86_pmu.read) x86_pmu.read = _x86_pmu_read; if (!x86_pmu.guest_get_msrs) x86_pmu.guest_get_msrs = (void *)&__static_call_return0; if (!x86_pmu.set_period) x86_pmu.set_period = x86_perf_event_set_period; if (!x86_pmu.update) x86_pmu.update = x86_perf_event_update; x86_pmu_static_call_update(); /* * Install callbacks. Core will call them for each online * cpu. 
*/ err = cpuhp_setup_state(CPUHP_PERF_X86_PREPARE, "perf/x86:prepare", x86_pmu_prepare_cpu, x86_pmu_dead_cpu); if (err) return err; err = cpuhp_setup_state(CPUHP_AP_PERF_X86_STARTING, "perf/x86:starting", x86_pmu_starting_cpu, x86_pmu_dying_cpu); if (err) goto out; err = cpuhp_setup_state(CPUHP_AP_PERF_X86_ONLINE, "perf/x86:online", x86_pmu_online_cpu, NULL); if (err) goto out1; if (!is_hybrid()) { err = perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW); if (err) goto out2; } else { struct x86_hybrid_pmu *hybrid_pmu; int i, j; for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) { hybrid_pmu = &x86_pmu.hybrid_pmu[i]; hybrid_pmu->pmu = pmu; hybrid_pmu->pmu.type = -1; hybrid_pmu->pmu.attr_update = x86_pmu.attr_update; hybrid_pmu->pmu.capabilities |= PERF_PMU_CAP_EXTENDED_HW_TYPE; err = perf_pmu_register(&hybrid_pmu->pmu, hybrid_pmu->name, (hybrid_pmu->pmu_type == hybrid_big) ? PERF_TYPE_RAW : -1); if (err) break; } if (i < x86_pmu.num_hybrid_pmus) { for (j = 0; j < i; j++) perf_pmu_unregister(&x86_pmu.hybrid_pmu[j].pmu); pr_warn("Failed to register hybrid PMUs\n"); kfree(x86_pmu.hybrid_pmu); x86_pmu.hybrid_pmu = NULL; x86_pmu.num_hybrid_pmus = 0; goto out2; } } return 0; out2: cpuhp_remove_state(CPUHP_AP_PERF_X86_ONLINE); out1: cpuhp_remove_state(CPUHP_AP_PERF_X86_STARTING); out: cpuhp_remove_state(CPUHP_PERF_X86_PREPARE); out_bad_pmu: memset(&x86_pmu, 0, sizeof(x86_pmu)); return err; } early_initcall(init_hw_perf_events); static void x86_pmu_read(struct perf_event *event) { static_call(x86_pmu_read)(event); } /* * Start group events scheduling transaction * Set the flag to make pmu::enable() not perform the * schedulability test, it will be performed at commit time * * We only support PERF_PMU_TXN_ADD transactions. Save the * transaction flags but otherwise ignore non-PERF_PMU_TXN_ADD * transactions. 
*/
static void x86_pmu_start_txn(struct pmu *pmu, unsigned int txn_flags)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);

	WARN_ON_ONCE(cpuc->txn_flags);		/* txn already in flight */

	/* The flags are recorded for every transaction type ... */
	cpuc->txn_flags = txn_flags;
	/* ... but only PERF_PMU_TXN_ADD transactions do any further setup. */
	if (txn_flags & ~PERF_PMU_TXN_ADD)
		return;

	perf_pmu_disable(pmu);
	/* Start the per-transaction counters of this CPU from zero. */
	__this_cpu_write(cpu_hw_events.n_txn, 0);
	__this_cpu_write(cpu_hw_events.n_txn_pair, 0);
	__this_cpu_write(cpu_hw_events.n_txn_metric, 0);
}

/*
 * Stop group events scheduling transaction
 * Clear the flag and pmu::enable() will perform the
 * schedulability test.
 */
static void x86_pmu_cancel_txn(struct pmu *pmu)
{
	unsigned int txn_flags;
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);

	WARN_ON_ONCE(!cpuc->txn_flags);	/* no txn in flight */

	/* Keep a copy of the flags: they are cleared before being tested. */
	txn_flags = cpuc->txn_flags;
	cpuc->txn_flags = 0;
	if (txn_flags & ~PERF_PMU_TXN_ADD)
		return;

	/*
	 * Truncate collected array by the number of events added in this
	 * transaction. See x86_pmu_add() and x86_pmu_*_txn().
	 */
	__this_cpu_sub(cpu_hw_events.n_added, __this_cpu_read(cpu_hw_events.n_txn));
	__this_cpu_sub(cpu_hw_events.n_events, __this_cpu_read(cpu_hw_events.n_txn));
	__this_cpu_sub(cpu_hw_events.n_pair, __this_cpu_read(cpu_hw_events.n_txn_pair));
	__this_cpu_sub(cpu_hw_events.n_metric, __this_cpu_read(cpu_hw_events.n_txn_metric));
	/* Counterpart of the perf_pmu_disable() done in x86_pmu_start_txn(). */
	perf_pmu_enable(pmu);
}

/*
 * Commit group events scheduling transaction
 * Perform the group schedulability test as a whole
 * Return 0 if success
 *
 * Does not cancel the transaction on failure; expects the caller to do this.
*/
static int x86_pmu_commit_txn(struct pmu *pmu)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	int assign[X86_PMC_IDX_MAX];
	int n, ret;

	WARN_ON_ONCE(!cpuc->txn_flags);	/* no txn in flight */

	/* Anything other than a pure ADD transaction is just closed out. */
	if (cpuc->txn_flags & ~PERF_PMU_TXN_ADD) {
		cpuc->txn_flags = 0;
		return 0;
	}

	n = cpuc->n_events;

	if (!x86_pmu_initialized())
		return -EAGAIN;

	/* On failure the transaction stays open (txn_flags is left set). */
	ret = static_call(x86_pmu_schedule_events)(cpuc, n, assign);
	if (ret)
		return ret;

	/*
	 * copy new assignment, now we know it is possible
	 * will be used by hw_perf_enable()
	 */
	memcpy(cpuc->assign, assign, n*sizeof(int));

	cpuc->txn_flags = 0;
	perf_pmu_enable(pmu);
	return 0;
}
/*
 * a fake_cpuc is used to validate event groups. Due to
 * the extra reg logic, we need to also allocate a fake
 * per_core and per_cpu structure. Otherwise, group events
 * using extra reg may conflict without the kernel being
 * able to catch this when the last event gets added to
 * the group.
 */
static void free_fake_cpuc(struct cpu_hw_events *cpuc)
{
	intel_cpuc_finish(cpuc);
	kfree(cpuc);
}

/*
 * Allocate and prepare a zeroed cpu_hw_events marked as fake for @event_pmu.
 * Returns the new structure, or ERR_PTR(-ENOMEM) on any failure (including
 * a hybrid PMU whose supported_cpus mask is empty).
 */
static struct cpu_hw_events *allocate_fake_cpuc(struct pmu *event_pmu)
{
	struct cpu_hw_events *cpuc;
	int cpu;

	cpuc = kzalloc(sizeof(*cpuc), GFP_KERNEL);
	if (!cpuc)
		return ERR_PTR(-ENOMEM);
	cpuc->is_fake = 1;

	/* On hybrid systems use the first CPU the PMU supports. */
	if (is_hybrid()) {
		struct x86_hybrid_pmu *h_pmu;

		h_pmu = hybrid_pmu(event_pmu);
		if (cpumask_empty(&h_pmu->supported_cpus))
			goto error;
		cpu = cpumask_first(&h_pmu->supported_cpus);
	} else
		cpu = raw_smp_processor_id();
	cpuc->pmu = event_pmu;

	if (intel_cpuc_prepare(cpuc, cpu))
		goto error;

	return cpuc;
error:
	free_fake_cpuc(cpuc);
	return ERR_PTR(-ENOMEM);
}

/*
 * validate that we can schedule this event
 */
static int validate_event(struct perf_event *event)
{
	struct cpu_hw_events *fake_cpuc;
	struct event_constraint *c;
	int ret = 0;

	fake_cpuc = allocate_fake_cpuc(event->pmu);
	if (IS_ERR(fake_cpuc))
		return PTR_ERR(fake_cpuc);

	c = x86_pmu.get_event_constraints(fake_cpuc, 0, event);

	/* A missing constraint or one with zero weight is rejected. */
	if (!c || !c->weight)
		ret = -EINVAL;

	if (x86_pmu.put_event_constraints)
		x86_pmu.put_event_constraints(fake_cpuc, event);

	free_fake_cpuc(fake_cpuc);

	return ret;
}

/*
 * validate a single event group
 *
 * validation includes:
 *	- check events are compatible with each other
 *	- events do not compete for the same counter
 *	- number of events <= number of counters
 *
 * validation ensures the group can be loaded onto the
 * PMU if it was the only group available.
 */
static int validate_group(struct perf_event *event)
{
	struct perf_event *leader = event->group_leader;
	struct cpu_hw_events *fake_cpuc;
	int ret = -EINVAL, n;

	/*
	 * Reject events from different hybrid PMUs.
	 */
	if (is_hybrid()) {
		struct perf_event *sibling;
		struct pmu *pmu = NULL;

		if (is_x86_event(leader))
			pmu = leader->pmu;

		/* The first x86 event seen fixes the PMU; all others must match. */
		for_each_sibling_event(sibling, leader) {
			if (!is_x86_event(sibling))
				continue;
			if (!pmu)
				pmu = sibling->pmu;
			else if (pmu != sibling->pmu)
				return ret;
		}
	}

	fake_cpuc = allocate_fake_cpuc(event->pmu);
	if (IS_ERR(fake_cpuc))
		return PTR_ERR(fake_cpuc);
	/*
	 * the event is not yet connected with its
	 * siblings therefore we must first collect
	 * existing siblings, then add the new event
	 * before we can simulate the scheduling
	 */
	n = collect_events(fake_cpuc, leader, true);
	if (n < 0)
		goto out;

	fake_cpuc->n_events = n;
	n = collect_events(fake_cpuc, event, false);
	if (n < 0)
		goto out;

	fake_cpuc->n_events = 0;
	ret = x86_pmu.schedule_events(fake_cpuc, n, NULL);

out:
	free_fake_cpuc(fake_cpuc);
	return ret;
}

/*
 * pmu::event_init callback (see the 'pmu' definition below).
 *
 * Returns -ENOENT for event types this PMU does not handle and for hybrid
 * events bound to a CPU the PMU does not support; otherwise the result of
 * __x86_pmu_event_init() followed by group/event validation.
 */
static int x86_pmu_event_init(struct perf_event *event)
{
	struct x86_hybrid_pmu *pmu = NULL;
	int err;

	if ((event->attr.type != event->pmu->type) &&
	    (event->attr.type != PERF_TYPE_HARDWARE) &&
	    (event->attr.type != PERF_TYPE_HW_CACHE))
		return -ENOENT;

	if (is_hybrid() && (event->cpu != -1)) {
		pmu = hybrid_pmu(event->pmu);
		if (!cpumask_test_cpu(event->cpu, &pmu->supported_cpus))
			return -ENOENT;
	}

	err = __x86_pmu_event_init(event);
	if (!err) {
		/* A group member is validated with its group, a leader alone. */
		if (event->group_leader != event)
			err = validate_group(event);
		else
			err = validate_event(event);
	}
	if (err) {
		/* Run the destroy hook now and clear it so it cannot run again. */
		if (event->destroy)
			event->destroy(event);
		event->destroy = NULL;
	}

	if (READ_ONCE(x86_pmu.attr_rdpmc) &&
	    !(event->hw.flags & PERF_X86_EVENT_LARGE_PEBS))
		event->hw.flags |= PERF_EVENT_FLAG_USER_READ_CNT;

	return err;
}

/*
 * Write zero to every counter flagged in this CPU's 'dirty' bitmap that is
 * not currently assigned to an event, then clear the bitmap.
 */
void perf_clear_dirty_counters(void)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	int i;

	 /* Don't need to clear the assigned counter. */
	for (i = 0; i < cpuc->n_events; i++)
		__clear_bit(cpuc->assign[i], cpuc->dirty);

	if (bitmap_empty(cpuc->dirty, X86_PMC_IDX_MAX))
		return;

	for_each_set_bit(i, cpuc->dirty, X86_PMC_IDX_MAX) {
		if (i >= INTEL_PMC_IDX_FIXED) {
			/* Metrics and fake events don't have corresponding HW counters. */
			if (!test_bit(i - INTEL_PMC_IDX_FIXED, hybrid(cpuc->pmu, fixed_cntr_mask)))
				continue;

			wrmsrl(x86_pmu_fixed_ctr_addr(i - INTEL_PMC_IDX_FIXED), 0);
		} else {
			wrmsrl(x86_pmu_event_addr(i), 0);
		}
	}

	bitmap_zero(cpuc->dirty, X86_PMC_IDX_MAX);
}

/*
 * pmu::event_mapped callback: count one more user-readable event mapped
 * into @mm; the first one triggers cr4_update_pce() on the mm's CPUs.
 */
static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm)
{
	if (!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT))
		return;

	/*
	 * This function relies on not being called concurrently in two
	 * tasks in the same mm.  Otherwise one task could observe
	 * perf_rdpmc_allowed > 1 and return all the way back to
	 * userspace with CR4.PCE clear while another task is still
	 * doing on_each_cpu_mask() to propagate CR4.PCE.
	 *
	 * For now, this can't happen because all callers hold mmap_lock
	 * for write.  If this changes, we'll need a different solution.
	 */
	mmap_assert_write_locked(mm);

	if (atomic_inc_return(&mm->context.perf_rdpmc_allowed) == 1)
		on_each_cpu_mask(mm_cpumask(mm), cr4_update_pce, NULL, 1);
}

/*
 * pmu::event_unmapped callback: drop the count taken by
 * x86_pmu_event_mapped(); when it reaches zero cr4_update_pce() is run
 * again on the mm's CPUs.
 */
static void x86_pmu_event_unmapped(struct perf_event *event, struct mm_struct *mm)
{
	if (!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT))
		return;

	if (atomic_dec_and_test(&mm->context.perf_rdpmc_allowed))
		on_each_cpu_mask(mm_cpumask(mm), cr4_update_pce, NULL, 1);
}

/*
 * pmu::event_idx callback: returns 0 when the event is not user-readable,
 * otherwise the RDPMC index of the event plus one.
 */
static int x86_pmu_event_idx(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;

	if (!(hwc->flags & PERF_EVENT_FLAG_USER_READ_CNT))
		return 0;

	if (is_metric_idx(hwc->idx))
		return INTEL_PMC_FIXED_RDPMC_METRICS + 1;
	else
		return hwc->event_base_rdpmc + 1;
}

/* sysfs 'rdpmc' show handler: prints the current x86_pmu.attr_rdpmc value. */
static ssize_t get_attr_rdpmc(struct device *cdev,
			      struct device_attribute *attr,
			      char *buf)
{
	return snprintf(buf, 40, "%d\n", x86_pmu.attr_rdpmc);
}

/*
 * sysfs 'rdpmc' store handler: accepts 0, 1 or 2.
 *
 * Returns @count on success, the kstrtoul() error for unparsable input,
 * -EINVAL for values above 2 and -ENOTSUPP when attr_rdpmc_broken is set.
 */
static ssize_t set_attr_rdpmc(struct device *cdev,
			      struct device_attribute *attr,
			      const char *buf, size_t count)
{
	static DEFINE_MUTEX(rdpmc_mutex);
	unsigned long val;
	ssize_t ret;

	ret = kstrtoul(buf, 0, &val);
	if (ret)
		return ret;

	if (val > 2)
		return -EINVAL;

	if (x86_pmu.attr_rdpmc_broken)
		return -ENOTSUPP;

	/* Held until the function returns. */
	guard(mutex)(&rdpmc_mutex);

	if (val != x86_pmu.attr_rdpmc) {
		/*
		 * Changing into or out of never available or always available,
		 * aka perf-event-bypassing mode. This path is extremely slow,
		 * but only root can trigger it, so it's okay.
		 */
		if (val == 0)
			static_branch_inc(&rdpmc_never_available_key);
		else if (x86_pmu.attr_rdpmc == 0)
			static_branch_dec(&rdpmc_never_available_key);

		if (val == 2)
			static_branch_inc(&rdpmc_always_available_key);
		else if (x86_pmu.attr_rdpmc == 2)
			static_branch_dec(&rdpmc_always_available_key);

		on_each_cpu(cr4_update_pce, NULL, 1);
		x86_pmu.attr_rdpmc = val;
	}

	return count;
}

static DEVICE_ATTR(rdpmc, S_IRUSR | S_IWUSR, get_attr_rdpmc, set_attr_rdpmc);

static struct attribute *x86_pmu_attrs[] = {
	&dev_attr_rdpmc.attr,
	NULL,
};

static struct attribute_group x86_pmu_attr_group __ro_after_init = {
	.attrs = x86_pmu_attrs,
};

/* sysfs 'max_precise' show handler: prints x86_pmu_max_precise(). */
static ssize_t max_precise_show(struct device *cdev,
				  struct device_attribute *attr,
				  char *buf)
{
	return snprintf(buf, PAGE_SIZE, "%d\n", x86_pmu_max_precise());
}

static DEVICE_ATTR_RO(max_precise);

static struct attribute *x86_pmu_caps_attrs[] = {
	&dev_attr_max_precise.attr,
	NULL
};

static struct attribute_group x86_pmu_caps_group __ro_after_init = {
	.name = "caps",
	.attrs = x86_pmu_caps_attrs,
};

/* NULL-terminated list of the attribute groups installed via pmu.attr_groups. */
static const struct attribute_group *x86_pmu_attr_groups[] = {
	&x86_pmu_attr_group,
	&x86_pmu_format_group,
	&x86_pmu_events_group,
	&x86_pmu_caps_group,
	NULL,
};

/* pmu::sched_task callback: forwards to the x86_pmu_sched_task static call. */
static void x86_pmu_sched_task(struct perf_event_pmu_context *pmu_ctx,
			       struct task_struct *task, bool sched_in)
{
	static_call_cond(x86_pmu_sched_task)(pmu_ctx, task, sched_in);
}

/* Invoke the x86_pmu.check_microcode hook when one is installed. */
void perf_check_microcode(void)
{
	if (x86_pmu.check_microcode)
		x86_pmu.check_microcode();
}

/*
 * pmu::check_period callback: returns -EINVAL when the check_period hook
 * rejects @value or when the limit_period hook raises the period above
 * @value, 0 otherwise.
 */
static int x86_pmu_check_period(struct perf_event *event, u64 value)
{
	if (x86_pmu.check_period && x86_pmu.check_period(event, value))
		return -EINVAL;

	if (value && x86_pmu.limit_period) {
		s64 left = value;
		x86_pmu.limit_period(event, &left);
		if (left > value)
			return -EINVAL;
	}

	return 0;
}

/*
 * pmu::aux_output_match callback: returns 0 unless the PMU advertises
 * PERF_PMU_CAP_AUX_OUTPUT and an aux_output_match hook is installed, in
 * which case the hook's result is returned.
 */
static int x86_pmu_aux_output_match(struct perf_event *event)
{
	if (!(pmu.capabilities & PERF_PMU_CAP_AUX_OUTPUT))
		return 0;

	if (x86_pmu.aux_output_match)
		return x86_pmu.aux_output_match(event);

	return 0;
}

/*
 * pmu::filter callback: the static call reports its verdict through 'ret',
 * which stays false when the call does not set it.
 */
static bool x86_pmu_filter(struct pmu *pmu, int cpu)
{
	bool ret = false;

	static_call_cond(x86_pmu_filter)(pmu, cpu, &ret);

	return ret;
}

/* Callback table wiring the functions of this file into struct pmu. */
static struct pmu pmu = {
	.pmu_enable		= x86_pmu_enable,
	.pmu_disable		= x86_pmu_disable,

	.attr_groups		= x86_pmu_attr_groups,

	.event_init		= x86_pmu_event_init,

	.event_mapped		= x86_pmu_event_mapped,
	.event_unmapped		= x86_pmu_event_unmapped,

	.add			= x86_pmu_add,
	.del			= x86_pmu_del,
	.start			= x86_pmu_start,
	.stop			= x86_pmu_stop,
	.read			= x86_pmu_read,

	.start_txn		= x86_pmu_start_txn,
	.cancel_txn		= x86_pmu_cancel_txn,
	.commit_txn		= x86_pmu_commit_txn,

	.event_idx		= x86_pmu_event_idx,
	.sched_task		= x86_pmu_sched_task,
	.check_period		= x86_pmu_check_period,

	.aux_output_match	= x86_pmu_aux_output_match,

	.filter			= x86_pmu_filter,
};

/*
 * Fill the x86 specific fields of the perf mmap page @userpg for @event.
 * The time conversion fields are only published when the native sched
 * clock is in use and stable; otherwise the cap_user_time* bits stay 0.
 */
void arch_perf_update_userpage(struct perf_event *event,
			       struct perf_event_mmap_page *userpg, u64 now)
{
	struct cyc2ns_data data;
	u64 offset;

	userpg->cap_user_time = 0;
	userpg->cap_user_time_zero = 0;
	userpg->cap_user_rdpmc =
		!!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT);
	userpg->pmc_width = x86_pmu.cntval_bits;

	if (!using_native_sched_clock() || !sched_clock_stable())
		return;

	cyc2ns_read_begin(&data);

	offset = data.cyc2ns_offset + __sched_clock_offset;

	/*
	 * Internal timekeeping for enabled/running/stopped times
	 * is always in the local_clock domain.
	 */
	userpg->cap_user_time = 1;
	userpg->time_mult = data.cyc2ns_mul;
	userpg->time_shift = data.cyc2ns_shift;
	userpg->time_offset = offset - now;

	/*
	 * cap_user_time_zero doesn't make sense when we're using a different
	 * time base for the records.
	 */
	if (!event->attr.use_clockid) {
		userpg->cap_user_time_zero = 1;
		userpg->time_zero = offset;
	}

	cyc2ns_read_end();
}

/*
 * Determine whether the regs were taken from an irq/exception handler rather
 * than from perf_arch_fetch_caller_regs().
*/ static bool perf_hw_regs(struct pt_regs *regs) { return regs->flags & X86_EFLAGS_FIXED; } void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { struct unwind_state state; unsigned long addr; if (perf_guest_state()) { /* TODO: We don't support guest os callchain now */ return; } if (perf_callchain_store(entry, regs->ip)) return; if (perf_hw_regs(regs)) unwind_start(&state, current, regs, NULL); else unwind_start(&state, current, NULL, (void *)regs->sp); for (; !unwind_done(&state); unwind_next_frame(&state)) { addr = unwind_get_return_address(&state); if (!addr || perf_callchain_store(entry, addr)) return; } } static inline int valid_user_frame(const void __user *fp, unsigned long size) { return __access_ok(fp, size); } static unsigned long get_segment_base(unsigned int segment) { struct desc_struct *desc; unsigned int idx = segment >> 3; if ((segment & SEGMENT_TI_MASK) == SEGMENT_LDT) { #ifdef CONFIG_MODIFY_LDT_SYSCALL struct ldt_struct *ldt; /* IRQs are off, so this synchronizes with smp_store_release */ ldt = READ_ONCE(current->active_mm->context.ldt); if (!ldt || idx >= ldt->nr_entries) return 0; desc = &ldt->entries[idx]; #else return 0; #endif } else { if (idx >= GDT_ENTRIES) return 0; desc = raw_cpu_ptr(gdt_page.gdt) + idx; } return get_desc_base(desc); } #ifdef CONFIG_UPROBES /* * Heuristic-based check if uprobe is installed at the function entry. * * Under assumption of user code being compiled with frame pointers, * `push %rbp/%ebp` is a good indicator that we indeed are. * * Similarly, `endbr64` (assuming 64-bit mode) is also a common pattern. * If we get this wrong, captured stack trace might have one extra bogus * entry, but the rest of stack trace will still be meaningful. 
*/ static bool is_uprobe_at_func_entry(struct pt_regs *regs) { struct arch_uprobe *auprobe; if (!current->utask) return false; auprobe = current->utask->auprobe; if (!auprobe) return false; /* push %rbp/%ebp */ if (auprobe->insn[0] == 0x55) return true; /* endbr64 (64-bit only) */ if (user_64bit_mode(regs) && is_endbr((u32 *)auprobe->insn)) return true; return false; } #else static bool is_uprobe_at_func_entry(struct pt_regs *regs) { return false; } #endif /* CONFIG_UPROBES */ #ifdef CONFIG_IA32_EMULATION #include <linux/compat.h> static inline int perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *entry) { /* 32-bit process in 64-bit kernel. */ unsigned long ss_base, cs_base; struct stack_frame_ia32 frame; const struct stack_frame_ia32 __user *fp; u32 ret_addr; if (user_64bit_mode(regs)) return 0; cs_base = get_segment_base(regs->cs); ss_base = get_segment_base(regs->ss); fp = compat_ptr(ss_base + regs->bp); pagefault_disable(); /* see perf_callchain_user() below for why we do this */ if (is_uprobe_at_func_entry(regs) && !get_user(ret_addr, (const u32 __user *)regs->sp)) perf_callchain_store(entry, ret_addr); while (entry->nr < entry->max_stack) { if (!valid_user_frame(fp, sizeof(frame))) break; if (__get_user(frame.next_frame, &fp->next_frame)) break; if (__get_user(frame.return_address, &fp->return_address)) break; perf_callchain_store(entry, cs_base + frame.return_address); fp = compat_ptr(ss_base + frame.next_frame); } pagefault_enable(); return 1; } #else static inline int perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *entry) { return 0; } #endif void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { struct stack_frame frame; const struct stack_frame __user *fp; unsigned long ret_addr; if (perf_guest_state()) { /* TODO: We don't support guest os callchain now */ return; } /* * We don't know what to do with VM86 stacks.. ignore them for now. 
*/ if (regs->flags & (X86_VM_MASK | PERF_EFLAGS_VM)) return; fp = (void __user *)regs->bp; perf_callchain_store(entry, regs->ip); if (!nmi_uaccess_okay()) return; if (perf_callchain_user32(regs, entry)) return; pagefault_disable(); /* * If we are called from uprobe handler, and we are indeed at the very * entry to user function (which is normally a `push %rbp` instruction, * under assumption of application being compiled with frame pointers), * we should read return address from *regs->sp before proceeding * to follow frame pointers, otherwise we'll skip immediate caller * as %rbp is not yet setup. */ if (is_uprobe_at_func_entry(regs) && !get_user(ret_addr, (const unsigned long __user *)regs->sp)) perf_callchain_store(entry, ret_addr); while (entry->nr < entry->max_stack) { if (!valid_user_frame(fp, sizeof(frame))) break; if (__get_user(frame.next_frame, &fp->next_frame)) break; if (__get_user(frame.return_address, &fp->return_address)) break; perf_callchain_store(entry, frame.return_address); fp = (void __user *)frame.next_frame; } pagefault_enable(); } /* * Deal with code segment offsets for the various execution modes: * * VM86 - the good olde 16 bit days, where the linear address is * 20 bits and we use regs->ip + 0x10 * regs->cs. * * IA32 - Where we need to look at GDT/LDT segment descriptor tables * to figure out what the 32bit base address is. * * X32 - has TIF_X32 set, but is running in x86_64 * * X86_64 - CS,DS,SS,ES are all zero based. */ static unsigned long code_segment_base(struct pt_regs *regs) { /* * For IA32 we look at the GDT/LDT segment base to convert the * effective IP to a linear address. */ #ifdef CONFIG_X86_32 /* * If we are in VM86 mode, add the segment offset to convert to a * linear address. 
*/ if (regs->flags & X86_VM_MASK) return 0x10 * regs->cs; if (user_mode(regs) && regs->cs != __USER_CS) return get_segment_base(regs->cs); #else if (user_mode(regs) && !user_64bit_mode(regs) && regs->cs != __USER32_CS) return get_segment_base(regs->cs); #endif return 0; } unsigned long perf_arch_instruction_pointer(struct pt_regs *regs) { return regs->ip + code_segment_base(regs); } static unsigned long common_misc_flags(struct pt_regs *regs) { if (regs->flags & PERF_EFLAGS_EXACT) return PERF_RECORD_MISC_EXACT_IP; return 0; } static unsigned long guest_misc_flags(struct pt_regs *regs) { unsigned long guest_state = perf_guest_state(); if (!(guest_state & PERF_GUEST_ACTIVE)) return 0; if (guest_state & PERF_GUEST_USER) return PERF_RECORD_MISC_GUEST_USER; else return PERF_RECORD_MISC_GUEST_KERNEL; } static unsigned long host_misc_flags(struct pt_regs *regs) { if (user_mode(regs)) return PERF_RECORD_MISC_USER; else return PERF_RECORD_MISC_KERNEL; } unsigned long perf_arch_guest_misc_flags(struct pt_regs *regs) { unsigned long flags = common_misc_flags(regs); flags |= guest_misc_flags(regs); return flags; } unsigned long perf_arch_misc_flags(struct pt_regs *regs) { unsigned long flags = common_misc_flags(regs); flags |= host_misc_flags(regs); return flags; } void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap) { /* This API doesn't currently support enumerating hybrid PMUs. */ if (WARN_ON_ONCE(cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) || !x86_pmu_initialized()) { memset(cap, 0, sizeof(*cap)); return; } /* * Note, hybrid CPU models get tracked as having hybrid PMUs even when * all E-cores are disabled via BIOS. When E-cores are disabled, the * base PMU holds the correct number of counters for P-cores. 
*/ cap->version = x86_pmu.version; cap->num_counters_gp = x86_pmu_num_counters(NULL); cap->num_counters_fixed = x86_pmu_num_counters_fixed(NULL); cap->bit_width_gp = x86_pmu.cntval_bits; cap->bit_width_fixed = x86_pmu.cntval_bits; cap->events_mask = (unsigned int)x86_pmu.events_maskl; cap->events_mask_len = x86_pmu.events_mask_len; cap->pebs_ept = x86_pmu.pebs_ept; } EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability); u64 perf_get_hw_event_config(int hw_event) { int max = x86_pmu.max_events; if (hw_event < max) return x86_pmu.event_map(array_index_nospec(hw_event, max)); return 0; } EXPORT_SYMBOL_GPL(perf_get_hw_event_config); |
4103 4104 4101 4101 4104 4101 4091 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 | // SPDX-License-Identifier: GPL-2.0 /* * SHA1 routine optimized to do word accesses rather than byte accesses, * and to avoid unnecessary copies into the context array. * * This was based on the git SHA1 implementation. */ #include <linux/kernel.h> #include <linux/export.h> #include <linux/module.h> #include <linux/bitops.h> #include <linux/string.h> #include <crypto/sha1.h> #include <linux/unaligned.h> /* * If you have 32 registers or more, the compiler can (and should) * try to change the array[] accesses into registers. However, on * machines with less than ~25 registers, that won't really work, * and at least gcc will make an unholy mess of it. * * So to avoid that mess which just slows things down, we force * the stores to memory to actually happen (we might be better off * with a 'W(t)=(val);asm("":"+m" (W(t))' there instead, as * suggested by Artur Skawina - that will also make gcc unable to * try to do the silly "optimize away loads" part because it won't * see what the value will be). * * Ben Herrenschmidt reports that on PPC, the C version comes close * to the optimized asm with this (ie on PPC you don't want that * 'volatile', since there are lots of registers). * * On ARM we get the best code generation by forcing a full memory barrier * between each SHA_ROUND, otherwise gcc happily get wild with spilling and * the stack frame size simply explode and performance goes down the drain. 
*/ #ifdef CONFIG_X86 #define setW(x, val) (*(volatile __u32 *)&W(x) = (val)) #elif defined(CONFIG_ARM) #define setW(x, val) do { W(x) = (val); __asm__("":::"memory"); } while (0) #else #define setW(x, val) (W(x) = (val)) #endif /* This "rolls" over the 512-bit array */ #define W(x) (array[(x)&15]) /* * Where do we get the source from? The first 16 iterations get it from * the input data, the next mix it from the 512-bit array. */ #define SHA_SRC(t) get_unaligned_be32((__u32 *)data + t) #define SHA_MIX(t) rol32(W(t+13) ^ W(t+8) ^ W(t+2) ^ W(t), 1) #define SHA_ROUND(t, input, fn, constant, A, B, C, D, E) do { \ __u32 TEMP = input(t); setW(t, TEMP); \ E += TEMP + rol32(A,5) + (fn) + (constant); \ B = ror32(B, 2); \ TEMP = E; E = D; D = C; C = B; B = A; A = TEMP; } while (0) #define T_0_15(t, A, B, C, D, E) SHA_ROUND(t, SHA_SRC, (((C^D)&B)^D) , 0x5a827999, A, B, C, D, E ) #define T_16_19(t, A, B, C, D, E) SHA_ROUND(t, SHA_MIX, (((C^D)&B)^D) , 0x5a827999, A, B, C, D, E ) #define T_20_39(t, A, B, C, D, E) SHA_ROUND(t, SHA_MIX, (B^C^D) , 0x6ed9eba1, A, B, C, D, E ) #define T_40_59(t, A, B, C, D, E) SHA_ROUND(t, SHA_MIX, ((B&C)+(D&(B^C))) , 0x8f1bbcdc, A, B, C, D, E ) #define T_60_79(t, A, B, C, D, E) SHA_ROUND(t, SHA_MIX, (B^C^D) , 0xca62c1d6, A, B, C, D, E ) /** * sha1_transform - single block SHA1 transform (deprecated) * * @digest: 160 bit digest to update * @data: 512 bits of data to hash * @array: 16 words of workspace (see note) * * This function executes SHA-1's internal compression function. It updates the * 160-bit internal state (@digest) with a single 512-bit data block (@data). * * Don't use this function. SHA-1 is no longer considered secure. And even if * you do have to use SHA-1, this isn't the correct way to hash something with * SHA-1 as this doesn't handle padding and finalization. * * Note: If the hash is security sensitive, the caller should be sure * to clear the workspace. 
This is left to the caller to avoid * unnecessary clears between chained hashing operations. */ void sha1_transform(__u32 *digest, const char *data, __u32 *array) { __u32 A, B, C, D, E; unsigned int i = 0; A = digest[0]; B = digest[1]; C = digest[2]; D = digest[3]; E = digest[4]; /* Round 1 - iterations 0-16 take their input from 'data' */ for (; i < 16; ++i) T_0_15(i, A, B, C, D, E); /* Round 1 - tail. Input from 512-bit mixing array */ for (; i < 20; ++i) T_16_19(i, A, B, C, D, E); /* Round 2 */ for (; i < 40; ++i) T_20_39(i, A, B, C, D, E); /* Round 3 */ for (; i < 60; ++i) T_40_59(i, A, B, C, D, E); /* Round 4 */ for (; i < 80; ++i) T_60_79(i, A, B, C, D, E); digest[0] += A; digest[1] += B; digest[2] += C; digest[3] += D; digest[4] += E; } EXPORT_SYMBOL(sha1_transform); /** * sha1_init - initialize the vectors for a SHA1 digest * @buf: vector to initialize */ void sha1_init(__u32 *buf) { buf[0] = 0x67452301; buf[1] = 0xefcdab89; buf[2] = 0x98badcfe; buf[3] = 0x10325476; buf[4] = 0xc3d2e1f0; } EXPORT_SYMBOL(sha1_init); MODULE_DESCRIPTION("SHA-1 Algorithm"); MODULE_LICENSE("GPL"); |
6 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */ #include <linux/types.h> #include <linux/bpf.h> #include <linux/bpf_local_storage.h> #include <uapi/linux/btf.h> #include <linux/btf_ids.h> DEFINE_BPF_STORAGE_CACHE(cgroup_cache); static DEFINE_PER_CPU(int, bpf_cgrp_storage_busy); static void bpf_cgrp_storage_lock(void) { cant_migrate(); this_cpu_inc(bpf_cgrp_storage_busy); } static void bpf_cgrp_storage_unlock(void) { this_cpu_dec(bpf_cgrp_storage_busy); } static bool bpf_cgrp_storage_trylock(void) { cant_migrate(); if (unlikely(this_cpu_inc_return(bpf_cgrp_storage_busy) != 1)) { this_cpu_dec(bpf_cgrp_storage_busy); return false; } return true; } static struct bpf_local_storage __rcu **cgroup_storage_ptr(void *owner) { struct cgroup *cg = owner; return &cg->bpf_cgrp_storage; } void bpf_cgrp_storage_free(struct cgroup *cgroup) { struct bpf_local_storage *local_storage; migrate_disable(); rcu_read_lock(); local_storage = rcu_dereference(cgroup->bpf_cgrp_storage); if (!local_storage) goto out; bpf_cgrp_storage_lock(); bpf_local_storage_destroy(local_storage); 
bpf_cgrp_storage_unlock(); out: rcu_read_unlock(); migrate_enable(); } static struct bpf_local_storage_data * cgroup_storage_lookup(struct cgroup *cgroup, struct bpf_map *map, bool cacheit_lockit) { struct bpf_local_storage *cgroup_storage; struct bpf_local_storage_map *smap; cgroup_storage = rcu_dereference_check(cgroup->bpf_cgrp_storage, bpf_rcu_lock_held()); if (!cgroup_storage) return NULL; smap = (struct bpf_local_storage_map *)map; return bpf_local_storage_lookup(cgroup_storage, smap, cacheit_lockit); } static void *bpf_cgrp_storage_lookup_elem(struct bpf_map *map, void *key) { struct bpf_local_storage_data *sdata; struct cgroup *cgroup; int fd; fd = *(int *)key; cgroup = cgroup_v1v2_get_from_fd(fd); if (IS_ERR(cgroup)) return ERR_CAST(cgroup); bpf_cgrp_storage_lock(); sdata = cgroup_storage_lookup(cgroup, map, true); bpf_cgrp_storage_unlock(); cgroup_put(cgroup); return sdata ? sdata->data : NULL; } static long bpf_cgrp_storage_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_local_storage_data *sdata; struct cgroup *cgroup; int fd; fd = *(int *)key; cgroup = cgroup_v1v2_get_from_fd(fd); if (IS_ERR(cgroup)) return PTR_ERR(cgroup); bpf_cgrp_storage_lock(); sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map, value, map_flags, false, GFP_ATOMIC); bpf_cgrp_storage_unlock(); cgroup_put(cgroup); return PTR_ERR_OR_ZERO(sdata); } static int cgroup_storage_delete(struct cgroup *cgroup, struct bpf_map *map) { struct bpf_local_storage_data *sdata; sdata = cgroup_storage_lookup(cgroup, map, false); if (!sdata) return -ENOENT; bpf_selem_unlink(SELEM(sdata), false); return 0; } static long bpf_cgrp_storage_delete_elem(struct bpf_map *map, void *key) { struct cgroup *cgroup; int err, fd; fd = *(int *)key; cgroup = cgroup_v1v2_get_from_fd(fd); if (IS_ERR(cgroup)) return PTR_ERR(cgroup); bpf_cgrp_storage_lock(); err = cgroup_storage_delete(cgroup, map); bpf_cgrp_storage_unlock(); cgroup_put(cgroup); return err; } 
static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key) { return -ENOTSUPP; } static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) { return bpf_local_storage_map_alloc(attr, &cgroup_cache, true); } static void cgroup_storage_map_free(struct bpf_map *map) { bpf_local_storage_map_free(map, &cgroup_cache, &bpf_cgrp_storage_busy); } /* *gfp_flags* is a hidden argument provided by the verifier */ BPF_CALL_5(bpf_cgrp_storage_get, struct bpf_map *, map, struct cgroup *, cgroup, void *, value, u64, flags, gfp_t, gfp_flags) { struct bpf_local_storage_data *sdata; bool nobusy; WARN_ON_ONCE(!bpf_rcu_lock_held()); if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE)) return (unsigned long)NULL; if (!cgroup) return (unsigned long)NULL; nobusy = bpf_cgrp_storage_trylock(); sdata = cgroup_storage_lookup(cgroup, map, nobusy); if (sdata) goto unlock; /* only allocate new storage, when the cgroup is refcounted */ if (!percpu_ref_is_dying(&cgroup->self.refcnt) && (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) && nobusy) sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map, value, BPF_NOEXIST, false, gfp_flags); unlock: if (nobusy) bpf_cgrp_storage_unlock(); return IS_ERR_OR_NULL(sdata) ? 
(unsigned long)NULL : (unsigned long)sdata->data; } BPF_CALL_2(bpf_cgrp_storage_delete, struct bpf_map *, map, struct cgroup *, cgroup) { int ret; WARN_ON_ONCE(!bpf_rcu_lock_held()); if (!cgroup) return -EINVAL; if (!bpf_cgrp_storage_trylock()) return -EBUSY; ret = cgroup_storage_delete(cgroup, map); bpf_cgrp_storage_unlock(); return ret; } const struct bpf_map_ops cgrp_storage_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = bpf_local_storage_map_alloc_check, .map_alloc = cgroup_storage_map_alloc, .map_free = cgroup_storage_map_free, .map_get_next_key = notsupp_get_next_key, .map_lookup_elem = bpf_cgrp_storage_lookup_elem, .map_update_elem = bpf_cgrp_storage_update_elem, .map_delete_elem = bpf_cgrp_storage_delete_elem, .map_check_btf = bpf_local_storage_map_check_btf, .map_mem_usage = bpf_local_storage_map_mem_usage, .map_btf_id = &bpf_local_storage_map_btf_id[0], .map_owner_storage_ptr = cgroup_storage_ptr, }; const struct bpf_func_proto bpf_cgrp_storage_get_proto = { .func = bpf_cgrp_storage_get, .gpl_only = false, .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL, .arg2_btf_id = &bpf_cgroup_btf_id[0], .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, .arg4_type = ARG_ANYTHING, }; const struct bpf_func_proto bpf_cgrp_storage_delete_proto = { .func = bpf_cgrp_storage_delete, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL, .arg2_btf_id = &bpf_cgroup_btf_id[0], }; |
500 1093 1 1092 1251 20 1249 1250 1097 1095 1097 1094 3 259 1024 1247 352 352 17 335 334 289 46 333 195 114 25 1 335 352 84 44 40 84 20 65 66 42 1 13 11 25 29 1 1 28 3 3 16 29 17 3 131 2 173 8 8 31 451 463 463 10 489 2 10 481 138 48 325 480 1 106 402 2 477 137 371 137 81 304 1 5 10 6 466 466 3 462 462 1 131 361 462 462 461 1 112 112 7 109 112 349 397 37 40 112 349 398 77 262 199 461 310 151 2 418 41 229 77 169 3 99 3 381 113 318 431 67 817 4 1 80 724 28 694 8 33 23 9 32 1469 1469 1469 1467 1469 1468 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 
401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 
901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 
1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/ialloc.c * * Copyright (C) 1992, 1993, 1994, 1995 * Remy Card (card@masi.ibp.fr) * Laboratoire MASI - Institut Blaise Pascal * Universite Pierre et Marie Curie (Paris VI) * * BSD ufs-inspired inode and directory allocation by * Stephen Tweedie (sct@redhat.com), 1993 * Big-endian to little-endian byte-swapping/bitmaps by * David S. 
Miller (davem@caip.rutgers.edu), 1995 */ #include <linux/time.h> #include <linux/fs.h> #include <linux/stat.h> #include <linux/string.h> #include <linux/quotaops.h> #include <linux/buffer_head.h> #include <linux/random.h> #include <linux/bitops.h> #include <linux/blkdev.h> #include <linux/cred.h> #include <asm/byteorder.h> #include "ext4.h" #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" #include <trace/events/ext4.h> /* * ialloc.c contains the inode allocation and deallocation routines */ /* * The free inodes are managed by bitmaps. A file system contains several * block groups. Each group contains 1 bitmap block for blocks, 1 bitmap * block for inodes, N blocks for the inode table and data blocks. * * The file system contains group descriptors which are located after the * super block. Each descriptor contains the number of the bitmap block and * the free blocks count in the group. */ /* * To avoid calling the atomic setbit hundreds or thousands of times, we only * need to use it within a single byte (to ensure we get endianness right). * We can use memset for the rest of the bitmap as there are no other users.
*/
/*
 * Mark bits [start_bit, end_bit) of @bitmap as used.
 *
 * Bits up to the next multiple of 8 are set one at a time with
 * ext4_set_bit(); the remainder is filled with whole 0xff bytes.
 * Does nothing when start_bit >= end_bit.
 */
void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap)
{
	int i;

	if (start_bit >= end_bit)
		return;

	ext4_debug("mark end bits +%d through +%d used\n", start_bit, end_bit);
	/* Bit-by-bit up to the next byte boundary. */
	for (i = start_bit; i < ((start_bit + 7) & ~7UL); i++)
		ext4_set_bit(i, bitmap);
	/* Whole bytes from there on; (end_bit - i) >> 3 bytes are written. */
	if (i < end_bit)
		memset(bitmap + (i >> 3), 0xff, (end_bit - i) >> 3);
}

/*
 * Completion handler for a bitmap read.
 *
 * On success (@uptodate non-zero) flags both the buffer and the bitmap as
 * uptodate. In every case the buffer is unlocked and one reference on @bh
 * is dropped.
 */
void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate)
{
	if (uptodate) {
		set_buffer_uptodate(bh);
		set_bitmap_uptodate(bh);
	}
	unlock_buffer(bh);
	put_bh(bh);
}

/*
 * Verify the checksum of the inode bitmap held in @bh for @block_group.
 *
 * Returns 0 when the buffer is (or already was) verified, or when the
 * mount state has EXT4_FC_REPLAY set. Returns -EFSCORRUPTED when the
 * group info is missing or the bitmap is already flagged corrupt, and
 * -EFSBADCRC when the checksum check fails; in the latter case the error
 * is reported and the group's inode bitmap is marked corrupted.
 */
static int ext4_validate_inode_bitmap(struct super_block *sb,
				      struct ext4_group_desc *desc,
				      ext4_group_t block_group,
				      struct buffer_head *bh)
{
	ext4_fsblk_t	blk;
	struct ext4_group_info *grp;

	if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)
		return 0;

	/* Unlocked fast path; rechecked below under the group lock. */
	if (buffer_verified(bh))
		return 0;

	grp = ext4_get_group_info(sb, block_group);
	if (!grp || EXT4_MB_GRP_IBITMAP_CORRUPT(grp))
		return -EFSCORRUPTED;

	ext4_lock_group(sb, block_group);
	if (buffer_verified(bh))
		goto verified;
	/* blk is only used in the error message below. */
	blk = ext4_inode_bitmap(sb, desc);
	if (!ext4_inode_bitmap_csum_verify(sb, desc, bh) ||
	    ext4_simulate_fail(sb, EXT4_SIM_IBITMAP_CRC)) {
		/* Drop the group lock before reporting the error. */
		ext4_unlock_group(sb, block_group);
		ext4_error(sb, "Corrupt inode bitmap - block_group = %u, "
			   "inode_bitmap = %llu", block_group, blk);
		ext4_mark_group_bitmap_corrupted(sb, block_group,
					EXT4_GROUP_INFO_IBITMAP_CORRUPT);
		return -EFSBADCRC;
	}
	set_buffer_verified(bh);
verified:
	ext4_unlock_group(sb, block_group);
	return 0;
}

/*
 * Read the inode allocation bitmap for a given block_group, reading
 * into the specified slot in the superblock's bitmap cache.
 *
 * Return buffer_head of bitmap on success, or an ERR_PTR on error.
*/
static struct buffer_head *
ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
{
	struct ext4_group_desc *desc;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct buffer_head *bh = NULL;
	ext4_fsblk_t bitmap_blk;
	int err;

	desc = ext4_get_group_desc(sb, block_group, NULL);
	if (!desc)
		return ERR_PTR(-EFSCORRUPTED);

	/*
	 * Sanity-check the bitmap location: it must lie after the first
	 * data block and before the end of the filesystem.
	 */
	bitmap_blk = ext4_inode_bitmap(sb, desc);
	if ((bitmap_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) ||
	    (bitmap_blk >= ext4_blocks_count(sbi->s_es))) {
		ext4_error(sb, "Invalid inode bitmap blk %llu in "
			   "block_group %u", bitmap_blk, block_group);
		ext4_mark_group_bitmap_corrupted(sb, block_group,
					EXT4_GROUP_INFO_IBITMAP_CORRUPT);
		return ERR_PTR(-EFSCORRUPTED);
	}
	bh = sb_getblk(sb, bitmap_blk);
	if (unlikely(!bh)) {
		ext4_warning(sb, "Cannot read inode bitmap - "
			     "block_group = %u, inode_bitmap = %llu",
			     block_group, bitmap_blk);
		return ERR_PTR(-ENOMEM);
	}
	/* Unlocked fast path; rechecked once the buffer lock is held. */
	if (bitmap_uptodate(bh))
		goto verify;

	lock_buffer(bh);
	if (bitmap_uptodate(bh)) {
		unlock_buffer(bh);
		goto verify;
	}

	ext4_lock_group(sb, block_group);
	if (ext4_has_group_desc_csum(sb) &&
	    (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT))) {
		/* Group 0 flagged INODE_UNINIT is treated as corruption. */
		if (block_group == 0) {
			ext4_unlock_group(sb, block_group);
			unlock_buffer(bh);
			ext4_error(sb, "Inode bitmap for bg 0 marked "
				   "uninitialized");
			err = -EFSCORRUPTED;
			goto out;
		}
		/*
		 * Build the bitmap in memory instead of reading it: clear
		 * the bytes covering the group's inodes, then mark every
		 * bit from there to the end of the block as used.
		 */
		memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
		ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb),
				     sb->s_blocksize * 8, bh->b_data);
		set_bitmap_uptodate(bh);
		set_buffer_uptodate(bh);
		set_buffer_verified(bh);
		ext4_unlock_group(sb, block_group);
		unlock_buffer(bh);
		return bh;
	}
	ext4_unlock_group(sb, block_group);

	if (buffer_uptodate(bh)) {
		/*
		 * The uninit case was handled above, so if bh is
		 * uptodate the bitmap is uptodate as well.
		 */
		set_bitmap_uptodate(bh);
		unlock_buffer(bh);
		goto verify;
	}
	/*
	 * submit the buffer_head for reading
	 */
	trace_ext4_load_inode_bitmap(sb, block_group);
	ext4_read_bh(bh, REQ_META | REQ_PRIO,
		     ext4_end_bitmap_read,
		     ext4_simulate_fail(sb, EXT4_SIM_IBITMAP_EIO));
	if (!buffer_uptodate(bh)) {
		put_bh(bh);
		ext4_error_err(sb, EIO, "Cannot read inode bitmap - "
			       "block_group = %u, inode_bitmap = %llu",
			       block_group, bitmap_blk);
		ext4_mark_group_bitmap_corrupted(sb, block_group,
				EXT4_GROUP_INFO_IBITMAP_CORRUPT);
		return ERR_PTR(-EIO);
	}

verify:
	err = ext4_validate_inode_bitmap(sb, desc, block_group, bh);
	if (err)
		goto out;
	return bh;
out:
	/* Error exit: drop our reference and hand back the errno. */
	put_bh(bh);
	return ERR_PTR(err);
}

/*
 * NOTE! When we get the inode, we're the only people
 * that have access to it, and as such there are no
 * race conditions we have to worry about. The inode
 * is not on the hash-lists, and it cannot be reached
 * through the filesystem because the directory entry
 * has been deleted earlier.
 *
 * HOWEVER: we must make sure that we get no aliases,
 * which means that we have to call "clear_inode()"
 * _before_ we mark the inode not in use in the inode
 * bitmaps. Otherwise a newly created file might use
 * the same inode number (not actually the same pointer
 * though), and then we'd have two inodes sharing the
 * same inode number and space on the harddisk.
*/
void ext4_free_inode(handle_t *handle, struct inode *inode)
{
	struct super_block *sb = inode->i_sb;
	int is_directory;
	unsigned long ino;
	struct buffer_head *bitmap_bh = NULL;
	struct buffer_head *bh2;
	ext4_group_t block_group;
	unsigned long bit;
	struct ext4_group_desc *gdp;
	struct ext4_super_block *es;
	struct ext4_sb_info *sbi;
	int fatal = 0, err, count, cleared;
	struct ext4_group_info *grp;

	/* Refuse to free an inode that is still referenced or linked. */
	if (!sb) {
		printk(KERN_ERR "EXT4-fs: %s:%d: inode on "
		       "nonexistent device\n", __func__, __LINE__);
		return;
	}
	if (atomic_read(&inode->i_count) > 1) {
		ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: count=%d",
			 __func__, __LINE__, inode->i_ino,
			 atomic_read(&inode->i_count));
		return;
	}
	if (inode->i_nlink) {
		ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: nlink=%d\n",
			 __func__, __LINE__, inode->i_ino, inode->i_nlink);
		return;
	}
	sbi = EXT4_SB(sb);

	ino = inode->i_ino;
	ext4_debug("freeing inode %lu\n", ino);
	trace_ext4_free_inode(inode);

	dquot_initialize(inode);
	dquot_free_inode(inode);

	/* Sample the mode before ext4_clear_inode() is called below. */
	is_directory = S_ISDIR(inode->i_mode);

	/* Do this BEFORE marking the inode not in use or returning an error */
	ext4_clear_inode(inode);

	es = sbi->s_es;
	if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
		ext4_error(sb, "reserved or nonexistent inode %lu", ino);
		goto error_return;
	}
	/* Inode numbers are 1-based; convert to group index and bit. */
	block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
	bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
	bitmap_bh = ext4_read_inode_bitmap(sb, block_group);
	/* Don't bother if the inode bitmap is corrupt. */
	if (IS_ERR(bitmap_bh)) {
		fatal = PTR_ERR(bitmap_bh);
		bitmap_bh = NULL;
		goto error_return;
	}
	if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) {
		grp = ext4_get_group_info(sb, block_group);
		if (!grp || unlikely(EXT4_MB_GRP_IBITMAP_CORRUPT(grp))) {
			fatal = -EFSCORRUPTED;
			goto error_return;
		}
	}

	BUFFER_TRACE(bitmap_bh, "get_write_access");
	fatal = ext4_journal_get_write_access(handle, sb, bitmap_bh,
					      EXT4_JTR_NONE);
	if (fatal)
		goto error_return;

	/* fatal stays -ESRCH if no group descriptor can be obtained. */
	fatal = -ESRCH;
	gdp = ext4_get_group_desc(sb, block_group, &bh2);
	if (gdp) {
		BUFFER_TRACE(bh2, "get_write_access");
		fatal = ext4_journal_get_write_access(handle, sb, bh2,
						      EXT4_JTR_NONE);
	}
	ext4_lock_group(sb, block_group);
	/*
	 * The bit is cleared even when fatal is set; the descriptor
	 * counters below are only touched when both steps succeeded.
	 */
	cleared = ext4_test_and_clear_bit(bit, bitmap_bh->b_data);
	if (fatal || !cleared) {
		ext4_unlock_group(sb, block_group);
		goto out;
	}

	count = ext4_free_inodes_count(sb, gdp) + 1;
	ext4_free_inodes_set(sb, gdp, count);
	if (is_directory) {
		count = ext4_used_dirs_count(sb, gdp) - 1;
		ext4_used_dirs_set(sb, gdp, count);
		if (percpu_counter_initialized(&sbi->s_dirs_counter))
			percpu_counter_dec(&sbi->s_dirs_counter);
	}
	ext4_inode_bitmap_csum_set(sb, gdp, bitmap_bh);
	ext4_group_desc_csum_set(sb, block_group, gdp);
	ext4_unlock_group(sb, block_group);

	if (percpu_counter_initialized(&sbi->s_freeinodes_counter))
		percpu_counter_inc(&sbi->s_freeinodes_counter);
	/* Mirror the change in the per-flex-group counters. */
	if (sbi->s_log_groups_per_flex) {
		struct flex_groups *fg;

		fg = sbi_array_rcu_deref(sbi, s_flex_groups,
					 ext4_flex_group(sbi, block_group));
		atomic_inc(&fg->free_inodes);
		if (is_directory)
			atomic_dec(&fg->used_dirs);
	}
	BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
	fatal = ext4_handle_dirty_metadata(handle, NULL, bh2);
out:
	if (cleared) {
		BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata");
		err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
		/* Keep the first error seen. */
		if (!fatal)
			fatal = err;
	} else {
		/* The bit was already clear: treat the bitmap as corrupt. */
		ext4_error(sb, "bit already cleared for inode %lu", ino);
		ext4_mark_group_bitmap_corrupted(sb, block_group,
					EXT4_GROUP_INFO_IBITMAP_CORRUPT);
	}

error_return:
	brelse(bitmap_bh);
	ext4_std_error(sb, fatal);
}

/* Per-group (or per-flex-group) counters filled in by get_orlov_stats(). */
struct orlov_stats {
	__u64 free_clusters;
	__u32 free_inodes;
	__u32 used_dirs;
};

/*
 * Helper function for Orlov's allocator; returns critical information
 * for a particular block group or flex_bg.  If flex_size is 1, then g
 * is a block group number; otherwise it is a flex_bg number.
 *
 * When the group descriptor cannot be obtained, all three counters are
 * reported as zero.
 */
static void get_orlov_stats(struct super_block *sb, ext4_group_t g,
			    int flex_size, struct orlov_stats *stats)
{
	struct ext4_group_desc *desc;

	if (flex_size > 1) {
		/* Flex-group mode: read the aggregated atomic counters. */
		struct flex_groups *fg = sbi_array_rcu_deref(EXT4_SB(sb),
							     s_flex_groups, g);
		stats->free_inodes = atomic_read(&fg->free_inodes);
		stats->free_clusters = atomic64_read(&fg->free_clusters);
		stats->used_dirs = atomic_read(&fg->used_dirs);
		return;
	}

	desc = ext4_get_group_desc(sb, g, NULL);
	if (desc) {
		stats->free_inodes = ext4_free_inodes_count(sb, desc);
		stats->free_clusters = ext4_free_group_clusters(sb, desc);
		stats->used_dirs = ext4_used_dirs_count(sb, desc);
	} else {
		stats->free_inodes = 0;
		stats->free_clusters = 0;
		stats->used_dirs = 0;
	}
}

/*
 * Orlov's allocator for directories.
 *
 * We always try to spread first-level directories.
 *
 * If there are blockgroups with both free inodes and free clusters counts
 * not worse than average we return one with smallest directory count.
 * Otherwise we simply return a random group.
 *
 * For the rest rules look so:
 *
 * It's OK to put directory into a group unless
 * it has too many directories already (max_dirs) or
 * it has too few free inodes left (min_inodes) or
 * it has too few free clusters left (min_clusters) or
 * Parent's group is preferred, if it doesn't satisfy these
 * conditions we search cyclically through the rest. If none
 * of the groups look good we just look for a group with more
 * free inodes than average (starting at parent's group).
*/
static int find_group_orlov(struct super_block *sb, struct inode *parent,
			    ext4_group_t *group, umode_t mode,
			    const struct qstr *qstr)
{
	ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	ext4_group_t real_ngroups = ext4_get_groups_count(sb);
	int inodes_per_group = EXT4_INODES_PER_GROUP(sb);
	unsigned int freei, avefreei, grp_free;
	ext4_fsblk_t freec, avefreec;
	unsigned int ndirs;
	int max_dirs, min_inodes;
	ext4_grpblk_t min_clusters;
	ext4_group_t i, grp, g, ngroups;
	struct ext4_group_desc *desc;
	struct orlov_stats stats;
	int flex_size = ext4_flex_bg_size(sbi);
	struct dx_hash_info hinfo;

	/*
	 * With flex_bg the search runs over flex groups, so both the group
	 * count and the parent's group index are scaled down.
	 */
	ngroups = real_ngroups;
	if (flex_size > 1) {
		ngroups = (real_ngroups + flex_size - 1) >>
			sbi->s_log_groups_per_flex;
		parent_group >>= sbi->s_log_groups_per_flex;
	}

	/* Averages per (flex) group, from the approximate percpu counters. */
	freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter);
	avefreei = freei / ngroups;
	freec = percpu_counter_read_positive(&sbi->s_freeclusters_counter);
	avefreec = freec;
	do_div(avefreec, ngroups);
	ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter);

	/* Directory directly under the root, or under a TOPDIR parent. */
	if (S_ISDIR(mode) &&
	    ((parent == d_inode(sb->s_root)) ||
	     (ext4_test_inode_flag(parent, EXT4_INODE_TOPDIR)))) {
		int best_ndir = inodes_per_group;
		int ret = -1;

		/* Starting group: hash of the name if given, else random. */
		if (qstr) {
			hinfo.hash_version = DX_HASH_HALF_MD4;
			hinfo.seed = sbi->s_hash_seed;
			ext4fs_dirhash(parent, qstr->name, qstr->len, &hinfo);
			parent_group = hinfo.hash % ngroups;
		} else
			parent_group = get_random_u32_below(ngroups);
		/*
		 * Scan every group and remember the one with the fewest
		 * directories among those at or above both averages.
		 */
		for (i = 0; i < ngroups; i++) {
			g = (parent_group + i) % ngroups;
			get_orlov_stats(sb, g, flex_size, &stats);
			if (!stats.free_inodes)
				continue;
			if (stats.used_dirs >= best_ndir)
				continue;
			if (stats.free_inodes < avefreei)
				continue;
			if (stats.free_clusters < avefreec)
				continue;
			grp = g;
			ret = 0;
			best_ndir = stats.used_dirs;
		}
		if (ret)
			goto fallback;
	found_flex_bg:
		/* Also the jump target of the non-topdir search below. */
		if (flex_size == 1) {
			*group = grp;
			return 0;
		}

		/*
		 * We pack inodes at the beginning of the flexgroup's
		 * inode tables.  Block allocation decisions will do
		 * something similar, although regular files will
		 * start at 2nd block group of the flexgroup.  See
		 * ext4_ext_find_goal() and ext4_find_near().
		 */
		grp *= flex_size;
		for (i = 0; i < flex_size; i++) {
			if (grp+i >= real_ngroups)
				break;
			desc = ext4_get_group_desc(sb, grp+i, NULL);
			if (desc && ext4_free_inodes_count(sb, desc)) {
				*group = grp+i;
				return 0;
			}
		}
		goto fallback;
	}

	/* Thresholds for the non-topdir case; clamped at the low end. */
	max_dirs = ndirs / ngroups + inodes_per_group*flex_size / 16;
	min_inodes = avefreei - inodes_per_group*flex_size / 4;
	if (min_inodes < 1)
		min_inodes = 1;
	min_clusters = avefreec - EXT4_CLUSTERS_PER_GROUP(sb)*flex_size / 4;
	if (min_clusters < 0)
		min_clusters = 0;

	/*
	 * Start looking in the flex group where we last allocated an
	 * inode for this parent directory
	 */
	if (EXT4_I(parent)->i_last_alloc_group != ~0) {
		parent_group = EXT4_I(parent)->i_last_alloc_group;
		if (flex_size > 1)
			parent_group >>= sbi->s_log_groups_per_flex;
	}

	/* Take the first group, cyclically, that meets all thresholds. */
	for (i = 0; i < ngroups; i++) {
		grp = (parent_group + i) % ngroups;
		get_orlov_stats(sb, grp, flex_size, &stats);
		if (stats.used_dirs >= max_dirs)
			continue;
		if (stats.free_inodes < min_inodes)
			continue;
		if (stats.free_clusters < min_clusters)
			continue;
		goto found_flex_bg;
	}

fallback:
	/* From here on work with real block groups, not flex groups. */
	ngroups = real_ngroups;
	avefreei = freei / ngroups;
fallback_retry:
	parent_group = EXT4_I(parent)->i_block_group;
	for (i = 0; i < ngroups; i++) {
		grp = (parent_group + i) % ngroups;
		desc = ext4_get_group_desc(sb, grp, NULL);
		if (desc) {
			grp_free = ext4_free_inodes_count(sb, desc);
			if (grp_free && grp_free >= avefreei) {
				*group = grp;
				return 0;
			}
		}
	}

	if (avefreei) {
		/*
		 * The free-inodes counter is approximate, and for really small
		 * filesystems the above test can fail to find any blockgroups
		 */
		avefreei = 0;
		goto fallback_retry;
	}

	return -1;
}

/*
 * Pick a block group for a new inode, starting from the parent's group.
 *
 * On success stores the chosen group in *group and returns 0; returns -1
 * when no group with a free inode was found. With flex_bg enabled the
 * search may be handed over to find_group_orlov().
 */
static int find_group_other(struct super_block *sb, struct inode *parent,
			    ext4_group_t *group, umode_t mode)
{
	ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
	ext4_group_t i, last, ngroups = ext4_get_groups_count(sb);
	struct ext4_group_desc *desc;
	int flex_size = ext4_flex_bg_size(EXT4_SB(sb));

	/*
	 * Try to place the inode in the same flex group as its
	 * parent.  If we can't find space, use the Orlov algorithm to
	 * find another flex group, and store that information in the
	 * parent directory's inode information so that we use that flex
	 * group for future allocations.
	 */
	if (flex_size > 1) {
		int retry = 0;

	try_again:
		/* Round down to the first group of the flex group. */
		parent_group &= ~(flex_size-1);
		last = parent_group + flex_size;
		if (last > ngroups)
			last = ngroups;
		for (i = parent_group; i < last; i++) {
			desc = ext4_get_group_desc(sb, i, NULL);
			if (desc && ext4_free_inodes_count(sb, desc)) {
				*group = i;
				return 0;
			}
		}
		/* One retry, from the group of the last allocation. */
		if (!retry && EXT4_I(parent)->i_last_alloc_group != ~0) {
			retry = 1;
			parent_group = EXT4_I(parent)->i_last_alloc_group;
			goto try_again;
		}
		/*
		 * If this didn't work, use the Orlov search algorithm
		 * to find a new flex group; we pass in the mode to
		 * avoid the topdir algorithms.
		 */
		*group = parent_group + flex_size;
		if (*group > ngroups)
			*group = 0;
		return find_group_orlov(sb, parent, group, mode, NULL);
	}

	/*
	 * Try to place the inode in its parent directory
	 */
	*group = parent_group;
	desc = ext4_get_group_desc(sb, *group, NULL);
	if (desc && ext4_free_inodes_count(sb, desc) &&
	    ext4_free_group_clusters(sb, desc))
		return 0;

	/*
	 * We're going to place this inode in a different blockgroup from its
	 * parent.  We want to cause files in a common directory to all land in
	 * the same blockgroup.  But we want files which are in a different
	 * directory which shares a blockgroup with our parent to land in a
	 * different blockgroup.
	 *
	 * So add our directory's i_ino into the starting point for the hash.
	 */
	*group = (*group + parent->i_ino) % ngroups;

	/*
	 * Use a quadratic hash to find a group with a free inode and some free
	 * blocks.  (The step added to *group doubles on every iteration.)
	 */
	for (i = 1; i < ngroups; i <<= 1) {
		*group += i;
		if (*group >= ngroups)
			*group -= ngroups;
		desc = ext4_get_group_desc(sb, *group, NULL);
		if (desc && ext4_free_inodes_count(sb, desc) &&
		    ext4_free_group_clusters(sb, desc))
			return 0;
	}

	/*
	 * That failed: try linear search for a free inode, even if that group
	 * has no free blocks.
	 */
	*group = parent_group;
	for (i = 0; i < ngroups; i++) {
		if (++*group >= ngroups)
			*group = 0;
		desc = ext4_get_group_desc(sb, *group, NULL);
		if (desc && ext4_free_inodes_count(sb, desc))
			return 0;
	}

	return -1;
}

/*
 * In no journal mode, if an inode has recently been deleted, we want
 * to avoid reusing it until we're reasonably sure the inode table
 * block has been written back to disk.  (Yes, these values are
 * somewhat arbitrary...)
 */
#define RECENTCY_MIN	60
#define RECENTCY_DIRTY	300

/*
 * Return 1 if inode slot @ino of @group carries a deletion time that lies
 * within the recency window before "now", 0 otherwise. The window is
 * RECENTCY_MIN seconds, extended by RECENTCY_DIRTY when the inode table
 * buffer is dirty. Also returns 0 when the group descriptor or a cached,
 * uptodate inode table buffer is not available.
 */
static int recently_deleted(struct super_block *sb, ext4_group_t group, int ino)
{
	struct ext4_group_desc	*gdp;
	struct ext4_inode	*raw_inode;
	struct buffer_head	*bh;
	int inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
	int offset, ret = 0;
	int recentcy = RECENTCY_MIN;
	u32 dtime, now;

	gdp = ext4_get_group_desc(sb, group, NULL);
	if (unlikely(!gdp))
		return 0;

	/* Only look at the block if it is already in the buffer cache. */
	bh = sb_find_get_block(sb, ext4_inode_table(sb, gdp) +
		       (ino / inodes_per_block));
	if (!bh || !buffer_uptodate(bh))
		/*
		 * If the block is not in the buffer cache, then it
		 * must have been written out, or, most unlikely, is
		 * being migrated - false failure should be OK here.
		 */
		goto out;

	offset = (ino % inodes_per_block) * EXT4_INODE_SIZE(sb);
	raw_inode = (struct ext4_inode *) (bh->b_data + offset);

	/* i_dtime is only 32 bits on disk, but we only care about relative
	 * times in the range of a few minutes (i.e. long enough to sync a
	 * recently-deleted inode to disk), so using the low 32 bits of the
	 * clock (a 68 year range) is enough, see time_before32() */
	dtime = le32_to_cpu(raw_inode->i_dtime);
	now = ktime_get_real_seconds();
	if (buffer_dirty(bh))
		recentcy += RECENTCY_DIRTY;

	if (dtime && time_before32(dtime, now) &&
	    time_before32(now, dtime + recentcy))
		ret = 1;
out:
	brelse(bh);
	return ret;
}

/*
 * Search @bitmap for a zero bit at or after *ino and store it in *ino.
 *
 * Returns 1 when a bit was found, 0 when none is available. When there
 * is no journal, recently deleted inodes are skipped; if nothing else
 * turns up, the last skipped one is returned instead.
 */
static int find_inode_bit(struct super_block *sb, ext4_group_t group,
			  struct buffer_head *bitmap, unsigned long *ino)
{
	bool check_recently_deleted = EXT4_SB(sb)->s_journal == NULL;
	/* EXT4_INODES_PER_GROUP(sb) doubles as the "none recorded" value. */
	unsigned long recently_deleted_ino = EXT4_INODES_PER_GROUP(sb);

next:
	*ino = ext4_find_next_zero_bit((unsigned long *)
				       bitmap->b_data,
				       EXT4_INODES_PER_GROUP(sb), *ino);
	if (*ino >= EXT4_INODES_PER_GROUP(sb))
		goto not_found;

	if (check_recently_deleted && recently_deleted(sb, group, *ino)) {
		/* Remember this candidate and keep looking further on. */
		recently_deleted_ino = *ino;
		*ino = *ino + 1;
		if (*ino < EXT4_INODES_PER_GROUP(sb))
			goto next;
		goto not_found;
	}
	return 1;
not_found:
	if (recently_deleted_ino >= EXT4_INODES_PER_GROUP(sb))
		return 0;
	/*
	 * Not reusing recently deleted inodes is mostly a preference. We don't
	 * want to report ENOSPC or skew allocation patterns because of that.
	 * So return even a recently deleted inode if we couldn't find a better
	 * one in the given range.
	 */
	*ino = recently_deleted_ino;
	return 1;
}

/*
 * Mark inode number @ino as in use in the on-disk inode bitmap and update
 * the group descriptor accordingly. All buffers are dirtied with a NULL
 * handle and written with sync_dirty_buffer().
 *
 * Returns 0 on success or if the bit was already set, -EFSCORRUPTED for an
 * out-of-range inode number, -EINVAL when the group descriptor cannot be
 * obtained, or the error from reading/dirtying a buffer.
 */
int ext4_mark_inode_used(struct super_block *sb, int ino)
{
	unsigned long max_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count);
	struct buffer_head *inode_bitmap_bh = NULL, *group_desc_bh = NULL;
	struct ext4_group_desc *gdp;
	ext4_group_t group;
	int bit;
	int err;

	if (ino < EXT4_FIRST_INO(sb) || ino > max_ino)
		return -EFSCORRUPTED;

	/* Inode numbers are 1-based; convert to group index and bit. */
	group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
	bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
	inode_bitmap_bh = ext4_read_inode_bitmap(sb, group);
	if (IS_ERR(inode_bitmap_bh))
		return PTR_ERR(inode_bitmap_bh);

	/* Already marked: nothing to do. */
	if (ext4_test_bit(bit, inode_bitmap_bh->b_data)) {
		err = 0;
		goto out;
	}

	gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
	if (!gdp) {
		err = -EINVAL;
		goto out;
	}

	ext4_set_bit(bit, inode_bitmap_bh->b_data);

	BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata");
	err = ext4_handle_dirty_metadata(NULL, NULL, inode_bitmap_bh);
	if (err) {
		ext4_std_error(sb, err);
		goto out;
	}
	err = sync_dirty_buffer(inode_bitmap_bh);
	if (err) {
		ext4_std_error(sb, err);
		goto out;
	}

	/* We may have to initialize the block bitmap if it isn't already */
	if (ext4_has_group_desc_csum(sb) &&
	    gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
		struct buffer_head *block_bitmap_bh;

		block_bitmap_bh = ext4_read_block_bitmap(sb, group);
		if (IS_ERR(block_bitmap_bh)) {
			err = PTR_ERR(block_bitmap_bh);
			goto out;
		}

		BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap");
		err = ext4_handle_dirty_metadata(NULL, NULL, block_bitmap_bh);
		/*
		 * NOTE(review): the return value of sync_dirty_buffer() is
		 * not checked here, unlike for the inode bitmap above --
		 * confirm whether that is intentional.
		 */
		sync_dirty_buffer(block_bitmap_bh);

		/* recheck and clear flag under lock if we still need to */
		ext4_lock_group(sb, group);
		if (ext4_has_group_desc_csum(sb) &&
		    (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
			gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
			ext4_free_group_clusters_set(sb, gdp,
				ext4_free_clusters_after_init(sb, group, gdp));
			ext4_block_bitmap_csum_set(sb, gdp, block_bitmap_bh);
			ext4_group_desc_csum_set(sb, group, gdp);
		}
		ext4_unlock_group(sb, group);
		brelse(block_bitmap_bh);

		/* err still holds the result of dirtying the block bitmap. */
		if (err) {
			ext4_std_error(sb, err);
			goto out;
		}
	}

	/* Update the relevant bg descriptor fields */
	if (ext4_has_group_desc_csum(sb)) {
		int free;

		ext4_lock_group(sb, group); /* while we modify the bg desc */
		free = EXT4_INODES_PER_GROUP(sb) -
			ext4_itable_unused_count(sb, gdp);
		if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
			gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
			free = 0;
		}

		/*
		 * Check the relative inode number against the last used
		 * relative inode number in this group. If it is greater
		 * we need to update the bg_itable_unused count
		 */
		if (bit >= free)
			ext4_itable_unused_set(sb, gdp,
					(EXT4_INODES_PER_GROUP(sb) - bit - 1));
	} else {
		/* Both branches leave with the group lock held. */
		ext4_lock_group(sb, group);
	}

	ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1);
	if (ext4_has_group_desc_csum(sb)) {
		ext4_inode_bitmap_csum_set(sb, gdp, inode_bitmap_bh);
		ext4_group_desc_csum_set(sb, group, gdp);
	}

	ext4_unlock_group(sb, group);
	err = ext4_handle_dirty_metadata(NULL, NULL, group_desc_bh);
	/* NOTE(review): sync_dirty_buffer() result is ignored here too. */
	sync_dirty_buffer(group_desc_bh);
out:
	brelse(inode_bitmap_bh);
	return err;
}

/*
 * Sum up the xattr credits for creating an inode in @dir: the default ACL
 * of @dir (counted twice for a directory) when POSIX ACLs are configured,
 * one or two security xattrs of 1024 bytes when CONFIG_SECURITY is set,
 * and the encryption context when @encrypt is true.
 *
 * Returns the credit count, or a negative errno if reading the default
 * ACL fails.
 */
static int ext4_xattr_credits_for_new_inode(struct inode *dir, mode_t mode,
					    bool encrypt)
{
	struct super_block *sb = dir->i_sb;
	int nblocks = 0;
#ifdef CONFIG_EXT4_FS_POSIX_ACL
	struct posix_acl *p = get_inode_acl(dir, ACL_TYPE_DEFAULT);

	if (IS_ERR(p))
		return PTR_ERR(p);
	if (p) {
		int acl_size = p->a_count * sizeof(ext4_acl_entry);

		nblocks += (S_ISDIR(mode) ? 2 : 1) *
			__ext4_xattr_set_credits(sb, NULL /* inode */,
						 NULL /* block_bh */, acl_size,
						 true /* is_create */);
		posix_acl_release(p);
	}
#endif

#ifdef CONFIG_SECURITY
	{
		int num_security_xattrs = 1;

#ifdef CONFIG_INTEGRITY
		num_security_xattrs++;
#endif
		/*
		 * We assume that security xattrs are never more than 1k.
		 * In practice they are under 128 bytes.
		 */
		nblocks += num_security_xattrs *
			__ext4_xattr_set_credits(sb, NULL /* inode */,
						 NULL /* block_bh */, 1024,
						 true /* is_create */);
	}
#endif
	if (encrypt)
		nblocks += __ext4_xattr_set_credits(sb,
						    NULL /* inode */,
						    NULL /* block_bh */,
						    FSCRYPT_SET_CONTEXT_MAX_SIZE,
						    true /* is_create */);
	return nblocks;
}

/*
 * There are two policies for allocating an inode.  If the new inode is
 * a directory, then a forward search is made for a block group with both
 * free space and a low directory-to-inode ratio; if that fails, then of
 * the groups with above-average free space, that group with the fewest
 * directories already is chosen.
 *
 * For other inodes, search forward from the parent directory's block
 * group to find a free inode.
 */
struct inode *__ext4_new_inode(struct mnt_idmap *idmap,
			       handle_t *handle, struct inode *dir,
			       umode_t mode, const struct qstr *qstr,
			       __u32 goal, uid_t *owner, __u32 i_flags,
			       int handle_type, unsigned int line_no,
			       int nblocks)
{
	struct super_block *sb;
	struct buffer_head *inode_bitmap_bh = NULL;
	struct buffer_head *group_desc_bh;
	ext4_group_t ngroups, group = 0;
	unsigned long ino = 0;
	struct inode *inode;
	struct ext4_group_desc *gdp = NULL;
	struct ext4_inode_info *ei;
	struct ext4_sb_info *sbi;
	int ret2, err;
	struct inode *ret;
	ext4_group_t i;
	ext4_group_t flex_group;
	struct ext4_group_info *grp = NULL;
	bool encrypt = false;

	/* Cannot create files in a deleted directory */
	if (!dir || !dir->i_nlink)
		return ERR_PTR(-EPERM);

	sb = dir->i_sb;
	sbi = EXT4_SB(sb);

	ret2 = ext4_emergency_state(sb);
	if (unlikely(ret2))
		return ERR_PTR(ret2);

	ngroups = ext4_get_groups_count(sb);
	trace_ext4_request_inode(dir, mode);
	inode = new_inode(sb);
	if (!inode)
		return ERR_PTR(-ENOMEM);
	ei = EXT4_I(inode);

	/*
	 * Initialize owners and quota early so that we don't have to account
	 * for quota initialization worst case in standard inode creating
	 * transaction
	 */
	if (owner) {
		inode->i_mode = mode;
		i_uid_write(inode, owner[0]);
		i_gid_write(inode, owner[1]);
	} else if (test_opt(sb,
GRPID)) { inode->i_mode = mode; inode_fsuid_set(inode, idmap); inode->i_gid = dir->i_gid; } else inode_init_owner(idmap, inode, dir, mode); if (ext4_has_feature_project(sb) && ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT)) ei->i_projid = EXT4_I(dir)->i_projid; else ei->i_projid = make_kprojid(&init_user_ns, EXT4_DEF_PROJID); if (!(i_flags & EXT4_EA_INODE_FL)) { err = fscrypt_prepare_new_inode(dir, inode, &encrypt); if (err) goto out; } err = dquot_initialize(inode); if (err) goto out; if (!handle && sbi->s_journal && !(i_flags & EXT4_EA_INODE_FL)) { ret2 = ext4_xattr_credits_for_new_inode(dir, mode, encrypt); if (ret2 < 0) { err = ret2; goto out; } nblocks += ret2; } if (!goal) goal = sbi->s_inode_goal; if (goal && goal <= le32_to_cpu(sbi->s_es->s_inodes_count)) { group = (goal - 1) / EXT4_INODES_PER_GROUP(sb); ino = (goal - 1) % EXT4_INODES_PER_GROUP(sb); ret2 = 0; goto got_group; } if (S_ISDIR(mode)) ret2 = find_group_orlov(sb, dir, &group, mode, qstr); else ret2 = find_group_other(sb, dir, &group, mode); got_group: EXT4_I(dir)->i_last_alloc_group = group; err = -ENOSPC; if (ret2 == -1) goto out; /* * Normally we will only go through one pass of this loop, * unless we get unlucky and it turns out the group we selected * had its last inode grabbed by someone else. */ for (i = 0; i < ngroups; i++, ino = 0) { err = -EIO; gdp = ext4_get_group_desc(sb, group, &group_desc_bh); if (!gdp) goto out; /* * Check free inodes count before loading bitmap. 
*/ if (ext4_free_inodes_count(sb, gdp) == 0) goto next_group; if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) { grp = ext4_get_group_info(sb, group); /* * Skip groups with already-known suspicious inode * tables */ if (!grp || EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) goto next_group; } brelse(inode_bitmap_bh); inode_bitmap_bh = ext4_read_inode_bitmap(sb, group); /* Skip groups with suspicious inode tables */ if (IS_ERR(inode_bitmap_bh)) { inode_bitmap_bh = NULL; goto next_group; } if (!(sbi->s_mount_state & EXT4_FC_REPLAY) && EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) goto next_group; ret2 = find_inode_bit(sb, group, inode_bitmap_bh, &ino); if (!ret2) goto next_group; if (group == 0 && (ino + 1) < EXT4_FIRST_INO(sb)) { ext4_error(sb, "reserved inode found cleared - " "inode=%lu", ino + 1); ext4_mark_group_bitmap_corrupted(sb, group, EXT4_GROUP_INFO_IBITMAP_CORRUPT); goto next_group; } if ((!(sbi->s_mount_state & EXT4_FC_REPLAY)) && !handle) { BUG_ON(nblocks <= 0); handle = __ext4_journal_start_sb(NULL, dir->i_sb, line_no, handle_type, nblocks, 0, ext4_trans_default_revoke_credits(sb)); if (IS_ERR(handle)) { err = PTR_ERR(handle); ext4_std_error(sb, err); goto out; } } BUFFER_TRACE(inode_bitmap_bh, "get_write_access"); err = ext4_journal_get_write_access(handle, sb, inode_bitmap_bh, EXT4_JTR_NONE); if (err) { ext4_std_error(sb, err); goto out; } ext4_lock_group(sb, group); ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data); if (ret2) { /* Someone already took the bit. Repeat the search * with lock held. */ ret2 = find_inode_bit(sb, group, inode_bitmap_bh, &ino); if (ret2) { ext4_set_bit(ino, inode_bitmap_bh->b_data); ret2 = 0; } else { ret2 = 1; /* we didn't grab the inode */ } } ext4_unlock_group(sb, group); ino++; /* the inode bitmap is zero-based */ if (!ret2) goto got; /* we grabbed the inode! 
*/ next_group: if (++group == ngroups) group = 0; } err = -ENOSPC; goto out; got: BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh); if (err) { ext4_std_error(sb, err); goto out; } BUFFER_TRACE(group_desc_bh, "get_write_access"); err = ext4_journal_get_write_access(handle, sb, group_desc_bh, EXT4_JTR_NONE); if (err) { ext4_std_error(sb, err); goto out; } /* We may have to initialize the block bitmap if it isn't already */ if (ext4_has_group_desc_csum(sb) && gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { struct buffer_head *block_bitmap_bh; block_bitmap_bh = ext4_read_block_bitmap(sb, group); if (IS_ERR(block_bitmap_bh)) { err = PTR_ERR(block_bitmap_bh); goto out; } BUFFER_TRACE(block_bitmap_bh, "get block bitmap access"); err = ext4_journal_get_write_access(handle, sb, block_bitmap_bh, EXT4_JTR_NONE); if (err) { brelse(block_bitmap_bh); ext4_std_error(sb, err); goto out; } BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap"); err = ext4_handle_dirty_metadata(handle, NULL, block_bitmap_bh); /* recheck and clear flag under lock if we still need to */ ext4_lock_group(sb, group); if (ext4_has_group_desc_csum(sb) && (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) { gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); ext4_free_group_clusters_set(sb, gdp, ext4_free_clusters_after_init(sb, group, gdp)); ext4_block_bitmap_csum_set(sb, gdp, block_bitmap_bh); ext4_group_desc_csum_set(sb, group, gdp); } ext4_unlock_group(sb, group); brelse(block_bitmap_bh); if (err) { ext4_std_error(sb, err); goto out; } } /* Update the relevant bg descriptor fields */ if (ext4_has_group_desc_csum(sb)) { int free; struct ext4_group_info *grp = NULL; if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) { grp = ext4_get_group_info(sb, group); if (!grp) { err = -EFSCORRUPTED; goto out; } down_read(&grp->alloc_sem); /* * protect vs itable * lazyinit */ } ext4_lock_group(sb, group); /* while we modify the bg desc */ 
free = EXT4_INODES_PER_GROUP(sb) - ext4_itable_unused_count(sb, gdp); if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT); free = 0; } /* * Check the relative inode number against the last used * relative inode number in this group. if it is greater * we need to update the bg_itable_unused count */ if (ino > free) ext4_itable_unused_set(sb, gdp, (EXT4_INODES_PER_GROUP(sb) - ino)); if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) up_read(&grp->alloc_sem); } else { ext4_lock_group(sb, group); } ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1); if (S_ISDIR(mode)) { ext4_used_dirs_set(sb, gdp, ext4_used_dirs_count(sb, gdp) + 1); if (sbi->s_log_groups_per_flex) { ext4_group_t f = ext4_flex_group(sbi, group); atomic_inc(&sbi_array_rcu_deref(sbi, s_flex_groups, f)->used_dirs); } } if (ext4_has_group_desc_csum(sb)) { ext4_inode_bitmap_csum_set(sb, gdp, inode_bitmap_bh); ext4_group_desc_csum_set(sb, group, gdp); } ext4_unlock_group(sb, group); BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh); if (err) { ext4_std_error(sb, err); goto out; } percpu_counter_dec(&sbi->s_freeinodes_counter); if (S_ISDIR(mode)) percpu_counter_inc(&sbi->s_dirs_counter); if (sbi->s_log_groups_per_flex) { flex_group = ext4_flex_group(sbi, group); atomic_dec(&sbi_array_rcu_deref(sbi, s_flex_groups, flex_group)->free_inodes); } inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb); /* This is the optimal IO size (for stat), not the fs block size */ inode->i_blocks = 0; simple_inode_init_ts(inode); ei->i_crtime = inode_get_mtime(inode); memset(ei->i_data, 0, sizeof(ei->i_data)); ei->i_dir_start_lookup = 0; ei->i_disksize = 0; /* Don't inherit extent flag from directory, amongst others. 
*/ ei->i_flags = ext4_mask_flags(mode, EXT4_I(dir)->i_flags & EXT4_FL_INHERITED); ei->i_flags |= i_flags; ei->i_file_acl = 0; ei->i_dtime = 0; ei->i_block_group = group; ei->i_last_alloc_group = ~0; ext4_set_inode_flags(inode, true); if (IS_DIRSYNC(inode)) ext4_handle_sync(handle); if (insert_inode_locked(inode) < 0) { /* * Likely a bitmap corruption causing inode to be allocated * twice. */ err = -EIO; ext4_error(sb, "failed to insert inode %lu: doubly allocated?", inode->i_ino); ext4_mark_group_bitmap_corrupted(sb, group, EXT4_GROUP_INFO_IBITMAP_CORRUPT); goto out; } inode->i_generation = get_random_u32(); /* Precompute checksum seed for inode metadata */ if (ext4_has_feature_metadata_csum(sb)) { __u32 csum; __le32 inum = cpu_to_le32(inode->i_ino); __le32 gen = cpu_to_le32(inode->i_generation); csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum)); ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen, sizeof(gen)); } ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ ext4_set_inode_state(inode, EXT4_STATE_NEW); ei->i_extra_isize = sbi->s_want_extra_isize; ei->i_inline_off = 0; if (ext4_has_feature_inline_data(sb) && (!(ei->i_flags & (EXT4_DAX_FL|EXT4_EA_INODE_FL)) || S_ISDIR(mode))) ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); ret = inode; err = dquot_alloc_inode(inode); if (err) goto fail_drop; /* * Since the encryption xattr will always be unique, create it first so * that it's less likely to end up in an external xattr block and * prevent its deduplication. 
*/ if (encrypt) { err = fscrypt_set_context(inode, handle); if (err) goto fail_free_drop; } if (!(ei->i_flags & EXT4_EA_INODE_FL)) { err = ext4_init_acl(handle, inode, dir); if (err) goto fail_free_drop; err = ext4_init_security(handle, inode, dir, qstr); if (err) goto fail_free_drop; } if (ext4_has_feature_extents(sb)) { /* set extent flag only for directory, file and normal symlink*/ if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) { ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS); ext4_ext_tree_init(handle, inode); } } ext4_update_inode_fsync_trans(handle, inode, 1); err = ext4_mark_inode_dirty(handle, inode); if (err) { ext4_std_error(sb, err); goto fail_free_drop; } ext4_debug("allocating inode %lu\n", inode->i_ino); trace_ext4_allocate_inode(inode, dir, mode); brelse(inode_bitmap_bh); return ret; fail_free_drop: dquot_free_inode(inode); fail_drop: clear_nlink(inode); unlock_new_inode(inode); out: dquot_drop(inode); inode->i_flags |= S_NOQUOTA; iput(inode); brelse(inode_bitmap_bh); return ERR_PTR(err); } /* Verify that we are loading a valid orphan from disk */ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino) { unsigned long max_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count); ext4_group_t block_group; int bit; struct buffer_head *bitmap_bh = NULL; struct inode *inode = NULL; int err = -EFSCORRUPTED; if (ino < EXT4_FIRST_INO(sb) || ino > max_ino) goto bad_orphan; block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb); bitmap_bh = ext4_read_inode_bitmap(sb, block_group); if (IS_ERR(bitmap_bh)) return ERR_CAST(bitmap_bh); /* Having the inode bit set should be a 100% indicator that this * is a valid orphan (no e2fsck run on fs). Orphans also include * inodes that were being truncated, so we can't check i_nlink==0. 
*/ if (!ext4_test_bit(bit, bitmap_bh->b_data)) goto bad_orphan; inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL); if (IS_ERR(inode)) { err = PTR_ERR(inode); ext4_error_err(sb, -err, "couldn't read orphan inode %lu (err %d)", ino, err); brelse(bitmap_bh); return inode; } /* * If the orphans has i_nlinks > 0 then it should be able to * be truncated, otherwise it won't be removed from the orphan * list during processing and an infinite loop will result. * Similarly, it must not be a bad inode. */ if ((inode->i_nlink && !ext4_can_truncate(inode)) || is_bad_inode(inode)) goto bad_orphan; if (NEXT_ORPHAN(inode) > max_ino) goto bad_orphan; brelse(bitmap_bh); return inode; bad_orphan: ext4_error(sb, "bad orphan inode %lu", ino); if (bitmap_bh) printk(KERN_ERR "ext4_test_bit(bit=%d, block=%llu) = %d\n", bit, (unsigned long long)bitmap_bh->b_blocknr, ext4_test_bit(bit, bitmap_bh->b_data)); if (inode) { printk(KERN_ERR "is_bad_inode(inode)=%d\n", is_bad_inode(inode)); printk(KERN_ERR "NEXT_ORPHAN(inode)=%u\n", NEXT_ORPHAN(inode)); printk(KERN_ERR "max_ino=%lu\n", max_ino); printk(KERN_ERR "i_nlink=%u\n", inode->i_nlink); /* Avoid freeing blocks if we got a bad deleted inode */ if (inode->i_nlink == 0) inode->i_blocks = 0; iput(inode); } brelse(bitmap_bh); return ERR_PTR(err); } unsigned long ext4_count_free_inodes(struct super_block *sb) { unsigned long desc_count; struct ext4_group_desc *gdp; ext4_group_t i, ngroups = ext4_get_groups_count(sb); #ifdef EXT4FS_DEBUG struct ext4_super_block *es; unsigned long bitmap_count, x; struct buffer_head *bitmap_bh = NULL; es = EXT4_SB(sb)->s_es; desc_count = 0; bitmap_count = 0; gdp = NULL; for (i = 0; i < ngroups; i++) { gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; desc_count += ext4_free_inodes_count(sb, gdp); brelse(bitmap_bh); bitmap_bh = ext4_read_inode_bitmap(sb, i); if (IS_ERR(bitmap_bh)) { bitmap_bh = NULL; continue; } x = ext4_count_free(bitmap_bh->b_data, EXT4_INODES_PER_GROUP(sb) / 8); printk(KERN_DEBUG "group %lu: 
stored = %d, counted = %lu\n", (unsigned long) i, ext4_free_inodes_count(sb, gdp), x); bitmap_count += x; } brelse(bitmap_bh); printk(KERN_DEBUG "ext4_count_free_inodes: " "stored = %u, computed = %lu, %lu\n", le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count); return desc_count; #else desc_count = 0; for (i = 0; i < ngroups; i++) { gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; desc_count += ext4_free_inodes_count(sb, gdp); cond_resched(); } return desc_count; #endif } /* Called at mount-time, super-block is locked */ unsigned long ext4_count_dirs(struct super_block * sb) { unsigned long count = 0; ext4_group_t i, ngroups = ext4_get_groups_count(sb); for (i = 0; i < ngroups; i++) { struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; count += ext4_used_dirs_count(sb, gdp); } return count; } /* * Zeroes not yet zeroed inode table - just write zeroes through the whole * inode table. Must be called without any spinlock held. The only place * where it is called from on active part of filesystem is ext4lazyinit * thread, so we do not need any special locks, however we have to prevent * inode allocation from the current group, so we take alloc_sem lock, to * block ext4_new_inode() until we are finished. */ int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, int barrier) { struct ext4_group_info *grp = ext4_get_group_info(sb, group); struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_group_desc *gdp = NULL; struct buffer_head *group_desc_bh; handle_t *handle; ext4_fsblk_t blk; int num, ret = 0, used_blks = 0; unsigned long used_inos = 0; gdp = ext4_get_group_desc(sb, group, &group_desc_bh); if (!gdp || !grp) goto out; /* * We do not need to lock this, because we are the only one * handling this flag. 
*/ if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)) goto out; handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1); if (IS_ERR(handle)) { ret = PTR_ERR(handle); goto out; } down_write(&grp->alloc_sem); /* * If inode bitmap was already initialized there may be some * used inodes so we need to skip blocks with used inodes in * inode table. */ if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT))) { used_inos = EXT4_INODES_PER_GROUP(sb) - ext4_itable_unused_count(sb, gdp); used_blks = DIV_ROUND_UP(used_inos, sbi->s_inodes_per_block); /* Bogus inode unused count? */ if (used_blks < 0 || used_blks > sbi->s_itb_per_group) { ext4_error(sb, "Something is wrong with group %u: " "used itable blocks: %d; " "itable unused count: %u", group, used_blks, ext4_itable_unused_count(sb, gdp)); ret = 1; goto err_out; } used_inos += group * EXT4_INODES_PER_GROUP(sb); /* * Are there some uninitialized inodes in the inode table * before the first normal inode? */ if ((used_blks != sbi->s_itb_per_group) && (used_inos < EXT4_FIRST_INO(sb))) { ext4_error(sb, "Something is wrong with group %u: " "itable unused count: %u; " "itables initialized count: %ld", group, ext4_itable_unused_count(sb, gdp), used_inos); ret = 1; goto err_out; } } blk = ext4_inode_table(sb, gdp) + used_blks; num = sbi->s_itb_per_group - used_blks; BUFFER_TRACE(group_desc_bh, "get_write_access"); ret = ext4_journal_get_write_access(handle, sb, group_desc_bh, EXT4_JTR_NONE); if (ret) goto err_out; /* * Skip zeroout if the inode table is full. But we set the ZEROED * flag anyway, because obviously, when it is full it does not need * further zeroing. 
*/ if (unlikely(num == 0)) goto skip_zeroout; ext4_debug("going to zero out inode table in group %d\n", group); ret = sb_issue_zeroout(sb, blk, num, GFP_NOFS); if (ret < 0) goto err_out; if (barrier) blkdev_issue_flush(sb->s_bdev); skip_zeroout: ext4_lock_group(sb, group); gdp->bg_flags |= cpu_to_le16(EXT4_BG_INODE_ZEROED); ext4_group_desc_csum_set(sb, group, gdp); ext4_unlock_group(sb, group); BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata"); ret = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh); err_out: up_write(&grp->alloc_sem); ext4_journal_stop(handle); out: return ret; } |
33 2 38 2 2 2 33 2 3 2 27 26 19 19 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 | // SPDX-License-Identifier: GPL-2.0 /* * rtc and date/time utility functions * * Copyright (C) 2005-06 Tower Technologies * Author: Alessandro Zummo <a.zummo@towertech.it> * * based on arch/arm/common/rtctime.c and other bits * * Author: Cassio Neri <cassio.neri@gmail.com> (rtc_time64_to_tm) */ #include <linux/export.h> #include <linux/rtc.h> static const unsigned char rtc_days_in_month[] = { 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 }; static const unsigned short rtc_ydays[2][13] = { /* Normal years */ { 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365 }, /* Leap years */ { 0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366 } }; /* * The number of days in the month. */ int rtc_month_days(unsigned int month, unsigned int year) { return rtc_days_in_month[month] + (is_leap_year(year) && month == 1); } EXPORT_SYMBOL(rtc_month_days); /* * The number of days since January 1. (0 to 365) */ int rtc_year_days(unsigned int day, unsigned int month, unsigned int year) { return rtc_ydays[is_leap_year(year)][month] + day - 1; } EXPORT_SYMBOL(rtc_year_days); /** * rtc_time64_to_tm - converts time64_t to rtc_time. * * @time: The number of seconds since 01-01-1970 00:00:00. * (Must be positive.) * @tm: Pointer to the struct rtc_time. 
*/ void rtc_time64_to_tm(time64_t time, struct rtc_time *tm) { unsigned int secs; int days; u64 u64tmp; u32 u32tmp, udays, century, day_of_century, year_of_century, year, day_of_year, month, day; bool is_Jan_or_Feb, is_leap_year; /* time must be positive */ days = div_s64_rem(time, 86400, &secs); /* day of the week, 1970-01-01 was a Thursday */ tm->tm_wday = (days + 4) % 7; /* * The following algorithm is, basically, Proposition 6.3 of Neri * and Schneider [1]. In a few words: it works on the computational * (fictitious) calendar where the year starts in March, month = 2 * (*), and finishes in February, month = 13. This calendar is * mathematically convenient because the day of the year does not * depend on whether the year is leap or not. For instance: * * March 1st 0-th day of the year; * ... * April 1st 31-st day of the year; * ... * January 1st 306-th day of the year; (Important!) * ... * February 28th 364-th day of the year; * February 29th 365-th day of the year (if it exists). * * After having worked out the date in the computational calendar * (using just arithmetics) it's easy to convert it to the * corresponding date in the Gregorian calendar. * * [1] "Euclidean Affine Functions and Applications to Calendar * Algorithms". https://arxiv.org/abs/2102.06959 * * (*) The numbering of months follows rtc_time more closely and * thus, is slightly different from [1]. */ udays = ((u32) days) + 719468; u32tmp = 4 * udays + 3; century = u32tmp / 146097; day_of_century = u32tmp % 146097 / 4; u32tmp = 4 * day_of_century + 3; u64tmp = 2939745ULL * u32tmp; year_of_century = upper_32_bits(u64tmp); day_of_year = lower_32_bits(u64tmp) / 2939745 / 4; year = 100 * century + year_of_century; is_leap_year = year_of_century != 0 ? year_of_century % 4 == 0 : century % 4 == 0; u32tmp = 2141 * day_of_year + 132377; month = u32tmp >> 16; day = ((u16) u32tmp) / 2141; /* * Recall that January 01 is the 306-th day of the year in the * computational (not Gregorian) calendar. 
*/ is_Jan_or_Feb = day_of_year >= 306; /* Converts to the Gregorian calendar. */ year = year + is_Jan_or_Feb; month = is_Jan_or_Feb ? month - 12 : month; day = day + 1; day_of_year = is_Jan_or_Feb ? day_of_year - 306 : day_of_year + 31 + 28 + is_leap_year; /* Converts to rtc_time's format. */ tm->tm_year = (int) (year - 1900); tm->tm_mon = (int) month; tm->tm_mday = (int) day; tm->tm_yday = (int) day_of_year + 1; tm->tm_hour = secs / 3600; secs -= tm->tm_hour * 3600; tm->tm_min = secs / 60; tm->tm_sec = secs - tm->tm_min * 60; tm->tm_isdst = 0; } EXPORT_SYMBOL(rtc_time64_to_tm); /* * Does the rtc_time represent a valid date/time? */ int rtc_valid_tm(struct rtc_time *tm) { if (tm->tm_year < 70 || tm->tm_year > (INT_MAX - 1900) || ((unsigned int)tm->tm_mon) >= 12 || tm->tm_mday < 1 || tm->tm_mday > rtc_month_days(tm->tm_mon, ((unsigned int)tm->tm_year + 1900)) || ((unsigned int)tm->tm_hour) >= 24 || ((unsigned int)tm->tm_min) >= 60 || ((unsigned int)tm->tm_sec) >= 60) return -EINVAL; return 0; } EXPORT_SYMBOL(rtc_valid_tm); /* * rtc_tm_to_time64 - Converts rtc_time to time64_t. * Convert Gregorian date to seconds since 01-01-1970 00:00:00. */ time64_t rtc_tm_to_time64(struct rtc_time *tm) { return mktime64(((unsigned int)tm->tm_year + 1900), tm->tm_mon + 1, tm->tm_mday, tm->tm_hour, tm->tm_min, tm->tm_sec); } EXPORT_SYMBOL(rtc_tm_to_time64); /* * Convert rtc_time to ktime */ ktime_t rtc_tm_to_ktime(struct rtc_time tm) { return ktime_set(rtc_tm_to_time64(&tm), 0); } EXPORT_SYMBOL_GPL(rtc_tm_to_ktime); /* * Convert ktime to rtc_time */ struct rtc_time rtc_ktime_to_tm(ktime_t kt) { struct timespec64 ts; struct rtc_time ret; ts = ktime_to_timespec64(kt); /* Round up any ns */ if (ts.tv_nsec) ts.tv_sec++; rtc_time64_to_tm(ts.tv_sec, &ret); return ret; } EXPORT_SYMBOL_GPL(rtc_ktime_to_tm); |
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *	Spanning tree protocol; interface code
 *	Linux ethernet bridge
 *
 *	Authors:
 *	Lennert Buytenhek		<buytenh@gnu.org>
 */

#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/etherdevice.h>
#include <linux/rtnetlink.h>
#include <net/switchdev.h>

#include "br_private.h"
#include "br_private_stp.h"


/* Port id is composed of priority and port number.
 * NB: some bits of priority are dropped to
 *     make room for more ports.
 *
 * The priority occupies the bits above BR_PORT_BITS, the port number the
 * low BR_PORT_BITS bits.
 */
static inline port_id br_make_port_id(__u8 priority, __u16 port_no)
{
	return ((u16)priority << BR_PORT_BITS)
		| (port_no & ((1<<BR_PORT_BITS)-1));
}

/* Largest priority that still fits above the port-number bits of a port id */
#define BR_MAX_PORT_PRIORITY ((u16)~0 >> BR_PORT_BITS)

/* Reset the STP state of port @p: recompute its port id, make it a
 * designated port in the blocking state, clear its pending flags and push
 * the bridge's ageing time to the port's device.  A failure to set the
 * ageing time is only logged.
 *
 * called under bridge lock
 */
void br_init_port(struct net_bridge_port *p)
{
	int err;

	p->port_id = br_make_port_id(p->priority, p->port_no);
	br_become_designated_port(p);
	br_set_state(p, BR_STATE_BLOCKING);
	p->topology_change_ack = 0;
	p->config_pending = 0;

	err = __set_ageing_time(p->dev, p->br->ageing_time);
	if (err)
		netdev_err(p->dev, "failed to offload ageing time\n");
}

/* Start STP processing on bridge @br: arm the hello timer when kernel STP
 * is in use, schedule the gc work, and enable every port whose device is
 * running and operationally up.  Takes and releases the bridge lock.
 *
 * NO locks held
 */
void br_stp_enable_bridge(struct net_bridge *br)
{
	struct net_bridge_port *p;

	spin_lock_bh(&br->lock);
	if (br->stp_enabled == BR_KERNEL_STP)
		mod_timer(&br->hello_timer, jiffies + br->hello_time);
	mod_delayed_work(system_long_wq, &br->gc_work, HZ / 10);

	br_config_bpdu_generation(br);

	list_for_each_entry(p, &br->port_list, list) {
		if (netif_running(p->dev) && netif_oper_up(p->dev))
			br_stp_enable_port(p);

	}
	spin_unlock_bh(&br->lock);
}

/* Stop STP processing on bridge @br: disable every port that is not
 * already disabled and clear the topology-change state under the bridge
 * lock, then, after dropping the lock, synchronously delete the bridge
 * timers and cancel the gc work.
 *
 * NO locks held
 */
void br_stp_disable_bridge(struct net_bridge *br)
{
	struct net_bridge_port *p;

	spin_lock_bh(&br->lock);
	list_for_each_entry(p, &br->port_list, list) {
		if (p->state != BR_STATE_DISABLED)
			br_stp_disable_port(p);

	}

	__br_set_topology_change(br, 0);
	br->topology_change_detected = 0;
	spin_unlock_bh(&br->lock);

	/* the _sync variants are called only after the lock is released */
	timer_delete_sync(&br->hello_timer);
	timer_delete_sync(&br->topology_change_timer);
	timer_delete_sync(&br->tcn_timer);
	cancel_delayed_work_sync(&br->gc_work);
}

/* Re-initialise port @p, rerun port state selection for its bridge and
 * send an RTM_NEWLINK notification for the port.
 *
 * called under bridge lock
 */
void br_stp_enable_port(struct net_bridge_port *p)
{
	br_init_port(p);
	br_port_state_selection(p->br);
	br_ifinfo_notify(RTM_NEWLINK, NULL, p);
}

/* Put port @p into the disabled state: stop its timers, delete its FDB
 * entries unless a backup port is set, disable multicast on it and
 * recompute the bridge configuration.  If the bridge becomes root as a
 * result, run the become-root handling.
 *
 * called under bridge lock
 */
void br_stp_disable_port(struct net_bridge_port *p)
{
	struct net_bridge *br = p->br;
	int wasroot;

	wasroot = br_is_root_bridge(br);
	br_become_designated_port(p);
	br_set_state(p, BR_STATE_DISABLED);
	p->topology_change_ack = 0;
	p->config_pending = 0;

	br_ifinfo_notify(RTM_NEWLINK, NULL, p);

	timer_delete(&p->message_age_timer);
	timer_delete(&p->forward_delay_timer);
	timer_delete(&p->hold_timer);

	if (!rcu_access_pointer(p->backup_port))
		br_fdb_delete_by_port(br, p, 0, 0);
	br_multicast_disable_port(p);

	br_configuration_update(br);

	br_port_state_selection(br);

	if (br_is_root_bridge(br) && !wasroot)
		br_become_root_bridge(br);
}

/* Run the userspace helper BR_STP_PROG with the bridge device name and
 * @arg as arguments and wait for it to finish.
 *
 * A positive result is logged either as a signal number (low byte
 * non-zero) or as an exit code (second byte).  Returns the helper result
 * unchanged.
 */
static int br_stp_call_user(struct net_bridge *br, char *arg)
{
	char *argv[] = { BR_STP_PROG, br->dev->name, arg, NULL };
	char *envp[] = { NULL };
	int rc;

	/* call userspace STP and report program errors */
	rc = call_usermodehelper(BR_STP_PROG, argv, envp, UMH_WAIT_PROC);
	if (rc > 0) {
		if (rc & 0xff)
			br_debug(br, BR_STP_PROG " received signal %d\n",
				 rc & 0x7f);
		else
			br_debug(br, BR_STP_PROG " exited with code %d\n",
				 (rc >> 8) & 0xff);
	}

	return rc;
}

/* Enable STP on @br.  The userspace helper is only tried for bridges in
 * the initial network namespace; if it is not tried or does not return 0,
 * kernel STP is used instead.  The forward delay is clamped to
 * [BR_MIN_FORWARD_DELAY, BR_MAX_FORWARD_DELAY] in either case.
 */
static void br_stp_start(struct net_bridge *br)
{
	/* -ENOENT doubles as "helper not attempted" and is not reported */
	int err = -ENOENT;

	if (net_eq(dev_net(br->dev), &init_net))
		err = br_stp_call_user(br, "start");

	if (err && err != -ENOENT)
		br_err(br, "failed to start userspace STP (%d)\n", err);

	spin_lock_bh(&br->lock);

	if (br->bridge_forward_delay < BR_MIN_FORWARD_DELAY)
		__br_set_forward_delay(br, BR_MIN_FORWARD_DELAY);
	else if (br->bridge_forward_delay > BR_MAX_FORWARD_DELAY)
		__br_set_forward_delay(br, BR_MAX_FORWARD_DELAY);

	if (!err) {
		br->stp_enabled = BR_USER_STP;
		br_debug(br, "userspace STP started\n");
	} else {
		br->stp_enabled = BR_KERNEL_STP;
		br_debug(br, "using kernel STP\n");

		/* To start timers on any ports left in blocking */
		if (br->dev->flags & IFF_UP)
			mod_timer(&br->hello_timer, jiffies + br->hello_time);
		br_port_state_selection(br);
	}

	spin_unlock_bh(&br->lock);
}

/* Disable STP on @br.  When userspace STP was active the helper is told
 * to stop (a failure is only logged) and port state selection is rerun
 * under the bridge lock.
 */
static void br_stp_stop(struct net_bridge *br)
{
	int err;

	if (br->stp_enabled == BR_USER_STP) {
		err = br_stp_call_user(br, "stop");
		if (err)
			br_err(br, "failed to stop userspace STP (%d)\n", err);

		/* To start timers on any ports left in blocking */
		spin_lock_bh(&br->lock);
		br_port_state_selection(br);
		spin_unlock_bh(&br->lock);
	}

	br->stp_enabled = BR_NO_STP;
}

/* Turn STP on (@val non-zero) or off (@val zero) for @br; a request that
 * matches the current state is a no-op.  Must be called with RTNL held.
 *
 * Returns 0, or -EINVAL (with a message in @extack) if MRP is enabled on
 * the bridge.
 */
int br_stp_set_enabled(struct net_bridge *br, unsigned long val,
		       struct netlink_ext_ack *extack)
{
	ASSERT_RTNL();

	if (br_mrp_enabled(br)) {
		NL_SET_ERR_MSG_MOD(extack,
				   "STP can't be enabled if MRP is already enabled");
		return -EINVAL;
	}

	if (val) {
		if (br->stp_enabled == BR_NO_STP)
			br_stp_start(br);
	} else {
		if (br->stp_enabled != BR_NO_STP)
			br_stp_stop(br);
	}

	return 0;
}

/* Change the MAC address part of the bridge id to @addr: update the FDB,
 * the bridge device address and every port's designated bridge/root
 * address that still carried the old value, then recompute the
 * configuration.
 *
 * called under bridge lock
 */
void br_stp_change_bridge_id(struct net_bridge *br, const unsigned char *addr)
{
	/* should be aligned on 2 bytes for ether_addr_equal() */
	unsigned short oldaddr_aligned[ETH_ALEN >> 1];
	unsigned char *oldaddr = (unsigned char *)oldaddr_aligned;
	struct net_bridge_port *p;
	int wasroot;

	wasroot = br_is_root_bridge(br);

	br_fdb_change_mac_address(br, addr);

	/* keep a copy of the old address for the comparisons below */
	memcpy(oldaddr, br->bridge_id.addr, ETH_ALEN);
	memcpy(br->bridge_id.addr, addr, ETH_ALEN);
	eth_hw_addr_set(br->dev, addr);

	list_for_each_entry(p, &br->port_list, list) {
		if (ether_addr_equal(p->designated_bridge.addr, oldaddr))
			memcpy(p->designated_bridge.addr, addr, ETH_ALEN);

		if (ether_addr_equal(p->designated_root.addr, oldaddr))
			memcpy(p->designated_root.addr, addr, ETH_ALEN);
	}

	br_configuration_update(br);
	br_port_state_selection(br);
	if (br_is_root_bridge(br) && !wasroot)
		br_become_root_bridge(br);
}

/* should be aligned on 2 bytes for ether_addr_equal() */
static const unsigned short br_mac_zero_aligned[ETH_ALEN >> 1];

/* Pick the bridge address from the port devices: the address that
 * compares lowest with memcmp(), or the all-zero address if the bridge
 * has no ports.  Nothing is done when the bridge device's address was set
 * explicitly (NET_ADDR_SET).
 *
 * Returns true if the bridge id was changed, false otherwise.
 *
 * called under bridge lock
 */
bool br_stp_recalculate_bridge_id(struct net_bridge *br)
{
	const unsigned char *br_mac_zero =
			(const unsigned char *)br_mac_zero_aligned;
	const unsigned char *addr = br_mac_zero;
	struct net_bridge_port *p;

	/* user has chosen a value so keep it */
	if (br->dev->addr_assign_type == NET_ADDR_SET)
		return false;

	list_for_each_entry(p, &br->port_list, list) {
		/* the pointer comparison detects "no candidate yet" */
		if (addr == br_mac_zero ||
		    memcmp(p->dev->dev_addr, addr, ETH_ALEN) < 0)
			addr = p->dev->dev_addr;

	}

	if (ether_addr_equal(br->bridge_id.addr, addr))
		return false;	/* no change */

	br_stp_change_bridge_id(br, addr);
	return true;
}

/* Set the priority part of the bridge id to @newprio (stored big-endian
 * in prio[0..1]), also on every enabled designated port, and recompute
 * the configuration.
 *
 * Acquires and releases bridge lock
 */
void br_stp_set_bridge_priority(struct net_bridge *br, u16 newprio)
{
	struct net_bridge_port *p;
	int wasroot;

	spin_lock_bh(&br->lock);
	wasroot = br_is_root_bridge(br);

	list_for_each_entry(p, &br->port_list, list) {
		if (p->state != BR_STATE_DISABLED &&
		    br_is_designated_port(p)) {
			p->designated_bridge.prio[0] = (newprio >> 8) & 0xFF;
			p->designated_bridge.prio[1] = newprio & 0xFF;
		}

	}

	br->bridge_id.prio[0] = (newprio >> 8) & 0xFF;
	br->bridge_id.prio[1] = newprio & 0xFF;
	br_configuration_update(br);
	br_port_state_selection(br);
	if (br_is_root_bridge(br) && !wasroot)
		br_become_root_bridge(br);
	spin_unlock_bh(&br->lock);
}

/* Set the priority of port @p to @newprio and rebuild its port id.
 *
 * Returns 0, or -ERANGE if @newprio exceeds BR_MAX_PORT_PRIORITY.
 *
 * called under bridge lock
 */
int br_stp_set_port_priority(struct net_bridge_port *p, unsigned long newprio)
{
	port_id new_port_id;

	if (newprio > BR_MAX_PORT_PRIORITY)
		return -ERANGE;

	new_port_id = br_make_port_id(newprio, p->port_no);
	if (br_is_designated_port(p))
		p->designated_port = new_port_id;

	p->port_id = new_port_id;
	p->priority = newprio;
	/* 8 bytes are compared between the bridge id and the designated bridge */
	if (!memcmp(&p->br->bridge_id, &p->designated_bridge, 8) &&
	    p->port_id < p->designated_port) {
		br_become_designated_port(p);
		br_port_state_selection(p->br);
	}

	return 0;
}

/* Set the path cost of port @p to @path_cost, flag it as administratively
 * configured (BR_ADMIN_COST) and recompute the configuration.
 *
 * Returns 0, or -ERANGE if @path_cost is outside
 * [BR_MIN_PATH_COST, BR_MAX_PATH_COST].
 *
 * called under bridge lock
 */
int br_stp_set_path_cost(struct net_bridge_port *p, unsigned long path_cost)
{
	if (path_cost < BR_MIN_PATH_COST ||
	    path_cost > BR_MAX_PATH_COST)
		return -ERANGE;

	p->flags |= BR_ADMIN_COST;
	p->path_cost = path_cost;
	br_configuration_update(p->br);
	br_port_state_selection(p->br);
	return 0;
}

/* Format bridge id @id into @buf as "pppp.aabbccddeeff\n" (two priority
 * bytes, a dot, six address bytes, all lower-case hex) and return the
 * number of characters written.  The size of @buf is not checked here.
 */
ssize_t br_show_bridge_id(char *buf, const struct bridge_id *id)
{
	return sprintf(buf, "%.2x%.2x.%.2x%.2x%.2x%.2x%.2x%.2x\n",
	       id->prio[0], id->prio[1],
	       id->addr[0], id->addr[1], id->addr[2],
	       id->addr[3], id->addr[4], id->addr[5]);
}
2 1 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 
525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 
1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 
1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 
1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 | // SPDX-License-Identifier: GPL-2.0+ /* * inode.c -- user mode filesystem api for usb gadget controllers * * Copyright (C) 2003-2004 David Brownell * Copyright (C) 2003 Agilent Technologies */ /* #define VERBOSE_DEBUG */ #include <linux/init.h> #include <linux/module.h> #include <linux/fs.h> #include <linux/fs_context.h> #include 
<linux/pagemap.h> #include <linux/uts.h> #include <linux/wait.h> #include <linux/compiler.h> #include <linux/uaccess.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/string_choices.h> #include <linux/poll.h> #include <linux/kthread.h> #include <linux/aio.h> #include <linux/uio.h> #include <linux/refcount.h> #include <linux/delay.h> #include <linux/device.h> #include <linux/moduleparam.h> #include <linux/usb/gadgetfs.h> #include <linux/usb/gadget.h> #include <linux/usb/composite.h> /* for USB_GADGET_DELAYED_STATUS */ /* Undef helpers from linux/usb/composite.h as gadgetfs redefines them */ #undef DBG #undef ERROR #undef INFO /* * The gadgetfs API maps each endpoint to a file descriptor so that you * can use standard synchronous read/write calls for I/O. There's some * O_NONBLOCK and O_ASYNC/FASYNC style i/o support. Example usermode * drivers show how this works in practice. You can also use AIO to * eliminate I/O gaps between requests, to help when streaming data. * * Key parts that must be USB-specific are protocols defining how the * read/write operations relate to the hardware state machines. There * are two types of files. One type is for the device, implementing ep0. * The other type is for each IN or OUT endpoint. In both cases, the * user mode driver must configure the hardware before using it. * * - First, dev_config() is called when /dev/gadget/$CHIP is configured * (by writing configuration and device descriptors). Afterwards it * may serve as a source of device events, used to handle all control * requests other than basic enumeration. * * - Then, after a SET_CONFIGURATION control request, ep_config() is * called when each /dev/gadget/ep* file is configured (by writing * endpoint descriptors). Afterwards these files are used to write() * IN data or to read() OUT data. To halt the endpoint, a "wrong * direction" request is issued (like reading an IN endpoint). 
*
 * Unlike "usbfs" the only ioctl()s are for things that are rare, and maybe
 * not possible on all hardware. For example, precise fault handling with
 * respect to data left in endpoint fifos after aborted operations; or
 * selective clearing of endpoint halts, to implement SET_INTERFACE.
 */

#define DRIVER_DESC "USB Gadget filesystem"
#define DRIVER_VERSION "24 Aug 2004"

static const char driver_desc [] = DRIVER_DESC;
/* prefix printed by xprintk() in front of every log message */
static const char shortname [] = "gadgetfs";

MODULE_DESCRIPTION (DRIVER_DESC);
MODULE_AUTHOR ("David Brownell");
MODULE_LICENSE ("GPL");

static int ep_open(struct inode *, struct file *);


/*----------------------------------------------------------------------*/

#define GADGETFS_MAGIC 0xaee71ee7

/* /dev/gadget/$CHIP represents ep0 and the whole device */
enum ep0_state {
	/* DISABLED is the initial state. */
	STATE_DEV_DISABLED = 0,

	/* Only one open() of /dev/gadget/$CHIP; only one file tracks
	 * ep0/device i/o modes and binding to the controller. Driver
	 * must always write descriptors to initialize the device, then
	 * the device becomes UNCONNECTED until enumeration.
	 */
	STATE_DEV_OPENED,

	/* From then on, ep0 fd is in either of two basic modes:
	 * - (UN)CONNECTED: read usb_gadgetfs_event(s) from it
	 * - SETUP: read/write will transfer control data and succeed;
	 *   or if "wrong direction", performs protocol stall
	 */
	STATE_DEV_UNCONNECTED,
	STATE_DEV_CONNECTED,
	STATE_DEV_SETUP,

	/* UNBOUND means the driver closed ep0, so the device won't be
	 * accessible again (DEV_DISABLED) until all fds are closed.
	 */
	STATE_DEV_UNBOUND,
};

/* enough for the whole queue: most events invalidate others */
#define N_EVENT 5

/* size in bytes of the dev_data.rbuf scratch buffer */
#define RBUF_SIZE 256

/*
 * Per-device state behind /dev/gadget/$CHIP. Reference counted through
 * get_dev()/put_dev(); every ep_data holds one reference (see put_ep()).
 */
struct dev_data {
	spinlock_t lock;
	refcount_t count;
	int udc_usage;
	enum ep0_state state; /* P: lock */
	struct usb_gadgetfs_event event [N_EVENT];
	unsigned ev_next;
	struct fasync_struct *fasync;
	u8 current_config;

	/* drivers reading ep0 MUST handle control requests (SETUP)
	 * reported that way; else the host will time out.
	 */
	unsigned usermode_setup : 1,
		 setup_in : 1,
		 setup_can_stall : 1,
		 setup_out_ready : 1,
		 setup_out_error : 1,
		 setup_abort : 1,
		 gadget_registered : 1;
	unsigned setup_wLength;

	/* the rest is basically write-once */
	struct usb_config_descriptor *config, *hs_config;
	struct usb_device_descriptor *dev;
	struct usb_request *req;
	struct usb_gadget *gadget;
	struct list_head epfiles;
	void *buf;
	wait_queue_head_t wait;
	struct super_block *sb;
	struct dentry *dentry;

	/* except this scratch i/o buffer for ep0 */
	u8 rbuf[RBUF_SIZE];
};

/* Take an additional reference on @data. */
static inline void get_dev (struct dev_data *data)
{
	refcount_inc (&data->count);
}

/*
 * Drop a reference on @data and free it when that was the last one.
 * It is a BUG for anyone to still be sleeping on data->wait at that point.
 */
static void put_dev (struct dev_data *data)
{
	if (likely (!refcount_dec_and_test (&data->count)))
		return;
	/* needs no more cleanup */
	BUG_ON (waitqueue_active (&data->wait));
	kfree (data);
}

/*
 * Allocate a zeroed dev_data in state STATE_DEV_DISABLED holding a single
 * reference, with its lock, epfiles list and wait queue initialised.
 *
 * Returns NULL when the allocation fails.
 */
static struct dev_data *dev_new (void)
{
	struct dev_data *dev;

	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
	if (!dev)
		return NULL;
	dev->state = STATE_DEV_DISABLED;
	refcount_set (&dev->count, 1);
	spin_lock_init (&dev->lock);
	INIT_LIST_HEAD (&dev->epfiles);
	init_waitqueue_head (&dev->wait);
	return dev;
}

/*----------------------------------------------------------------------*/

/* other /dev/gadget/$ENDPOINT files represent endpoints */
enum ep_state {
	STATE_EP_DISABLED = 0,
	STATE_EP_READY,
	STATE_EP_ENABLED,
	STATE_EP_UNBOUND,
};

/*
 * Per-endpoint state behind /dev/gadget/$ENDPOINT. Reference counted
 * through get_ep()/put_ep().
 */
struct ep_data {
	struct mutex lock;
	enum ep_state state;
	refcount_t count;
	struct dev_data *dev;
	/* must hold dev->lock before accessing ep or req */
	struct usb_ep *ep;
	struct usb_request *req;
	/* result of the last synchronous request, set by epio_complete() */
	ssize_t status;
	char name [16];
	struct usb_endpoint_descriptor desc, hs_desc;
	struct list_head epfiles;
	wait_queue_head_t wait;
	struct dentry *dentry;
};

/* Take an additional reference on @data. */
static inline void get_ep (struct ep_data *data)
{
	refcount_inc (&data->count);
}

/*
 * Drop a reference on @data. On the last one, release the endpoint's
 * reference on its device and free it. The endpoint must already be off
 * the epfiles list with nobody waiting on data->wait, else BUG.
 */
static void put_ep (struct ep_data *data)
{
	if (likely (!refcount_dec_and_test (&data->count)))
		return;
	put_dev (data->dev);
	/* needs no more cleanup */
	BUG_ON (!list_empty (&data->epfiles));
	BUG_ON (waitqueue_active (&data->wait));
	kfree (data);
}

/*----------------------------------------------------------------------*/

/* most "how to use the hardware" policy choices are in userspace:
 * mapping endpoint roles (which the driver needs) to the capabilities
 * which the usb controller has. most of those capabilities are exposed
 * implicitly, starting with the driver name and then endpoint names.
 */

static const char *CHIP;
static DEFINE_MUTEX(sb_mutex); /* Serialize superblock operations */

/*----------------------------------------------------------------------*/

/* NOTE: don't use dev_printk calls before binding to the gadget
 * at the end of ep0 configuration, or after unbind.
 */

/* too wordy: dev_printk(level , &(d)->gadget->dev , fmt , ## args) */
/* the first argument (d) is accepted but not used by the expansion */
#define xprintk(d,level,fmt,args...) \
	printk(level "%s: " fmt , shortname , ## args)

#ifdef DEBUG
#define DBG(dev,fmt,args...) \
	xprintk(dev , KERN_DEBUG , fmt , ## args)
#else
#define DBG(dev,fmt,args...) \
	do { } while (0)
#endif /* DEBUG */

#ifdef VERBOSE_DEBUG
#define VDEBUG DBG
#else
#define VDEBUG(dev,fmt,args...) \
	do { } while (0)
#endif /* VERBOSE_DEBUG */

#define ERROR(dev,fmt,args...) \
	xprintk(dev , KERN_ERR , fmt , ## args)
#define INFO(dev,fmt,args...) \
	xprintk(dev , KERN_INFO , fmt , ## args)


/*----------------------------------------------------------------------*/

/* SYNCHRONOUS ENDPOINT OPERATIONS (bulk/intr/iso)
 *
 * After opening, configure non-control endpoints. Then use normal
 * stream read() and write() requests; and maybe ioctl() to get more
 * precise FIFO status when recovering from cancellation.
 */

/*
 * Completion callback for synchronous endpoint requests.
 *
 * Records the outcome in epdata->status -- req->status when it is non-zero,
 * otherwise the byte count req->actual -- and signals the completion that
 * req->context points to. Requests without a context are ignored.
 */
static void epio_complete (struct usb_ep *ep, struct usb_request *req)
{
	struct ep_data *epdata = ep->driver_data;

	if (!req->context)
		return;
	if (req->status)
		epdata->status = req->status;
	else
		epdata->status = req->actual;
	complete ((struct completion *)req->context);
}

/* tasklock endpoint, returning when it's connected.
 * still need dev->lock to use epdata->ep.
*/ static int get_ready_ep (unsigned f_flags, struct ep_data *epdata, bool is_write) { int val; if (f_flags & O_NONBLOCK) { if (!mutex_trylock(&epdata->lock)) goto nonblock; if (epdata->state != STATE_EP_ENABLED && (!is_write || epdata->state != STATE_EP_READY)) { mutex_unlock(&epdata->lock); nonblock: val = -EAGAIN; } else val = 0; return val; } val = mutex_lock_interruptible(&epdata->lock); if (val < 0) return val; switch (epdata->state) { case STATE_EP_ENABLED: return 0; case STATE_EP_READY: /* not configured yet */ if (is_write) return 0; fallthrough; case STATE_EP_UNBOUND: /* clean disconnect */ break; // case STATE_EP_DISABLED: /* "can't happen" */ default: /* error! */ pr_debug ("%s: ep %p not available, state %d\n", shortname, epdata, epdata->state); } mutex_unlock(&epdata->lock); return -ENODEV; } static ssize_t ep_io (struct ep_data *epdata, void *buf, unsigned len) { DECLARE_COMPLETION_ONSTACK (done); int value; spin_lock_irq (&epdata->dev->lock); if (likely (epdata->ep != NULL)) { struct usb_request *req = epdata->req; req->context = &done; req->complete = epio_complete; req->buf = buf; req->length = len; value = usb_ep_queue (epdata->ep, req, GFP_ATOMIC); } else value = -ENODEV; spin_unlock_irq (&epdata->dev->lock); if (likely (value == 0)) { value = wait_for_completion_interruptible(&done); if (value != 0) { spin_lock_irq (&epdata->dev->lock); if (likely (epdata->ep != NULL)) { DBG (epdata->dev, "%s i/o interrupted\n", epdata->name); usb_ep_dequeue (epdata->ep, epdata->req); spin_unlock_irq (&epdata->dev->lock); wait_for_completion(&done); if (epdata->status == -ECONNRESET) epdata->status = -EINTR; } else { spin_unlock_irq (&epdata->dev->lock); DBG (epdata->dev, "endpoint gone\n"); wait_for_completion(&done); epdata->status = -ENODEV; } } return epdata->status; } return value; } static int ep_release (struct inode *inode, struct file *fd) { struct ep_data *data = fd->private_data; int value; value = mutex_lock_interruptible(&data->lock); if (value < 
0) return value; /* clean up if this can be reopened */ if (data->state != STATE_EP_UNBOUND) { data->state = STATE_EP_DISABLED; data->desc.bDescriptorType = 0; data->hs_desc.bDescriptorType = 0; usb_ep_disable(data->ep); } mutex_unlock(&data->lock); put_ep (data); return 0; } static long ep_ioctl(struct file *fd, unsigned code, unsigned long value) { struct ep_data *data = fd->private_data; int status; if ((status = get_ready_ep (fd->f_flags, data, false)) < 0) return status; spin_lock_irq (&data->dev->lock); if (likely (data->ep != NULL)) { switch (code) { case GADGETFS_FIFO_STATUS: status = usb_ep_fifo_status (data->ep); break; case GADGETFS_FIFO_FLUSH: usb_ep_fifo_flush (data->ep); break; case GADGETFS_CLEAR_HALT: status = usb_ep_clear_halt (data->ep); break; default: status = -ENOTTY; } } else status = -ENODEV; spin_unlock_irq (&data->dev->lock); mutex_unlock(&data->lock); return status; } /*----------------------------------------------------------------------*/ /* ASYNCHRONOUS ENDPOINT I/O OPERATIONS (bulk/intr/iso) */ struct kiocb_priv { struct usb_request *req; struct ep_data *epdata; struct kiocb *iocb; struct mm_struct *mm; struct work_struct work; void *buf; struct iov_iter to; const void *to_free; unsigned actual; }; static int ep_aio_cancel(struct kiocb *iocb) { struct kiocb_priv *priv = iocb->private; struct ep_data *epdata; int value; local_irq_disable(); epdata = priv->epdata; // spin_lock(&epdata->dev->lock); if (likely(epdata && epdata->ep && priv->req)) value = usb_ep_dequeue (epdata->ep, priv->req); else value = -EINVAL; // spin_unlock(&epdata->dev->lock); local_irq_enable(); return value; } static void ep_user_copy_worker(struct work_struct *work) { struct kiocb_priv *priv = container_of(work, struct kiocb_priv, work); struct mm_struct *mm = priv->mm; struct kiocb *iocb = priv->iocb; size_t ret; kthread_use_mm(mm); ret = copy_to_iter(priv->buf, priv->actual, &priv->to); kthread_unuse_mm(mm); if (!ret) ret = -EFAULT; /* completing the iocb can 
drop the ctx and mm, don't touch mm after */ iocb->ki_complete(iocb, ret); kfree(priv->buf); kfree(priv->to_free); kfree(priv); } static void ep_aio_complete(struct usb_ep *ep, struct usb_request *req) { struct kiocb *iocb = req->context; struct kiocb_priv *priv = iocb->private; struct ep_data *epdata = priv->epdata; /* lock against disconnect (and ideally, cancel) */ spin_lock(&epdata->dev->lock); priv->req = NULL; priv->epdata = NULL; /* if this was a write or a read returning no data then we * don't need to copy anything to userspace, so we can * complete the aio request immediately. */ if (priv->to_free == NULL || unlikely(req->actual == 0)) { kfree(req->buf); kfree(priv->to_free); kfree(priv); iocb->private = NULL; iocb->ki_complete(iocb, req->actual ? req->actual : (long)req->status); } else { /* ep_copy_to_user() won't report both; we hide some faults */ if (unlikely(0 != req->status)) DBG(epdata->dev, "%s fault %d len %d\n", ep->name, req->status, req->actual); priv->buf = req->buf; priv->actual = req->actual; INIT_WORK(&priv->work, ep_user_copy_worker); schedule_work(&priv->work); } usb_ep_free_request(ep, req); spin_unlock(&epdata->dev->lock); put_ep(epdata); } static ssize_t ep_aio(struct kiocb *iocb, struct kiocb_priv *priv, struct ep_data *epdata, char *buf, size_t len) { struct usb_request *req; ssize_t value; iocb->private = priv; priv->iocb = iocb; kiocb_set_cancel_fn(iocb, ep_aio_cancel); get_ep(epdata); priv->epdata = epdata; priv->actual = 0; priv->mm = current->mm; /* mm teardown waits for iocbs in exit_aio() */ /* each kiocb is coupled to one usb_request, but we can't * allocate or submit those if the host disconnected. 
*/ spin_lock_irq(&epdata->dev->lock); value = -ENODEV; if (unlikely(epdata->ep == NULL)) goto fail; req = usb_ep_alloc_request(epdata->ep, GFP_ATOMIC); value = -ENOMEM; if (unlikely(!req)) goto fail; priv->req = req; req->buf = buf; req->length = len; req->complete = ep_aio_complete; req->context = iocb; value = usb_ep_queue(epdata->ep, req, GFP_ATOMIC); if (unlikely(0 != value)) { usb_ep_free_request(epdata->ep, req); goto fail; } spin_unlock_irq(&epdata->dev->lock); return -EIOCBQUEUED; fail: spin_unlock_irq(&epdata->dev->lock); kfree(priv->to_free); kfree(priv); put_ep(epdata); return value; } static ssize_t ep_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; struct ep_data *epdata = file->private_data; size_t len = iov_iter_count(to); ssize_t value; char *buf; if ((value = get_ready_ep(file->f_flags, epdata, false)) < 0) return value; /* halt any endpoint by doing a "wrong direction" i/o call */ if (usb_endpoint_dir_in(&epdata->desc)) { if (usb_endpoint_xfer_isoc(&epdata->desc) || !is_sync_kiocb(iocb)) { mutex_unlock(&epdata->lock); return -EINVAL; } DBG (epdata->dev, "%s halt\n", epdata->name); spin_lock_irq(&epdata->dev->lock); if (likely(epdata->ep != NULL)) usb_ep_set_halt(epdata->ep); spin_unlock_irq(&epdata->dev->lock); mutex_unlock(&epdata->lock); return -EBADMSG; } buf = kmalloc(len, GFP_KERNEL); if (unlikely(!buf)) { mutex_unlock(&epdata->lock); return -ENOMEM; } if (is_sync_kiocb(iocb)) { value = ep_io(epdata, buf, len); if (value >= 0 && (copy_to_iter(buf, value, to) != value)) value = -EFAULT; } else { struct kiocb_priv *priv = kzalloc(sizeof *priv, GFP_KERNEL); value = -ENOMEM; if (!priv) goto fail; priv->to_free = dup_iter(&priv->to, to, GFP_KERNEL); if (!iter_is_ubuf(&priv->to) && !priv->to_free) { kfree(priv); goto fail; } value = ep_aio(iocb, priv, epdata, buf, len); if (value == -EIOCBQUEUED) buf = NULL; } fail: kfree(buf); mutex_unlock(&epdata->lock); return value; } static ssize_t ep_config(struct 
ep_data *, const char *, size_t);

/*
 * ->write_iter() for endpoint files.
 *
 * While the endpoint is still STATE_EP_READY the written bytes are the
 * endpoint descriptors and are passed to ep_config(). Once it is enabled,
 * writing an OUT endpoint is the "wrong direction": a synchronous write on
 * a non-isochronous endpoint halts it and returns -EBADMSG, anything else
 * returns -EINVAL. Otherwise the data is copied into a bounce buffer
 * (-EFAULT on failure) and sent through ep_io() or, for asynchronous
 * iocbs, ep_aio().
 *
 * epdata->lock, taken by get_ready_ep(), is released on every return path.
 */
static ssize_t
ep_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct ep_data *epdata = file->private_data;
	size_t len = iov_iter_count(from);
	bool configured;
	ssize_t value;
	char *buf;

	if ((value = get_ready_ep(file->f_flags, epdata, true)) < 0)
		return value;

	configured = epdata->state == STATE_EP_ENABLED;

	/* halt any endpoint by doing a "wrong direction" i/o call */
	if (configured && !usb_endpoint_dir_in(&epdata->desc)) {
		if (usb_endpoint_xfer_isoc(&epdata->desc) ||
		    !is_sync_kiocb(iocb)) {
			mutex_unlock(&epdata->lock);
			return -EINVAL;
		}
		DBG (epdata->dev, "%s halt\n", epdata->name);
		spin_lock_irq(&epdata->dev->lock);
		if (likely(epdata->ep != NULL))
			usb_ep_set_halt(epdata->ep);
		spin_unlock_irq(&epdata->dev->lock);
		mutex_unlock(&epdata->lock);
		return -EBADMSG;
	}

	buf = kmalloc(len, GFP_KERNEL);
	if (unlikely(!buf)) {
		mutex_unlock(&epdata->lock);
		return -ENOMEM;
	}

	if (unlikely(!copy_from_iter_full(buf, len, from))) {
		value = -EFAULT;
		goto out;
	}

	if (unlikely(!configured)) {
		value = ep_config(epdata, buf, len);
	} else if (is_sync_kiocb(iocb)) {
		value = ep_io(epdata, buf, len);
	} else {
		struct kiocb_priv *priv = kzalloc(sizeof *priv, GFP_KERNEL);
		value = -ENOMEM;
		if (priv) {
			value = ep_aio(iocb, priv, epdata, buf, len);
			/* once queued, the buffer is freed on completion */
			if (value == -EIOCBQUEUED)
				buf = NULL;
		}
	}
out:
	kfree(buf);
	mutex_unlock(&epdata->lock);
	return value;
}

/*----------------------------------------------------------------------*/

/* used after endpoint configuration */
static const struct file_operations ep_io_operations = {
	.owner = THIS_MODULE,

	.open = ep_open,
	.release = ep_release,
	.unlocked_ioctl = ep_ioctl,
	.read_iter = ep_read_iter,
	.write_iter = ep_write_iter,
};

/* ENDPOINT INITIALIZATION
 *
 * fd = open ("/dev/gadget/$ENDPOINT", O_RDWR)
 * status = write (fd, descriptors, sizeof descriptors)
 *
 * That write establishes the endpoint configuration, configuring
 * the controller to process bulk, interrupt, or isochronous
transfers
 * at the right maxpacket size, and so on.
 *
 * The descriptors are message type 1, identified by a host order u32
 * at the beginning of what's written. Descriptor order is: full/low
 * speed descriptor, then optional high speed descriptor.
 */
/*
 * Parse the configuration message in @buf (@len bytes) and enable the
 * endpoint.
 *
 * Layout: a u32 tag that must be 1, one endpoint descriptor of
 * USB_DT_ENDPOINT_SIZE bytes, optionally followed by a second (high speed)
 * descriptor of the same size. The descriptor matching the gadget's current
 * speed is installed in ep->desc before usb_ep_enable() is called.
 *
 * Returns the original @len on success and moves the endpoint to
 * STATE_EP_ENABLED. Errors:
 *   -EL2HLT  endpoint is not in STATE_EP_READY;
 *   -EINVAL  malformed message, or the gadget speed is not low/full/high;
 *   -ENOENT  the device is unbound;
 *   -ENODEV  the endpoint is gone;
 *   or whatever usb_ep_enable() returned.
 * On every error the stored descriptor types are reset to 0.
 */
static ssize_t
ep_config (struct ep_data *data, const char *buf, size_t len)
{
	struct usb_ep *ep;
	u32 tag;
	int value, length = len;	/* length keeps the full size for the return value */

	if (data->state != STATE_EP_READY) {
		value = -EL2HLT;
		goto fail;
	}

	value = len;
	if (len < USB_DT_ENDPOINT_SIZE + 4)
		goto fail0;

	/* we might need to change message format someday */
	memcpy(&tag, buf, 4);
	if (tag != 1) {
		DBG(data->dev, "config %s, bad tag %d\n", data->name, tag);
		goto fail0;
	}
	buf += 4;
	len -= 4;

	/* NOTE: audio endpoint extensions not accepted here;
	 * just don't include the extra bytes.
	 */

	/* full/low speed descriptor, then high speed */
	memcpy(&data->desc, buf, USB_DT_ENDPOINT_SIZE);
	if (data->desc.bLength != USB_DT_ENDPOINT_SIZE
			|| data->desc.bDescriptorType != USB_DT_ENDPOINT)
		goto fail0;
	if (len != USB_DT_ENDPOINT_SIZE) {
		if (len != 2 * USB_DT_ENDPOINT_SIZE)
			goto fail0;
		memcpy(&data->hs_desc, buf + USB_DT_ENDPOINT_SIZE,
			USB_DT_ENDPOINT_SIZE);
		if (data->hs_desc.bLength != USB_DT_ENDPOINT_SIZE
				|| data->hs_desc.bDescriptorType
					!= USB_DT_ENDPOINT) {
			DBG(data->dev, "config %s, bad hs length or type\n",
					data->name);
			goto fail0;
		}
	}

	spin_lock_irq (&data->dev->lock);
	if (data->dev->state == STATE_DEV_UNBOUND) {
		value = -ENOENT;
		goto gone;
	} else {
		ep = data->ep;
		if (ep == NULL) {
			value = -ENODEV;
			goto gone;
		}
	}
	switch (data->dev->gadget->speed) {
	case USB_SPEED_LOW:
	case USB_SPEED_FULL:
		ep->desc = &data->desc;
		break;
	case USB_SPEED_HIGH:
		/* fails if caller didn't provide that descriptor... */
		ep->desc = &data->hs_desc;
		break;
	default:
		DBG(data->dev, "unconnected, %s init abandoned\n",
				data->name);
		value = -EINVAL;
		goto gone;
	}
	value = usb_ep_enable(ep);
	if (value == 0) {
		data->state = STATE_EP_ENABLED;
		value = length;
	}
gone:
	spin_unlock_irq (&data->dev->lock);
	if (value < 0) {
		/* "fail" is also jumped to directly, before dev->lock is taken */
fail:
		data->desc.bDescriptorType = 0;
		data->hs_desc.bDescriptorType = 0;
	}
	return value;
fail0:
	value = -EINVAL;
	goto fail;
}

/*
 * ->open() for endpoint files.
 *
 * Succeeds only when the endpoint is STATE_EP_DISABLED: it then becomes
 * STATE_EP_READY, a reference is taken for the file and stored in
 * fd->private_data.
 *
 * Returns 0 on success, -EINTR when waiting for data->lock was interrupted,
 * -ENOENT when the device is unbound, and -EBUSY for any other endpoint
 * state.
 */
static int
ep_open (struct inode *inode, struct file *fd)
{
	struct ep_data *data = inode->i_private;
	int value = -EBUSY;

	if (mutex_lock_interruptible(&data->lock) != 0)
		return -EINTR;
	spin_lock_irq (&data->dev->lock);
	if (data->dev->state == STATE_DEV_UNBOUND)
		value = -ENOENT;
	else if (data->state == STATE_EP_DISABLED) {
		value = 0;
		data->state = STATE_EP_READY;
		get_ep (data);
		fd->private_data = data;
		VDEBUG (data->dev, "%s ready\n", data->name);
	} else
		DBG (data->dev, "%s state %d\n",
			data->name, data->state);
	spin_unlock_irq (&data->dev->lock);
	mutex_unlock(&data->lock);
	return value;
}

/*----------------------------------------------------------------------*/

/* EP0 IMPLEMENTATION can be partly in userspace.
 *
 * Drivers that use this facility receive various events, including
 * control requests the kernel doesn't handle. Drivers that don't
 * use this facility may be too simple-minded for real applications.
*/

/* Wake sleepers on dev->wait and send SIGIO/POLL_IN to fasync listeners. */
static inline void ep0_readable (struct dev_data *dev)
{
	wake_up (&dev->wait);
	kill_fasync (&dev->fasync, SIGIO, POLL_IN);
}

/*
 * Return the ep0 request to its idle setup: free a separately allocated
 * buffer and point req->buf back at dev->rbuf, restore the default
 * completion handler and clear setup_out_ready.
 */
static void clean_req (struct usb_ep *ep, struct usb_request *req)
{
	struct dev_data *dev = ep->driver_data;

	if (req->buf != dev->rbuf) {
		kfree(req->buf);
		req->buf = dev->rbuf;
	}
	req->complete = epio_complete;
	dev->setup_out_ready = 0;
}

/*
 * Completion callback for ep0 requests prepared by setup_req().
 *
 * For control OUT (setup_in clear) the outcome is recorded in
 * setup_out_error, setup_out_ready is set and readers are woken. A
 * successfully received buffer is kept, because it still has to be read;
 * in the other cases the request is cleaned up unless it uses rbuf.
 */
static void ep0_complete (struct usb_ep *ep, struct usb_request *req)
{
	struct dev_data *dev = ep->driver_data;
	unsigned long flags;
	int free = 1;

	/* for control OUT, data must still get to userspace */
	spin_lock_irqsave(&dev->lock, flags);
	if (!dev->setup_in) {
		dev->setup_out_error = (req->status != 0);
		if (!dev->setup_out_error)
			free = 0;
		dev->setup_out_ready = 1;
		ep0_readable (dev);
	}

	/* clean up as appropriate */
	/* NOTE(review): clean_req() compares against dev->rbuf, this line
	 * against &dev->rbuf -- same address, different pointer type.
	 */
	if (free && req->buf != &dev->rbuf)
		clean_req (ep, req);
	req->complete = epio_complete;
	spin_unlock_irqrestore(&dev->lock, flags);
}

/*
 * Prepare the ep0 request for a data stage of @len bytes.
 *
 * Returns -EBUSY while earlier OUT data is still waiting to be read
 * (setup_out_ready). A buffer is allocated with GFP_ATOMIC only when @len
 * exceeds sizeof(dev->rbuf); if req->buf is NULL afterwards it is reset to
 * dev->rbuf and -ENOMEM is returned. On success returns 0 with
 * ep0_complete installed, req->length set to @len and req->zero cleared.
 */
static int setup_req (struct usb_ep *ep, struct usb_request *req, u16 len)
{
	struct dev_data *dev = ep->driver_data;

	if (dev->setup_out_ready) {
		DBG (dev, "ep0 request busy!\n");
		return -EBUSY;
	}
	if (len > sizeof (dev->rbuf))
		req->buf = kmalloc(len, GFP_ATOMIC);
	if (req->buf == NULL) {
		req->buf = dev->rbuf;
		return -ENOMEM;
	}
	req->complete = ep0_complete;
	req->length = len;
	req->zero = 0;
	return 0;
}

static ssize_t
ep0_read (struct file *fd, char __user *buf, size_t len, loff_t *ptr)
{
	struct dev_data *dev = fd->private_data;
	ssize_t retval;
	enum ep0_state state;

	spin_lock_irq (&dev->lock);
	if (dev->state <= STATE_DEV_OPENED) {
		retval = -EINVAL;
		goto done;
	}

	/* report fd mode change before acting on it */
	if (dev->setup_abort) {
		dev->setup_abort = 0;
		retval = -EIDRM;
		goto done;
	}

	/* control DATA stage */
	if ((state = dev->state) == STATE_DEV_SETUP) {

		if (dev->setup_in) { /* stall IN */
			VDEBUG(dev, "ep0in stall\n");
			(void) usb_ep_set_halt (dev->gadget->ep0);
			retval = -EL2HLT;
			dev->state = STATE_DEV_CONNECTED;

		} else if (len == 0) { /* ack SET_CONFIGURATION etc */
struct usb_ep *ep = dev->gadget->ep0; struct usb_request *req = dev->req; if ((retval = setup_req (ep, req, 0)) == 0) { ++dev->udc_usage; spin_unlock_irq (&dev->lock); retval = usb_ep_queue (ep, req, GFP_KERNEL); spin_lock_irq (&dev->lock); --dev->udc_usage; } dev->state = STATE_DEV_CONNECTED; /* assume that was SET_CONFIGURATION */ if (dev->current_config) { unsigned power; if (gadget_is_dualspeed(dev->gadget) && (dev->gadget->speed == USB_SPEED_HIGH)) power = dev->hs_config->bMaxPower; else power = dev->config->bMaxPower; usb_gadget_vbus_draw(dev->gadget, 2 * power); } } else { /* collect OUT data */ if ((fd->f_flags & O_NONBLOCK) != 0 && !dev->setup_out_ready) { retval = -EAGAIN; goto done; } spin_unlock_irq (&dev->lock); retval = wait_event_interruptible (dev->wait, dev->setup_out_ready != 0); /* FIXME state could change from under us */ spin_lock_irq (&dev->lock); if (retval) goto done; if (dev->state != STATE_DEV_SETUP) { retval = -ECANCELED; goto done; } dev->state = STATE_DEV_CONNECTED; if (dev->setup_out_error) retval = -EIO; else { len = min (len, (size_t)dev->req->actual); ++dev->udc_usage; spin_unlock_irq(&dev->lock); if (copy_to_user (buf, dev->req->buf, len)) retval = -EFAULT; else retval = len; spin_lock_irq(&dev->lock); --dev->udc_usage; clean_req (dev->gadget->ep0, dev->req); /* NOTE userspace can't yet choose to stall */ } } goto done; } /* else normal: return event data */ if (len < sizeof dev->event [0]) { retval = -EINVAL; goto done; } len -= len % sizeof (struct usb_gadgetfs_event); dev->usermode_setup = 1; scan: /* return queued events right away */ if (dev->ev_next != 0) { unsigned i, n; n = len / sizeof (struct usb_gadgetfs_event); if (dev->ev_next < n) n = dev->ev_next; /* ep0 i/o has special semantics during STATE_DEV_SETUP */ for (i = 0; i < n; i++) { if (dev->event [i].type == GADGETFS_SETUP) { dev->state = STATE_DEV_SETUP; n = i + 1; break; } } spin_unlock_irq (&dev->lock); len = n * sizeof (struct usb_gadgetfs_event); if (copy_to_user 
(buf, &dev->event, len)) retval = -EFAULT; else retval = len; if (len > 0) { /* NOTE this doesn't guard against broken drivers; * concurrent ep0 readers may lose events. */ spin_lock_irq (&dev->lock); if (dev->ev_next > n) { memmove(&dev->event[0], &dev->event[n], sizeof (struct usb_gadgetfs_event) * (dev->ev_next - n)); } dev->ev_next -= n; spin_unlock_irq (&dev->lock); } return retval; } if (fd->f_flags & O_NONBLOCK) { retval = -EAGAIN; goto done; } switch (state) { default: DBG (dev, "fail %s, state %d\n", __func__, state); retval = -ESRCH; break; case STATE_DEV_UNCONNECTED: case STATE_DEV_CONNECTED: spin_unlock_irq (&dev->lock); DBG (dev, "%s wait\n", __func__); /* wait for events */ retval = wait_event_interruptible (dev->wait, dev->ev_next != 0); if (retval < 0) return retval; spin_lock_irq (&dev->lock); goto scan; } done: spin_unlock_irq (&dev->lock); return retval; } static struct usb_gadgetfs_event * next_event (struct dev_data *dev, enum usb_gadgetfs_event_type type) { struct usb_gadgetfs_event *event; unsigned i; switch (type) { /* these events purge the queue */ case GADGETFS_DISCONNECT: if (dev->state == STATE_DEV_SETUP) dev->setup_abort = 1; fallthrough; case GADGETFS_CONNECT: dev->ev_next = 0; break; case GADGETFS_SETUP: /* previous request timed out */ case GADGETFS_SUSPEND: /* same effect */ /* these events can't be repeated */ for (i = 0; i != dev->ev_next; i++) { if (dev->event [i].type != type) continue; DBG(dev, "discard old event[%d] %d\n", i, type); dev->ev_next--; if (i == dev->ev_next) break; /* indices start at zero, for simplicity */ memmove (&dev->event [i], &dev->event [i + 1], sizeof (struct usb_gadgetfs_event) * (dev->ev_next - i)); } break; default: BUG (); } VDEBUG(dev, "event[%d] = %d\n", dev->ev_next, type); event = &dev->event [dev->ev_next++]; BUG_ON (dev->ev_next > N_EVENT); memset (event, 0, sizeof *event); event->type = type; return event; } static ssize_t ep0_write (struct file *fd, const char __user *buf, size_t len, loff_t 
*ptr) { struct dev_data *dev = fd->private_data; ssize_t retval = -ESRCH; /* report fd mode change before acting on it */ if (dev->setup_abort) { dev->setup_abort = 0; retval = -EIDRM; /* data and/or status stage for control request */ } else if (dev->state == STATE_DEV_SETUP) { len = min_t(size_t, len, dev->setup_wLength); if (dev->setup_in) { retval = setup_req (dev->gadget->ep0, dev->req, len); if (retval == 0) { dev->state = STATE_DEV_CONNECTED; ++dev->udc_usage; spin_unlock_irq (&dev->lock); if (copy_from_user (dev->req->buf, buf, len)) retval = -EFAULT; else { if (len < dev->setup_wLength) dev->req->zero = 1; retval = usb_ep_queue ( dev->gadget->ep0, dev->req, GFP_KERNEL); } spin_lock_irq(&dev->lock); --dev->udc_usage; if (retval < 0) { clean_req (dev->gadget->ep0, dev->req); } else retval = len; return retval; } /* can stall some OUT transfers */ } else if (dev->setup_can_stall) { VDEBUG(dev, "ep0out stall\n"); (void) usb_ep_set_halt (dev->gadget->ep0); retval = -EL2HLT; dev->state = STATE_DEV_CONNECTED; } else { DBG(dev, "bogus ep0out stall!\n"); } } else DBG (dev, "fail %s, state %d\n", __func__, dev->state); return retval; } static int ep0_fasync (int f, struct file *fd, int on) { struct dev_data *dev = fd->private_data; // caller must F_SETOWN before signal delivery happens VDEBUG(dev, "%s %s\n", __func__, str_on_off(on)); return fasync_helper (f, fd, on, &dev->fasync); } static struct usb_gadget_driver gadgetfs_driver; static int dev_release (struct inode *inode, struct file *fd) { struct dev_data *dev = fd->private_data; /* closing ep0 === shutdown all */ if (dev->gadget_registered) { usb_gadget_unregister_driver (&gadgetfs_driver); dev->gadget_registered = false; } /* at this point "good" hardware has disconnected the * device from USB; the host won't see it any more. * alternatively, all host requests will time out. 
*/ kfree (dev->buf); dev->buf = NULL; /* other endpoints were all decoupled from this device */ spin_lock_irq(&dev->lock); dev->state = STATE_DEV_DISABLED; spin_unlock_irq(&dev->lock); put_dev (dev); return 0; } static __poll_t ep0_poll (struct file *fd, poll_table *wait) { struct dev_data *dev = fd->private_data; __poll_t mask = 0; if (dev->state <= STATE_DEV_OPENED) return DEFAULT_POLLMASK; poll_wait(fd, &dev->wait, wait); spin_lock_irq(&dev->lock); /* report fd mode change before acting on it */ if (dev->setup_abort) { dev->setup_abort = 0; mask = EPOLLHUP; goto out; } if (dev->state == STATE_DEV_SETUP) { if (dev->setup_in || dev->setup_can_stall) mask = EPOLLOUT; } else { if (dev->ev_next != 0) mask = EPOLLIN; } out: spin_unlock_irq(&dev->lock); return mask; } static long gadget_dev_ioctl (struct file *fd, unsigned code, unsigned long value) { struct dev_data *dev = fd->private_data; struct usb_gadget *gadget = dev->gadget; long ret = -ENOTTY; spin_lock_irq(&dev->lock); if (dev->state == STATE_DEV_OPENED || dev->state == STATE_DEV_UNBOUND) { /* Not bound to a UDC */ } else if (gadget->ops->ioctl) { ++dev->udc_usage; spin_unlock_irq(&dev->lock); ret = gadget->ops->ioctl (gadget, code, value); spin_lock_irq(&dev->lock); --dev->udc_usage; } spin_unlock_irq(&dev->lock); return ret; } /*----------------------------------------------------------------------*/ /* The in-kernel gadget driver handles most ep0 issues, in particular * enumerating the single configuration (as provided from user space). * * Unrecognized ep0 requests may be handled in user space. 
*/ static void make_qualifier (struct dev_data *dev) { struct usb_qualifier_descriptor qual; struct usb_device_descriptor *desc; qual.bLength = sizeof qual; qual.bDescriptorType = USB_DT_DEVICE_QUALIFIER; qual.bcdUSB = cpu_to_le16 (0x0200); desc = dev->dev; qual.bDeviceClass = desc->bDeviceClass; qual.bDeviceSubClass = desc->bDeviceSubClass; qual.bDeviceProtocol = desc->bDeviceProtocol; /* assumes ep0 uses the same value for both speeds ... */ qual.bMaxPacketSize0 = dev->gadget->ep0->maxpacket; qual.bNumConfigurations = 1; qual.bRESERVED = 0; memcpy (dev->rbuf, &qual, sizeof qual); } static int config_buf (struct dev_data *dev, u8 type, unsigned index) { int len; int hs = 0; /* only one configuration */ if (index > 0) return -EINVAL; if (gadget_is_dualspeed(dev->gadget)) { hs = (dev->gadget->speed == USB_SPEED_HIGH); if (type == USB_DT_OTHER_SPEED_CONFIG) hs = !hs; } if (hs) { dev->req->buf = dev->hs_config; len = le16_to_cpu(dev->hs_config->wTotalLength); } else { dev->req->buf = dev->config; len = le16_to_cpu(dev->config->wTotalLength); } ((u8 *)dev->req->buf) [1] = type; return len; } static int gadgetfs_setup (struct usb_gadget *gadget, const struct usb_ctrlrequest *ctrl) { struct dev_data *dev = get_gadget_data (gadget); struct usb_request *req = dev->req; int value = -EOPNOTSUPP; struct usb_gadgetfs_event *event; u16 w_value = le16_to_cpu(ctrl->wValue); u16 w_length = le16_to_cpu(ctrl->wLength); if (w_length > RBUF_SIZE) { if (ctrl->bRequestType & USB_DIR_IN) { /* Cast away the const, we are going to overwrite on purpose. 
*/ __le16 *temp = (__le16 *)&ctrl->wLength; *temp = cpu_to_le16(RBUF_SIZE); w_length = RBUF_SIZE; } else { return value; } } spin_lock (&dev->lock); dev->setup_abort = 0; if (dev->state == STATE_DEV_UNCONNECTED) { if (gadget_is_dualspeed(gadget) && gadget->speed == USB_SPEED_HIGH && dev->hs_config == NULL) { spin_unlock(&dev->lock); ERROR (dev, "no high speed config??\n"); return -EINVAL; } dev->state = STATE_DEV_CONNECTED; INFO (dev, "connected\n"); event = next_event (dev, GADGETFS_CONNECT); event->u.speed = gadget->speed; ep0_readable (dev); /* host may have given up waiting for response. we can miss control * requests handled lower down (device/endpoint status and features); * then ep0_{read,write} will report the wrong status. controller * driver will have aborted pending i/o. */ } else if (dev->state == STATE_DEV_SETUP) dev->setup_abort = 1; req->buf = dev->rbuf; req->context = NULL; switch (ctrl->bRequest) { case USB_REQ_GET_DESCRIPTOR: if (ctrl->bRequestType != USB_DIR_IN) goto unrecognized; switch (w_value >> 8) { case USB_DT_DEVICE: value = min (w_length, (u16) sizeof *dev->dev); dev->dev->bMaxPacketSize0 = dev->gadget->ep0->maxpacket; req->buf = dev->dev; break; case USB_DT_DEVICE_QUALIFIER: if (!dev->hs_config) break; value = min (w_length, (u16) sizeof (struct usb_qualifier_descriptor)); make_qualifier (dev); break; case USB_DT_OTHER_SPEED_CONFIG: case USB_DT_CONFIG: value = config_buf (dev, w_value >> 8, w_value & 0xff); if (value >= 0) value = min (w_length, (u16) value); break; case USB_DT_STRING: goto unrecognized; default: // all others are errors break; } break; /* currently one config, two speeds */ case USB_REQ_SET_CONFIGURATION: if (ctrl->bRequestType != 0) goto unrecognized; if (0 == (u8) w_value) { value = 0; dev->current_config = 0; usb_gadget_vbus_draw(gadget, 8 /* mA */ ); // user mode expected to disable endpoints } else { u8 config, power; if (gadget_is_dualspeed(gadget) && gadget->speed == USB_SPEED_HIGH) { config = 
dev->hs_config->bConfigurationValue; power = dev->hs_config->bMaxPower; } else { config = dev->config->bConfigurationValue; power = dev->config->bMaxPower; } if (config == (u8) w_value) { value = 0; dev->current_config = config; usb_gadget_vbus_draw(gadget, 2 * power); } } /* report SET_CONFIGURATION like any other control request, * except that usermode may not stall this. the next * request mustn't be allowed start until this finishes: * endpoints and threads set up, etc. * * NOTE: older PXA hardware (before PXA 255: without UDCCFR) * has bad/racey automagic that prevents synchronizing here. * even kernel mode drivers often miss them. */ if (value == 0) { INFO (dev, "configuration #%d\n", dev->current_config); usb_gadget_set_state(gadget, USB_STATE_CONFIGURED); if (dev->usermode_setup) { dev->setup_can_stall = 0; goto delegate; } } break; #ifndef CONFIG_USB_PXA25X /* PXA automagically handles this request too */ case USB_REQ_GET_CONFIGURATION: if (ctrl->bRequestType != 0x80) goto unrecognized; *(u8 *)req->buf = dev->current_config; value = min (w_length, (u16) 1); break; #endif default: unrecognized: VDEBUG (dev, "%s req%02x.%02x v%04x i%04x l%d\n", dev->usermode_setup ? "delegate" : "fail", ctrl->bRequestType, ctrl->bRequest, w_value, le16_to_cpu(ctrl->wIndex), w_length); /* if there's an ep0 reader, don't stall */ if (dev->usermode_setup) { dev->setup_can_stall = 1; delegate: dev->setup_in = (ctrl->bRequestType & USB_DIR_IN) ? 
1 : 0; dev->setup_wLength = w_length; dev->setup_out_ready = 0; dev->setup_out_error = 0; /* read DATA stage for OUT right away */ if (unlikely (!dev->setup_in && w_length)) { value = setup_req (gadget->ep0, dev->req, w_length); if (value < 0) break; ++dev->udc_usage; spin_unlock (&dev->lock); value = usb_ep_queue (gadget->ep0, dev->req, GFP_KERNEL); spin_lock (&dev->lock); --dev->udc_usage; if (value < 0) { clean_req (gadget->ep0, dev->req); break; } /* we can't currently stall these */ dev->setup_can_stall = 0; } /* state changes when reader collects event */ event = next_event (dev, GADGETFS_SETUP); event->u.setup = *ctrl; ep0_readable (dev); spin_unlock (&dev->lock); /* * Return USB_GADGET_DELAYED_STATUS as a workaround to * stop some UDC drivers (e.g. dwc3) from automatically * proceeding with the status stage for 0-length * transfers. * Should be removed once all UDC drivers are fixed to * always delay the status stage until a response is * queued to EP0. */ return w_length == 0 ? USB_GADGET_DELAYED_STATUS : 0; } } /* proceed with data transfer and status phases? 
*/ if (value >= 0 && dev->state != STATE_DEV_SETUP) { req->length = value; req->zero = value < w_length; ++dev->udc_usage; spin_unlock (&dev->lock); value = usb_ep_queue (gadget->ep0, req, GFP_KERNEL); spin_lock(&dev->lock); --dev->udc_usage; spin_unlock(&dev->lock); if (value < 0) { DBG (dev, "ep_queue --> %d\n", value); req->status = 0; } return value; } /* device stalls when value < 0 */ spin_unlock (&dev->lock); return value; } static void destroy_ep_files (struct dev_data *dev) { DBG (dev, "%s %d\n", __func__, dev->state); /* dev->state must prevent interference */ spin_lock_irq (&dev->lock); while (!list_empty(&dev->epfiles)) { struct ep_data *ep; struct inode *parent; struct dentry *dentry; /* break link to FS */ ep = list_first_entry (&dev->epfiles, struct ep_data, epfiles); list_del_init (&ep->epfiles); spin_unlock_irq (&dev->lock); dentry = ep->dentry; ep->dentry = NULL; parent = d_inode(dentry->d_parent); /* break link to controller */ mutex_lock(&ep->lock); if (ep->state == STATE_EP_ENABLED) (void) usb_ep_disable (ep->ep); ep->state = STATE_EP_UNBOUND; usb_ep_free_request (ep->ep, ep->req); ep->ep = NULL; mutex_unlock(&ep->lock); wake_up (&ep->wait); put_ep (ep); /* break link to dcache */ inode_lock(parent); d_delete (dentry); dput (dentry); inode_unlock(parent); spin_lock_irq (&dev->lock); } spin_unlock_irq (&dev->lock); } static struct dentry * gadgetfs_create_file (struct super_block *sb, char const *name, void *data, const struct file_operations *fops); static int activate_ep_files (struct dev_data *dev) { struct usb_ep *ep; struct ep_data *data; gadget_for_each_ep (ep, dev->gadget) { data = kzalloc(sizeof(*data), GFP_KERNEL); if (!data) goto enomem0; data->state = STATE_EP_DISABLED; mutex_init(&data->lock); init_waitqueue_head (&data->wait); strncpy (data->name, ep->name, sizeof (data->name) - 1); refcount_set (&data->count, 1); data->dev = dev; get_dev (dev); data->ep = ep; ep->driver_data = data; data->req = usb_ep_alloc_request (ep, 
GFP_KERNEL); if (!data->req) goto enomem1; data->dentry = gadgetfs_create_file (dev->sb, data->name, data, &ep_io_operations); if (!data->dentry) goto enomem2; list_add_tail (&data->epfiles, &dev->epfiles); } return 0; enomem2: usb_ep_free_request (ep, data->req); enomem1: put_dev (dev); kfree (data); enomem0: DBG (dev, "%s enomem\n", __func__); destroy_ep_files (dev); return -ENOMEM; } static void gadgetfs_unbind (struct usb_gadget *gadget) { struct dev_data *dev = get_gadget_data (gadget); DBG (dev, "%s\n", __func__); spin_lock_irq (&dev->lock); dev->state = STATE_DEV_UNBOUND; while (dev->udc_usage > 0) { spin_unlock_irq(&dev->lock); usleep_range(1000, 2000); spin_lock_irq(&dev->lock); } spin_unlock_irq (&dev->lock); destroy_ep_files (dev); gadget->ep0->driver_data = NULL; set_gadget_data (gadget, NULL); /* we've already been disconnected ... no i/o is active */ if (dev->req) usb_ep_free_request (gadget->ep0, dev->req); DBG (dev, "%s done\n", __func__); put_dev (dev); } static struct dev_data *the_device; static int gadgetfs_bind(struct usb_gadget *gadget, struct usb_gadget_driver *driver) { struct dev_data *dev = the_device; if (!dev) return -ESRCH; if (0 != strcmp (CHIP, gadget->name)) { pr_err("%s expected %s controller not %s\n", shortname, CHIP, gadget->name); return -ENODEV; } set_gadget_data (gadget, dev); dev->gadget = gadget; gadget->ep0->driver_data = dev; /* preallocate control response and buffer */ dev->req = usb_ep_alloc_request (gadget->ep0, GFP_KERNEL); if (!dev->req) goto enomem; dev->req->context = NULL; dev->req->complete = epio_complete; if (activate_ep_files (dev) < 0) goto enomem; INFO (dev, "bound to %s driver\n", gadget->name); spin_lock_irq(&dev->lock); dev->state = STATE_DEV_UNCONNECTED; spin_unlock_irq(&dev->lock); get_dev (dev); return 0; enomem: gadgetfs_unbind (gadget); return -ENOMEM; } static void gadgetfs_disconnect (struct usb_gadget *gadget) { struct dev_data *dev = get_gadget_data (gadget); unsigned long flags; 
spin_lock_irqsave (&dev->lock, flags); if (dev->state == STATE_DEV_UNCONNECTED) goto exit; dev->state = STATE_DEV_UNCONNECTED; INFO (dev, "disconnected\n"); next_event (dev, GADGETFS_DISCONNECT); ep0_readable (dev); exit: spin_unlock_irqrestore (&dev->lock, flags); } static void gadgetfs_suspend (struct usb_gadget *gadget) { struct dev_data *dev = get_gadget_data (gadget); unsigned long flags; INFO (dev, "suspended from state %d\n", dev->state); spin_lock_irqsave(&dev->lock, flags); switch (dev->state) { case STATE_DEV_SETUP: // VERY odd... host died?? case STATE_DEV_CONNECTED: case STATE_DEV_UNCONNECTED: next_event (dev, GADGETFS_SUSPEND); ep0_readable (dev); fallthrough; default: break; } spin_unlock_irqrestore(&dev->lock, flags); } static struct usb_gadget_driver gadgetfs_driver = { .function = (char *) driver_desc, .bind = gadgetfs_bind, .unbind = gadgetfs_unbind, .setup = gadgetfs_setup, .reset = gadgetfs_disconnect, .disconnect = gadgetfs_disconnect, .suspend = gadgetfs_suspend, .driver = { .name = shortname, }, }; /*----------------------------------------------------------------------*/ /* DEVICE INITIALIZATION * * fd = open ("/dev/gadget/$CHIP", O_RDWR) * status = write (fd, descriptors, sizeof descriptors) * * That write establishes the device configuration, so the kernel can * bind to the controller ... guaranteeing it can handle enumeration * at all necessary speeds. Descriptor order is: * * . message tag (u32, host order) ... for now, must be zero; it * would change to support features like multi-config devices * . full/low speed config ... all wTotalLength bytes (with interface, * class, altsetting, endpoint, and other descriptors) * . high speed config ... all descriptors, for high speed operation; * this one's optional except for high-speed hardware * . device descriptor * * Endpoints are not yet enabled. Drivers must wait until device * configuration and interface altsetting changes create * the need to configure (or unconfigure) them. 
* * After initialization, the device stays active for as long as that * $CHIP file is open. Events must then be read from that descriptor, * such as configuration notifications. */ static int is_valid_config(struct usb_config_descriptor *config, unsigned int total) { return config->bDescriptorType == USB_DT_CONFIG && config->bLength == USB_DT_CONFIG_SIZE && total >= USB_DT_CONFIG_SIZE && config->bConfigurationValue != 0 && (config->bmAttributes & USB_CONFIG_ATT_ONE) != 0 && (config->bmAttributes & USB_CONFIG_ATT_WAKEUP) == 0; /* FIXME if gadget->is_otg, _must_ include an otg descriptor */ /* FIXME check lengths: walk to end */ } static ssize_t dev_config (struct file *fd, const char __user *buf, size_t len, loff_t *ptr) { struct dev_data *dev = fd->private_data; ssize_t value, length = len; unsigned total; u32 tag; char *kbuf; spin_lock_irq(&dev->lock); if (dev->state > STATE_DEV_OPENED) { value = ep0_write(fd, buf, len, ptr); spin_unlock_irq(&dev->lock); return value; } spin_unlock_irq(&dev->lock); if ((len < (USB_DT_CONFIG_SIZE + USB_DT_DEVICE_SIZE + 4)) || (len > PAGE_SIZE * 4)) return -EINVAL; /* we might need to change message format someday */ if (copy_from_user (&tag, buf, 4)) return -EFAULT; if (tag != 0) return -EINVAL; buf += 4; length -= 4; kbuf = memdup_user(buf, length); if (IS_ERR(kbuf)) return PTR_ERR(kbuf); spin_lock_irq (&dev->lock); value = -EINVAL; if (dev->buf) { spin_unlock_irq(&dev->lock); kfree(kbuf); return value; } dev->buf = kbuf; /* full or low speed config */ dev->config = (void *) kbuf; total = le16_to_cpu(dev->config->wTotalLength); if (!is_valid_config(dev->config, total) || total > length - USB_DT_DEVICE_SIZE) goto fail; kbuf += total; length -= total; /* optional high speed config */ if (kbuf [1] == USB_DT_CONFIG) { dev->hs_config = (void *) kbuf; total = le16_to_cpu(dev->hs_config->wTotalLength); if (!is_valid_config(dev->hs_config, total) || total > length - USB_DT_DEVICE_SIZE) goto fail; kbuf += total; length -= total; } else { 
dev->hs_config = NULL; } /* could support multiple configs, using another encoding! */ /* device descriptor (tweaked for paranoia) */ if (length != USB_DT_DEVICE_SIZE) goto fail; dev->dev = (void *)kbuf; if (dev->dev->bLength != USB_DT_DEVICE_SIZE || dev->dev->bDescriptorType != USB_DT_DEVICE || dev->dev->bNumConfigurations != 1) goto fail; dev->dev->bcdUSB = cpu_to_le16 (0x0200); /* triggers gadgetfs_bind(); then we can enumerate. */ spin_unlock_irq (&dev->lock); if (dev->hs_config) gadgetfs_driver.max_speed = USB_SPEED_HIGH; else gadgetfs_driver.max_speed = USB_SPEED_FULL; value = usb_gadget_register_driver(&gadgetfs_driver); if (value != 0) { spin_lock_irq(&dev->lock); goto fail; } else { /* at this point "good" hardware has for the first time * let the USB the host see us. alternatively, if users * unplug/replug that will clear all the error state. * * note: everything running before here was guaranteed * to choke driver model style diagnostics. from here * on, they can work ... except in cleanup paths that * kick in after the ep0 descriptor is closed. 
*/ value = len; dev->gadget_registered = true; } return value; fail: dev->config = NULL; dev->hs_config = NULL; dev->dev = NULL; spin_unlock_irq (&dev->lock); pr_debug ("%s: %s fail %zd, %p\n", shortname, __func__, value, dev); kfree (dev->buf); dev->buf = NULL; return value; } static int gadget_dev_open (struct inode *inode, struct file *fd) { struct dev_data *dev = inode->i_private; int value = -EBUSY; spin_lock_irq(&dev->lock); if (dev->state == STATE_DEV_DISABLED) { dev->ev_next = 0; dev->state = STATE_DEV_OPENED; fd->private_data = dev; get_dev (dev); value = 0; } spin_unlock_irq(&dev->lock); return value; } static const struct file_operations ep0_operations = { .open = gadget_dev_open, .read = ep0_read, .write = dev_config, .fasync = ep0_fasync, .poll = ep0_poll, .unlocked_ioctl = gadget_dev_ioctl, .release = dev_release, }; /*----------------------------------------------------------------------*/ /* FILESYSTEM AND SUPERBLOCK OPERATIONS * * Mounting the filesystem creates a controller file, used first for * device configuration then later for event monitoring. */ /* FIXME PAM etc could set this security policy without mount options * if epfiles inherited ownership and permissons from ep0 ... */ static unsigned default_uid; static unsigned default_gid; static unsigned default_perm = S_IRUSR | S_IWUSR; module_param (default_uid, uint, 0644); module_param (default_gid, uint, 0644); module_param (default_perm, uint, 0644); static struct inode * gadgetfs_make_inode (struct super_block *sb, void *data, const struct file_operations *fops, int mode) { struct inode *inode = new_inode (sb); if (inode) { inode->i_ino = get_next_ino(); inode->i_mode = mode; inode->i_uid = make_kuid(&init_user_ns, default_uid); inode->i_gid = make_kgid(&init_user_ns, default_gid); simple_inode_init_ts(inode); inode->i_private = data; inode->i_fop = fops; } return inode; } /* creates in fs root directory, so non-renamable and non-linkable. 
* so inode and dentry are paired, until device reconfig. */ static struct dentry * gadgetfs_create_file (struct super_block *sb, char const *name, void *data, const struct file_operations *fops) { struct dentry *dentry; struct inode *inode; dentry = d_alloc_name(sb->s_root, name); if (!dentry) return NULL; inode = gadgetfs_make_inode (sb, data, fops, S_IFREG | (default_perm & S_IRWXUGO)); if (!inode) { dput(dentry); return NULL; } d_add (dentry, inode); return dentry; } static const struct super_operations gadget_fs_operations = { .statfs = simple_statfs, .drop_inode = generic_delete_inode, }; static int gadgetfs_fill_super (struct super_block *sb, struct fs_context *fc) { struct inode *inode; struct dev_data *dev; int rc; mutex_lock(&sb_mutex); if (the_device) { rc = -ESRCH; goto Done; } CHIP = usb_get_gadget_udc_name(); if (!CHIP) { rc = -ENODEV; goto Done; } /* superblock */ sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; sb->s_magic = GADGETFS_MAGIC; sb->s_op = &gadget_fs_operations; sb->s_time_gran = 1; /* root inode */ inode = gadgetfs_make_inode (sb, NULL, &simple_dir_operations, S_IFDIR | S_IRUGO | S_IXUGO); if (!inode) goto Enomem; inode->i_op = &simple_dir_inode_operations; if (!(sb->s_root = d_make_root (inode))) goto Enomem; /* the ep0 file is named after the controller we expect; * user mode code can use it for sanity checks, like we do. */ dev = dev_new (); if (!dev) goto Enomem; dev->sb = sb; dev->dentry = gadgetfs_create_file(sb, CHIP, dev, &ep0_operations); if (!dev->dentry) { put_dev(dev); goto Enomem; } /* other endpoint files are available after hardware setup, * from binding to a controller. 
*/ the_device = dev; rc = 0; goto Done; Enomem: kfree(CHIP); CHIP = NULL; rc = -ENOMEM; Done: mutex_unlock(&sb_mutex); return rc; } /* "mount -t gadgetfs path /dev/gadget" ends up here */ static int gadgetfs_get_tree(struct fs_context *fc) { return get_tree_single(fc, gadgetfs_fill_super); } static const struct fs_context_operations gadgetfs_context_ops = { .get_tree = gadgetfs_get_tree, }; static int gadgetfs_init_fs_context(struct fs_context *fc) { fc->ops = &gadgetfs_context_ops; return 0; } static void gadgetfs_kill_sb (struct super_block *sb) { mutex_lock(&sb_mutex); kill_litter_super (sb); if (the_device) { put_dev (the_device); the_device = NULL; } kfree(CHIP); CHIP = NULL; mutex_unlock(&sb_mutex); } /*----------------------------------------------------------------------*/ static struct file_system_type gadgetfs_type = { .owner = THIS_MODULE, .name = shortname, .init_fs_context = gadgetfs_init_fs_context, .kill_sb = gadgetfs_kill_sb, }; MODULE_ALIAS_FS("gadgetfs"); /*----------------------------------------------------------------------*/ static int __init gadgetfs_init (void) { int status; status = register_filesystem (&gadgetfs_type); if (status == 0) pr_info ("%s: %s, version " DRIVER_VERSION "\n", shortname, driver_desc); return status; } module_init (gadgetfs_init); static void __exit gadgetfs_cleanup (void) { pr_debug ("unregister %s\n", shortname); unregister_filesystem (&gadgetfs_type); } module_exit (gadgetfs_cleanup); |
165 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 | /* SPDX-License-Identifier: GPL-2.0+ */ #undef TRACE_SYSTEM #define TRACE_SYSTEM rseq #if !defined(_TRACE_RSEQ_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_RSEQ_H #include <linux/tracepoint.h> #include <linux/types.h> TRACE_EVENT(rseq_update, TP_PROTO(struct task_struct *t), TP_ARGS(t), TP_STRUCT__entry( __field(s32, cpu_id) __field(s32, node_id) __field(s32, mm_cid) ), TP_fast_assign( __entry->cpu_id = raw_smp_processor_id(); __entry->node_id = cpu_to_node(__entry->cpu_id); __entry->mm_cid = task_mm_cid(t); ), TP_printk("cpu_id=%d node_id=%d mm_cid=%d", __entry->cpu_id, __entry->node_id, __entry->mm_cid) ); TRACE_EVENT(rseq_ip_fixup, TP_PROTO(unsigned long regs_ip, unsigned long start_ip, unsigned long post_commit_offset, unsigned long abort_ip), TP_ARGS(regs_ip, start_ip, post_commit_offset, abort_ip), TP_STRUCT__entry( __field(unsigned long, regs_ip) __field(unsigned long, start_ip) __field(unsigned long, post_commit_offset) __field(unsigned long, abort_ip) ), TP_fast_assign( __entry->regs_ip = regs_ip; __entry->start_ip = start_ip; __entry->post_commit_offset = post_commit_offset; __entry->abort_ip = abort_ip; ), TP_printk("regs_ip=0x%lx start_ip=0x%lx post_commit_offset=%lu abort_ip=0x%lx", __entry->regs_ip, __entry->start_ip, __entry->post_commit_offset, __entry->abort_ip) ); #endif /* _TRACE_SOCK_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
37 37 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 | // SPDX-License-Identifier: GPL-2.0-only /* * CAIF Framing Layer. * * Copyright (C) ST-Ericsson AB 2010 * Author: Sjur Brendeland */ #define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__ #include <linux/stddef.h> #include <linux/spinlock.h> #include <linux/slab.h> #include <linux/crc-ccitt.h> #include <linux/netdevice.h> #include <net/caif/caif_layer.h> #include <net/caif/cfpkt.h> #include <net/caif/cffrml.h> #define container_obj(layr) container_of(layr, struct cffrml, layer) struct cffrml { struct cflayer layer; bool dofcs; /* !< FCS active */ int __percpu *pcpu_refcnt; }; static int cffrml_receive(struct cflayer *layr, struct cfpkt *pkt); static int cffrml_transmit(struct cflayer *layr, struct cfpkt *pkt); static void cffrml_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl, int phyid); static u32 cffrml_rcv_error; static u32 cffrml_rcv_checsum_error; struct cflayer *cffrml_create(u16 phyid, bool use_fcs) { struct cffrml *this = kzalloc(sizeof(struct cffrml), GFP_ATOMIC); if (!this) return NULL; this->pcpu_refcnt = alloc_percpu(int); if (this->pcpu_refcnt == NULL) { kfree(this); return NULL; } caif_assert(offsetof(struct cffrml, layer) == 0); this->layer.receive = cffrml_receive; this->layer.transmit = cffrml_transmit; this->layer.ctrlcmd = cffrml_ctrlcmd; 
snprintf(this->layer.name, CAIF_LAYER_NAME_SZ, "frm%d", phyid); this->dofcs = use_fcs; this->layer.id = phyid; return (struct cflayer *) this; } void cffrml_free(struct cflayer *layer) { struct cffrml *this = container_obj(layer); free_percpu(this->pcpu_refcnt); kfree(layer); } void cffrml_set_uplayer(struct cflayer *this, struct cflayer *up) { this->up = up; } void cffrml_set_dnlayer(struct cflayer *this, struct cflayer *dn) { this->dn = dn; } static u16 cffrml_checksum(u16 chks, void *buf, u16 len) { /* FIXME: FCS should be moved to glue in order to use OS-Specific * solutions */ return crc_ccitt(chks, buf, len); } static int cffrml_receive(struct cflayer *layr, struct cfpkt *pkt) { u16 tmp; u16 len; u16 hdrchks; int pktchks; struct cffrml *this; this = container_obj(layr); cfpkt_extr_head(pkt, &tmp, 2); len = le16_to_cpu(tmp); /* Subtract for FCS on length if FCS is not used. */ if (!this->dofcs) len -= 2; if (cfpkt_setlen(pkt, len) < 0) { ++cffrml_rcv_error; pr_err("Framing length error (%d)\n", len); cfpkt_destroy(pkt); return -EPROTO; } /* * Don't do extract if FCS is false, rather do setlen - then we don't * get a cache-miss. 
*/ if (this->dofcs) { cfpkt_extr_trail(pkt, &tmp, 2); hdrchks = le16_to_cpu(tmp); pktchks = cfpkt_iterate(pkt, cffrml_checksum, 0xffff); if (pktchks != hdrchks) { cfpkt_add_trail(pkt, &tmp, 2); ++cffrml_rcv_error; ++cffrml_rcv_checsum_error; pr_info("Frame checksum error (0x%x != 0x%x)\n", hdrchks, pktchks); return -EILSEQ; } } if (cfpkt_erroneous(pkt)) { ++cffrml_rcv_error; pr_err("Packet is erroneous!\n"); cfpkt_destroy(pkt); return -EPROTO; } if (layr->up == NULL) { pr_err("Layr up is missing!\n"); cfpkt_destroy(pkt); return -EINVAL; } return layr->up->receive(layr->up, pkt); } static int cffrml_transmit(struct cflayer *layr, struct cfpkt *pkt) { u16 chks; u16 len; __le16 data; struct cffrml *this = container_obj(layr); if (this->dofcs) { chks = cfpkt_iterate(pkt, cffrml_checksum, 0xffff); data = cpu_to_le16(chks); cfpkt_add_trail(pkt, &data, 2); } else { cfpkt_pad_trail(pkt, 2); } len = cfpkt_getlen(pkt); data = cpu_to_le16(len); cfpkt_add_head(pkt, &data, 2); cfpkt_info(pkt)->hdr_len += 2; if (cfpkt_erroneous(pkt)) { pr_err("Packet is erroneous!\n"); cfpkt_destroy(pkt); return -EPROTO; } if (layr->dn == NULL) { cfpkt_destroy(pkt); return -ENODEV; } return layr->dn->transmit(layr->dn, pkt); } static void cffrml_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl, int phyid) { if (layr->up && layr->up->ctrlcmd) layr->up->ctrlcmd(layr->up, ctrl, layr->id); } void cffrml_put(struct cflayer *layr) { struct cffrml *this = container_obj(layr); if (layr != NULL && this->pcpu_refcnt != NULL) this_cpu_dec(*this->pcpu_refcnt); } void cffrml_hold(struct cflayer *layr) { struct cffrml *this = container_obj(layr); if (layr != NULL && this->pcpu_refcnt != NULL) this_cpu_inc(*this->pcpu_refcnt); } int cffrml_refcnt_read(struct cflayer *layr) { int i, refcnt = 0; struct cffrml *this = container_obj(layr); for_each_possible_cpu(i) refcnt += *per_cpu_ptr(this->pcpu_refcnt, i); return refcnt; } |
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * net/sched/gen_estimator.c	Simple rate estimator.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *		Eric Dumazet <edumazet@google.com>
 *
 * Changes:
 *              Jamal Hadi Salim - moved it to net/core and reshuffled
 *              names to make it usable in general net subsystem.
 */

#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/seqlock.h>
#include <net/sock.h>
#include <net/gen_stats.h>

/* This code is NOT intended to be used for statistics collection,
 * its purpose is to provide a base for statistical multiplexing
 * for controlled load service.
 * If you need only statistics, run a user level daemon which
 * periodically reads byte counters.
 */

/*
 * Per-estimator state.
 *
 * bstats/cpu_bstats are the counters being sampled and stats_lock (may be
 * NULL) is taken around sampling. last_* hold the previous sample, avbps and
 * avpps the running averages, which are updated under @seq by the timer and
 * read under @seq by gen_estimator_read().
 */
struct net_rate_estimator {
	struct gnet_stats_basic_sync	*bstats;
	spinlock_t		*stats_lock;
	bool			running;
	struct gnet_stats_basic_sync __percpu *cpu_bstats;
	u8			ewma_log;
	u8			intvl_log; /* period : (250ms << intvl_log) */

	seqcount_t		seq;
	u64			last_packets;
	u64			last_bytes;

	u64			avpps;
	u64			avbps;

	unsigned long           next_jiffies;
	struct timer_list       timer;
	struct rcu_head		rcu;
};

/*
 * Snapshot the current byte/packet counters of @e into @b, holding the
 * estimator's stats lock around the read when one was supplied.
 */
static void est_fetch_counters(struct net_rate_estimator *e,
			       struct gnet_stats_basic_sync *b)
{
	gnet_stats_basic_sync_init(b);
	if (e->stats_lock)
		spin_lock(e->stats_lock);

	gnet_stats_add_basic(b, e->cpu_bstats, e->bstats, e->running);

	if (e->stats_lock)
		spin_unlock(e->stats_lock);

}

/*
 * Timer callback: take a new counter sample, fold the delta since the last
 * sample into the running averages and re-arm the timer for the next period.
 */
static void est_timer(struct timer_list *t)
{
	struct net_rate_estimator *est = from_timer(est, t, timer);
	struct gnet_stats_basic_sync b;
	u64 b_bytes, b_packets;
	u64 rate, brate;

	est_fetch_counters(est, &b);
	b_bytes = u64_stats_read(&b.bytes);
	b_packets = u64_stats_read(&b.packets);

	/*
	 * Scale the delta by the sampling period, then compute the weighted
	 * difference to the current average.
	 * NOTE(review): with a period of (250ms << intvl_log) and readers
	 * shifting the averages right by 8, the shift by (10 - intvl_log)
	 * looks like a per-second rate in 2^-8 fixed point - confirm.
	 */
	brate = (b_bytes - est->last_bytes) << (10 - est->intvl_log);
	brate = (brate >> est->ewma_log) - (est->avbps >> est->ewma_log);

	rate = (b_packets - est->last_packets) << (10 - est->intvl_log);
	rate = (rate >> est->ewma_log) - (est->avpps >> est->ewma_log);

	/* Publish both averages together so readers never see a mixed pair. */
	write_seqcount_begin(&est->seq);
	est->avbps += brate;
	est->avpps += rate;
	write_seqcount_end(&est->seq);

	est->last_bytes = b_bytes;
	est->last_packets = b_packets;

	est->next_jiffies += ((HZ/4) << est->intvl_log);

	if (unlikely(time_after_eq(jiffies, est->next_jiffies))) {
		/* Ouch... timer was delayed. */
		est->next_jiffies = jiffies + 1;
	}
	mod_timer(&est->timer, est->next_jiffies);
}

/**
 * gen_new_estimator - create a new rate estimator
 * @bstats: basic statistics
 * @cpu_bstats: bstats per cpu
 * @rate_est: rate estimator statistics
 * @lock: lock for statistics and control path
 * @running: true if @bstats represents a running qdisc, thus @bstats'
 *           internal values might change during basic reads. Only used
 *           if @cpu_bstats is NULL
 * @opt: rate estimator configuration TLV
 *
 * Creates a new rate estimator with &bstats as source and &rate_est
 * as destination. A new timer with the interval specified in the
 * configuration TLV is created. Upon each interval, the latest statistics
 * will be read from &bstats and the estimated rate will be stored in
 * &rate_est with the statistics lock grabbed during this period.
 *
 * If @rate_est already points to an estimator, its timer is stopped, its
 * current averages are carried over to the new estimator and the old one
 * is freed after an RCU grace period.
 *
 * Returns 0 on success or a negative error code.
 *
 */
int gen_new_estimator(struct gnet_stats_basic_sync *bstats,
		      struct gnet_stats_basic_sync __percpu *cpu_bstats,
		      struct net_rate_estimator __rcu **rate_est,
		      spinlock_t *lock,
		      bool running,
		      struct nlattr *opt)
{
	struct gnet_estimator *parm = nla_data(opt);
	struct net_rate_estimator *old, *est;
	struct gnet_stats_basic_sync b;
	int intvl_log;

	if (nla_len(opt) < sizeof(*parm))
		return -EINVAL;

	/* allowed timer periods are :
	 * -2 : 250ms,   -1 : 500ms,    0 : 1 sec
	 *  1 : 2 sec,    2 : 4 sec,    3 : 8 sec
	 */
	if (parm->interval < -2 || parm->interval > 3)
		return -EINVAL;

	if (parm->ewma_log == 0 || parm->ewma_log >= 31)
		return -EINVAL;

	est = kzalloc(sizeof(*est), GFP_KERNEL);
	if (!est)
		return -ENOBUFS;

	seqcount_init(&est->seq);
	/* Map the user-visible interval (-2..3) onto a shift count (0..5). */
	intvl_log = parm->interval + 2;
	est->bstats = bstats;
	est->stats_lock = lock;
	est->running  = running;
	est->ewma_log = parm->ewma_log;
	est->intvl_log = intvl_log;
	est->cpu_bstats = cpu_bstats;

	/*
	 * est_fetch_counters() takes the lock with plain spin_lock(), so
	 * bottom halves are disabled around it when a lock is in use.
	 */
	if (lock)
		local_bh_disable();
	est_fetch_counters(est, &b);
	if (lock)
		local_bh_enable();
	est->last_bytes = u64_stats_read(&b.bytes);
	est->last_packets = u64_stats_read(&b.packets);

	if (lock)
		spin_lock_bh(lock);
	old = rcu_dereference_protected(*rate_est, 1);
	if (old) {
		/* Stop the old timer before inheriting its averages. */
		timer_delete_sync(&old->timer);
		est->avbps = old->avbps;
		est->avpps = old->avpps;
	}

	est->next_jiffies = jiffies + ((HZ/4) << intvl_log);
	timer_setup(&est->timer, est_timer, 0);
	mod_timer(&est->timer, est->next_jiffies);

	rcu_assign_pointer(*rate_est, est);
	if (lock)
		spin_unlock_bh(lock);
	if (old)
		kfree_rcu(old, rcu);
	return 0;
}
EXPORT_SYMBOL(gen_new_estimator);

/**
 * gen_kill_estimator - remove a rate estimator
 * @rate_est: rate estimator
 *
 * Removes the rate estimator: the pointer is atomically replaced by NULL,
 * the timer is shut down and the estimator is freed after an RCU grace
 * period. Does nothing if no estimator is installed.
 *
 */
void gen_kill_estimator(struct net_rate_estimator __rcu **rate_est)
{
	struct net_rate_estimator *est;

	est = unrcu_pointer(xchg(rate_est, NULL));
	if (est) {
		timer_shutdown_sync(&est->timer);
		kfree_rcu(est, rcu);
	}
}
EXPORT_SYMBOL(gen_kill_estimator);

/**
 * gen_replace_estimator - replace rate estimator configuration
 * @bstats: basic statistics
 * @cpu_bstats: bstats per cpu
 * @rate_est: rate estimator statistics
 * @lock: lock for statistics and control path
 * @running: true if @bstats represents a running qdisc, thus @bstats'
 *           internal values might change during basic reads. Only used
 *           if @cpu_bstats is NULL
 * @opt: rate estimator configuration TLV
 *
 * Replaces the configuration of a rate estimator by calling
 * gen_new_estimator(), which itself retires any estimator that is already
 * installed in @rate_est.
 *
 * Returns 0 on success or a negative error code.
 */
int gen_replace_estimator(struct gnet_stats_basic_sync *bstats,
			  struct gnet_stats_basic_sync __percpu *cpu_bstats,
			  struct net_rate_estimator __rcu **rate_est,
			  spinlock_t *lock,
			  bool running, struct nlattr *opt)
{
	return gen_new_estimator(bstats, cpu_bstats, rate_est,
				 lock, running, opt);
}
EXPORT_SYMBOL(gen_replace_estimator);

/**
 * gen_estimator_active - test if estimator is currently in use
 * @rate_est: rate estimator
 *
 * Returns true if estimator is active, and false if not.
 */
bool gen_estimator_active(struct net_rate_estimator __rcu **rate_est)
{
	return !!rcu_access_pointer(*rate_est);
}
EXPORT_SYMBOL(gen_estimator_active);

/**
 * gen_estimator_read - read the current rate estimate
 * @rate_est: rate estimator
 * @sample: filled with the current byte and packet rate
 *
 * Reads both averages as a consistent pair under the estimator's seqcount,
 * shifting each right by 8 before storing it in @sample.
 *
 * Returns false if no estimator is installed (@sample is left untouched),
 * true otherwise.
 */
bool gen_estimator_read(struct net_rate_estimator __rcu **rate_est,
			struct gnet_stats_rate_est64 *sample)
{
	struct net_rate_estimator *est;
	unsigned seq;

	rcu_read_lock();
	est = rcu_dereference(*rate_est);
	if (!est) {
		rcu_read_unlock();
		return false;
	}

	/* Retry if the timer updated the averages while we were reading. */
	do {
		seq = read_seqcount_begin(&est->seq);
		sample->bps = est->avbps >> 8;
		sample->pps = est->avpps >> 8;
	} while (read_seqcount_retry(&est->seq, seq));

	rcu_read_unlock();
	return true;
}
EXPORT_SYMBOL(gen_estimator_read);
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * ALSA timer back-end using hrtimer
 * Copyright (C) 2008 Takashi Iwai
 */

#include <linux/init.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/hrtimer.h>
#include <sound/core.h>
#include <sound/timer.h>

MODULE_AUTHOR("Takashi Iwai <tiwai@suse.de>");
MODULE_DESCRIPTION("ALSA hrtimer backend");
MODULE_LICENSE("GPL");

MODULE_ALIAS("snd-timer-" __stringify(SNDRV_TIMER_GLOBAL_HRTIMER));

#define NANO_SEC	1000000000UL	/* nanoseconds per second (10^9) */
/* Length of one timer tick in nanoseconds; set from hrtimer_resolution. */
static unsigned int resolution;

/*
 * Per-open state linking an ALSA timer to its hrtimer.
 * in_callback is set while the hrtimer callback is delivering ticks (and
 * permanently on close) so that start/stop requests are ignored meanwhile.
 */
struct snd_hrtimer {
	struct snd_timer *timer;
	struct hrtimer hrt;
	bool in_callback;
};

/*
 * hrtimer expiry handler: report the elapsed ticks to the ALSA timer core
 * and restart the hrtimer if the ALSA timer is still running afterwards.
 */
static enum hrtimer_restart snd_hrtimer_callback(struct hrtimer *hrt)
{
	struct snd_hrtimer *stime = container_of(hrt, struct snd_hrtimer, hrt);
	struct snd_timer *t = stime->timer;
	ktime_t delta;
	unsigned long ticks;
	enum hrtimer_restart ret = HRTIMER_NORESTART;

	scoped_guard(spinlock, &t->lock) {
		if (!t->running)
			return HRTIMER_NORESTART; /* fast path */
		stime->in_callback = true;
		ticks = t->sticks;
	}

	/* calculate the drift */
	delta = ktime_sub(hrt->base->get_time(), hrtimer_get_expires(hrt));
	if (delta > 0)
		ticks += ktime_divns(delta, ticks * resolution);

	/* Called without t->lock held; the lock is re-taken below. */
	snd_timer_interrupt(stime->timer, ticks);

	/* Held until the function returns. */
	guard(spinlock)(&t->lock);
	if (t->running) {
		/* Advance the expiry by the (possibly updated) tick count. */
		hrtimer_add_expires_ns(hrt, t->sticks * resolution);
		ret = HRTIMER_RESTART;
	}

	stime->in_callback = false;
	return ret;
}

/*
 * Open callback: allocate the per-open state, set up the hrtimer
 * (CLOCK_MONOTONIC, relative mode) and attach the state to @t.
 * Returns 0 on success or -ENOMEM.
 */
static int snd_hrtimer_open(struct snd_timer *t)
{
	struct snd_hrtimer *stime;

	stime = kzalloc(sizeof(*stime), GFP_KERNEL);
	if (!stime)
		return -ENOMEM;
	stime->timer = t;
	hrtimer_setup(&stime->hrt, snd_hrtimer_callback, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	t->private_data = stime;
	return 0;
}

/*
 * Close callback: mark the timer stopped, block further start/stop requests,
 * cancel the hrtimer and free the per-open state. Always returns 0.
 */
static int snd_hrtimer_close(struct snd_timer *t)
{
	struct snd_hrtimer *stime = t->private_data;

	if (stime) {
		scoped_guard(spinlock_irq, &t->lock) {
			t->running = 0; /* just to be sure */
			stime->in_callback = 1; /* skip start/stop */
		}

		hrtimer_cancel(&stime->hrt);
		kfree(stime);
		t->private_data = NULL;
	}
	return 0;
}

/*
 * Start callback: arm the hrtimer for t->sticks ticks from now. Does nothing
 * while in_callback is set. Always returns 0.
 */
static int snd_hrtimer_start(struct snd_timer *t)
{
	struct snd_hrtimer *stime = t->private_data;

	if (stime->in_callback)
		return 0;
	hrtimer_start(&stime->hrt, ns_to_ktime(t->sticks * resolution),
		      HRTIMER_MODE_REL);
	return 0;
}

/*
 * Stop callback: try to cancel the hrtimer; the result of the attempt is not
 * checked. Does nothing while in_callback is set. Always returns 0.
 */
static int snd_hrtimer_stop(struct snd_timer *t)
{
	struct snd_hrtimer *stime = t->private_data;

	if (stime->in_callback)
		return 0;
	hrtimer_try_to_cancel(&stime->hrt);
	return 0;
}

/* Template copied into the registered timer by snd_hrtimer_init(). */
static const struct snd_timer_hardware hrtimer_hw __initconst = {
	.flags =	SNDRV_TIMER_HW_AUTO | SNDRV_TIMER_HW_WORK,
	.open =		snd_hrtimer_open,
	.close =	snd_hrtimer_close,
	.start =	snd_hrtimer_start,
	.stop =		snd_hrtimer_stop,
};

/*
 * entry functions
 */

/* The registered global timer, kept so module exit can free it. */
static struct snd_timer *mytimer;

/*
 * Module init: create the global "hrtimer" ALSA timer, fill in its hardware
 * description from hrtimer_hw and the hrtimer resolution, and register it.
 * Returns 0 on success or the negative error from creation/registration.
 */
static int __init snd_hrtimer_init(void)
{
	struct snd_timer *timer;
	int err;

	resolution = hrtimer_resolution;

	/* Create a new timer and set up the fields */
	err = snd_timer_global_new("hrtimer", SNDRV_TIMER_GLOBAL_HRTIMER,
				   &timer);
	if (err < 0)
		return err;

	timer->module = THIS_MODULE;
	strcpy(timer->name, "HR timer");
	timer->hw = hrtimer_hw;
	timer->hw.resolution = resolution;
	timer->hw.ticks = NANO_SEC / resolution;
	timer->max_instances = 100; /* lower the limit */

	err = snd_timer_global_register(timer);
	if (err < 0) {
		snd_timer_global_free(timer);
		return err;
	}
	mytimer = timer; /* remember this */

	return 0;
}

/* Module exit: free the global timer if init registered one. */
static void __exit snd_hrtimer_exit(void)
{
	if (mytimer) {
		snd_timer_global_free(mytimer);
		mytimer = NULL;
	}
}

module_init(snd_hrtimer_init);
module_exit(snd_hrtimer_exit);
33 33 33 4695 4691 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 | // SPDX-License-Identifier: GPL-2.0 /* * Block rq-qos policy for assigning an I/O priority class to requests. * * Using an rq-qos policy for assigning I/O priority class has two advantages * over using the ioprio_set() system call: * * - This policy is cgroup based so it has all the advantages of cgroups. * - While ioprio_set() does not affect page cache writeback I/O, this rq-qos * controller affects page cache writeback I/O for filesystems that support * assiociating a cgroup with writeback I/O. See also * Documentation/admin-guide/cgroup-v2.rst. */ #include <linux/blk-mq.h> #include <linux/blk_types.h> #include <linux/kernel.h> #include <linux/module.h> #include "blk-cgroup.h" #include "blk-ioprio.h" #include "blk-rq-qos.h" /** * enum prio_policy - I/O priority class policy. * @POLICY_NO_CHANGE: (default) do not modify the I/O priority class. * @POLICY_PROMOTE_TO_RT: modify no-IOPRIO_CLASS_RT to IOPRIO_CLASS_RT. * @POLICY_RESTRICT_TO_BE: modify IOPRIO_CLASS_NONE and IOPRIO_CLASS_RT into * IOPRIO_CLASS_BE. * @POLICY_ALL_TO_IDLE: change the I/O priority class into IOPRIO_CLASS_IDLE. * @POLICY_NONE_TO_RT: an alias for POLICY_PROMOTE_TO_RT. * * See also <linux/ioprio.h>. 
*/ enum prio_policy { POLICY_NO_CHANGE = 0, POLICY_PROMOTE_TO_RT = 1, POLICY_RESTRICT_TO_BE = 2, POLICY_ALL_TO_IDLE = 3, POLICY_NONE_TO_RT = 4, }; static const char *policy_name[] = { [POLICY_NO_CHANGE] = "no-change", [POLICY_PROMOTE_TO_RT] = "promote-to-rt", [POLICY_RESTRICT_TO_BE] = "restrict-to-be", [POLICY_ALL_TO_IDLE] = "idle", [POLICY_NONE_TO_RT] = "none-to-rt", }; static struct blkcg_policy ioprio_policy; /** * struct ioprio_blkcg - Per cgroup data. * @cpd: blkcg_policy_data structure. * @prio_policy: One of the IOPRIO_CLASS_* values. See also <linux/ioprio.h>. */ struct ioprio_blkcg { struct blkcg_policy_data cpd; enum prio_policy prio_policy; }; static struct ioprio_blkcg *blkcg_to_ioprio_blkcg(struct blkcg *blkcg) { return container_of(blkcg_to_cpd(blkcg, &ioprio_policy), struct ioprio_blkcg, cpd); } static struct ioprio_blkcg * ioprio_blkcg_from_css(struct cgroup_subsys_state *css) { return blkcg_to_ioprio_blkcg(css_to_blkcg(css)); } static int ioprio_show_prio_policy(struct seq_file *sf, void *v) { struct ioprio_blkcg *blkcg = ioprio_blkcg_from_css(seq_css(sf)); seq_printf(sf, "%s\n", policy_name[blkcg->prio_policy]); return 0; } static ssize_t ioprio_set_prio_policy(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct ioprio_blkcg *blkcg = ioprio_blkcg_from_css(of_css(of)); int ret; if (off != 0) return -EIO; /* kernfs_fop_write_iter() terminates 'buf' with '\0'. 
*/ ret = sysfs_match_string(policy_name, buf); if (ret < 0) return ret; blkcg->prio_policy = ret; return nbytes; } static struct blkcg_policy_data *ioprio_alloc_cpd(gfp_t gfp) { struct ioprio_blkcg *blkcg; blkcg = kzalloc(sizeof(*blkcg), gfp); if (!blkcg) return NULL; blkcg->prio_policy = POLICY_NO_CHANGE; return &blkcg->cpd; } static void ioprio_free_cpd(struct blkcg_policy_data *cpd) { struct ioprio_blkcg *blkcg = container_of(cpd, typeof(*blkcg), cpd); kfree(blkcg); } static struct cftype ioprio_files[] = { { .name = "prio.class", .seq_show = ioprio_show_prio_policy, .write = ioprio_set_prio_policy, }, { } /* sentinel */ }; static struct blkcg_policy ioprio_policy = { .dfl_cftypes = ioprio_files, .legacy_cftypes = ioprio_files, .cpd_alloc_fn = ioprio_alloc_cpd, .cpd_free_fn = ioprio_free_cpd, }; void blkcg_set_ioprio(struct bio *bio) { struct ioprio_blkcg *blkcg = blkcg_to_ioprio_blkcg(bio->bi_blkg->blkcg); u16 prio; if (!blkcg || blkcg->prio_policy == POLICY_NO_CHANGE) return; if (blkcg->prio_policy == POLICY_PROMOTE_TO_RT || blkcg->prio_policy == POLICY_NONE_TO_RT) { /* * For RT threads, the default priority level is 4 because * task_nice is 0. By promoting non-RT io-priority to RT-class * and default level 4, those requests that are already * RT-class but need a higher io-priority can use ioprio_set() * to achieve this. */ if (IOPRIO_PRIO_CLASS(bio->bi_ioprio) != IOPRIO_CLASS_RT) bio->bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 4); return; } /* * Except for IOPRIO_CLASS_NONE, higher I/O priority numbers * correspond to a lower priority. Hence, the max_t() below selects * the lower priority of bi_ioprio and the cgroup I/O priority class. * If the bio I/O priority equals IOPRIO_CLASS_NONE, the cgroup I/O * priority is assigned to the bio. 
*/ prio = max_t(u16, bio->bi_ioprio, IOPRIO_PRIO_VALUE(blkcg->prio_policy, 0)); if (prio > bio->bi_ioprio) bio->bi_ioprio = prio; } static int __init ioprio_init(void) { return blkcg_policy_register(&ioprio_policy); } static void __exit ioprio_exit(void) { blkcg_policy_unregister(&ioprio_policy); } module_init(ioprio_init); module_exit(ioprio_exit); |
6 90 90 90 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Crypto API wrapper for the generic SHA256 code from lib/crypto/sha256.c * * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com> * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk> * Copyright (c) 2002 James Morris <jmorris@intercode.com.au> * SHA224 Support Copyright 2007 Intel Corporation <jonathan.lynch@intel.com> */ #include <crypto/internal/hash.h> #include <linux/init.h> #include <linux/module.h> #include <linux/mm.h> #include <linux/types.h> #include <crypto/sha2.h> #include <crypto/sha256_base.h> #include <asm/byteorder.h> #include <linux/unaligned.h> const u8 sha224_zero_message_hash[SHA224_DIGEST_SIZE] = { 0xd1, 0x4a, 0x02, 0x8c, 0x2a, 0x3a, 0x2b, 0xc9, 0x47, 0x61, 0x02, 0xbb, 0x28, 0x82, 0x34, 0xc4, 0x15, 0xa2, 0xb0, 0x1f, 0x82, 0x8e, 0xa6, 0x2a, 0xc5, 0xb3, 0xe4, 0x2f }; EXPORT_SYMBOL_GPL(sha224_zero_message_hash); const u8 sha256_zero_message_hash[SHA256_DIGEST_SIZE] = { 0xe3, 0xb0, 0xc4, 0x42, 0x98, 0xfc, 0x1c, 0x14, 0x9a, 0xfb, 0xf4, 0xc8, 0x99, 0x6f, 0xb9, 0x24, 0x27, 0xae, 0x41, 0xe4, 0x64, 0x9b, 0x93, 0x4c, 0xa4, 0x95, 0x99, 0x1b, 0x78, 0x52, 0xb8, 0x55 }; EXPORT_SYMBOL_GPL(sha256_zero_message_hash); int crypto_sha256_update(struct shash_desc *desc, const u8 *data, unsigned int len) { sha256_update(shash_desc_ctx(desc), data, len); return 0; } EXPORT_SYMBOL(crypto_sha256_update); static int crypto_sha256_final(struct shash_desc *desc, u8 *out) { if (crypto_shash_digestsize(desc->tfm) == SHA224_DIGEST_SIZE) sha224_final(shash_desc_ctx(desc), out); else sha256_final(shash_desc_ctx(desc), out); return 0; } int crypto_sha256_finup(struct shash_desc *desc, 
const u8 *data, unsigned int len, u8 *hash) { sha256_update(shash_desc_ctx(desc), data, len); return crypto_sha256_final(desc, hash); } EXPORT_SYMBOL(crypto_sha256_finup); static struct shash_alg sha256_algs[2] = { { .digestsize = SHA256_DIGEST_SIZE, .init = sha256_base_init, .update = crypto_sha256_update, .final = crypto_sha256_final, .finup = crypto_sha256_finup, .descsize = sizeof(struct sha256_state), .base = { .cra_name = "sha256", .cra_driver_name= "sha256-generic", .cra_priority = 100, .cra_blocksize = SHA256_BLOCK_SIZE, .cra_module = THIS_MODULE, } }, { .digestsize = SHA224_DIGEST_SIZE, .init = sha224_base_init, .update = crypto_sha256_update, .final = crypto_sha256_final, .finup = crypto_sha256_finup, .descsize = sizeof(struct sha256_state), .base = { .cra_name = "sha224", .cra_driver_name= "sha224-generic", .cra_priority = 100, .cra_blocksize = SHA224_BLOCK_SIZE, .cra_module = THIS_MODULE, } } }; static int __init sha256_generic_mod_init(void) { return crypto_register_shashes(sha256_algs, ARRAY_SIZE(sha256_algs)); } static void __exit sha256_generic_mod_fini(void) { crypto_unregister_shashes(sha256_algs, ARRAY_SIZE(sha256_algs)); } subsys_initcall(sha256_generic_mod_init); module_exit(sha256_generic_mod_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("SHA-224 and SHA-256 Secure Hash Algorithm"); MODULE_ALIAS_CRYPTO("sha224"); MODULE_ALIAS_CRYPTO("sha224-generic"); MODULE_ALIAS_CRYPTO("sha256"); MODULE_ALIAS_CRYPTO("sha256-generic"); |
1082 140 15 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 | /* SPDX-License-Identifier: GPL-2.0 */ /* * This is <linux/capability.h> * * Andrew G. Morgan <morgan@kernel.org> * Alexander Kjeldaas <astor@guardian.no> * with help from Aleph1, Roland Buresund and Andrew Main. * * See here for the libcap library ("POSIX draft" compliance): * * ftp://www.kernel.org/pub/linux/libs/security/linux-privs/kernel-2.6/ */ #ifndef _LINUX_CAPABILITY_H #define _LINUX_CAPABILITY_H #include <uapi/linux/capability.h> #include <linux/uidgid.h> #include <linux/bits.h> #define _KERNEL_CAPABILITY_VERSION _LINUX_CAPABILITY_VERSION_3 extern int file_caps_enabled; typedef struct { u64 val; } kernel_cap_t; /* same as vfs_ns_cap_data but in cpu endian and always filled completely */ struct cpu_vfs_cap_data { __u32 magic_etc; kuid_t rootid; kernel_cap_t permitted; kernel_cap_t inheritable; }; #define _USER_CAP_HEADER_SIZE (sizeof(struct __user_cap_header_struct)) #define _KERNEL_CAP_T_SIZE (sizeof(kernel_cap_t)) struct file; struct inode; struct dentry; struct task_struct; struct user_namespace; struct mnt_idmap; /* * CAP_FS_MASK and CAP_NFSD_MASKS: * * The fs mask is all the privileges that fsuid==0 historically meant. * At one time in the past, that included CAP_MKNOD and CAP_LINUX_IMMUTABLE. 
* * It has never meant setting security.* and trusted.* xattrs. * * We could also define fsmask as follows: * 1. CAP_FS_MASK is the privilege to bypass all fs-related DAC permissions * 2. The security.* and trusted.* xattrs are fs-related MAC permissions */ # define CAP_FS_MASK (BIT_ULL(CAP_CHOWN) \ | BIT_ULL(CAP_MKNOD) \ | BIT_ULL(CAP_DAC_OVERRIDE) \ | BIT_ULL(CAP_DAC_READ_SEARCH) \ | BIT_ULL(CAP_FOWNER) \ | BIT_ULL(CAP_FSETID) \ | BIT_ULL(CAP_MAC_OVERRIDE)) #define CAP_VALID_MASK (BIT_ULL(CAP_LAST_CAP+1)-1) # define CAP_EMPTY_SET ((kernel_cap_t) { 0 }) # define CAP_FULL_SET ((kernel_cap_t) { CAP_VALID_MASK }) # define CAP_FS_SET ((kernel_cap_t) { CAP_FS_MASK | BIT_ULL(CAP_LINUX_IMMUTABLE) }) # define CAP_NFSD_SET ((kernel_cap_t) { CAP_FS_MASK | BIT_ULL(CAP_SYS_RESOURCE) }) # define cap_clear(c) do { (c).val = 0; } while (0) #define cap_raise(c, flag) ((c).val |= BIT_ULL(flag)) #define cap_lower(c, flag) ((c).val &= ~BIT_ULL(flag)) #define cap_raised(c, flag) (((c).val & BIT_ULL(flag)) != 0) static inline kernel_cap_t cap_combine(const kernel_cap_t a, const kernel_cap_t b) { return (kernel_cap_t) { a.val | b.val }; } static inline kernel_cap_t cap_intersect(const kernel_cap_t a, const kernel_cap_t b) { return (kernel_cap_t) { a.val & b.val }; } static inline kernel_cap_t cap_drop(const kernel_cap_t a, const kernel_cap_t drop) { return (kernel_cap_t) { a.val &~ drop.val }; } static inline bool cap_isclear(const kernel_cap_t a) { return !a.val; } static inline bool cap_isidentical(const kernel_cap_t a, const kernel_cap_t b) { return a.val == b.val; } /* * Check if "a" is a subset of "set". 
* return true if ALL of the capabilities in "a" are also in "set" * cap_issubset(0101, 1111) will return true * return false if ANY of the capabilities in "a" are not in "set" * cap_issubset(1111, 0101) will return false */ static inline bool cap_issubset(const kernel_cap_t a, const kernel_cap_t set) { return !(a.val & ~set.val); } /* Used to decide between falling back on the old suser() or fsuser(). */ static inline kernel_cap_t cap_drop_fs_set(const kernel_cap_t a) { return cap_drop(a, CAP_FS_SET); } static inline kernel_cap_t cap_raise_fs_set(const kernel_cap_t a, const kernel_cap_t permitted) { return cap_combine(a, cap_intersect(permitted, CAP_FS_SET)); } static inline kernel_cap_t cap_drop_nfsd_set(const kernel_cap_t a) { return cap_drop(a, CAP_NFSD_SET); } static inline kernel_cap_t cap_raise_nfsd_set(const kernel_cap_t a, const kernel_cap_t permitted) { return cap_combine(a, cap_intersect(permitted, CAP_NFSD_SET)); } #ifdef CONFIG_MULTIUSER extern bool has_ns_capability(struct task_struct *t, struct user_namespace *ns, int cap); extern bool has_capability_noaudit(struct task_struct *t, int cap); extern bool has_ns_capability_noaudit(struct task_struct *t, struct user_namespace *ns, int cap); extern bool capable(int cap); extern bool ns_capable(struct user_namespace *ns, int cap); extern bool ns_capable_noaudit(struct user_namespace *ns, int cap); extern bool ns_capable_setid(struct user_namespace *ns, int cap); #else static inline bool has_ns_capability(struct task_struct *t, struct user_namespace *ns, int cap) { return true; } static inline bool has_capability_noaudit(struct task_struct *t, int cap) { return true; } static inline bool has_ns_capability_noaudit(struct task_struct *t, struct user_namespace *ns, int cap) { return true; } static inline bool capable(int cap) { return true; } static inline bool ns_capable(struct user_namespace *ns, int cap) { return true; } static inline bool ns_capable_noaudit(struct user_namespace *ns, int cap) { return true; 
} static inline bool ns_capable_setid(struct user_namespace *ns, int cap) { return true; } #endif /* CONFIG_MULTIUSER */ bool privileged_wrt_inode_uidgid(struct user_namespace *ns, struct mnt_idmap *idmap, const struct inode *inode); bool capable_wrt_inode_uidgid(struct mnt_idmap *idmap, const struct inode *inode, int cap); extern bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap); extern bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns); static inline bool perfmon_capable(void) { return capable(CAP_PERFMON) || capable(CAP_SYS_ADMIN); } static inline bool bpf_capable(void) { return capable(CAP_BPF) || capable(CAP_SYS_ADMIN); } static inline bool checkpoint_restore_ns_capable(struct user_namespace *ns) { return ns_capable(ns, CAP_CHECKPOINT_RESTORE) || ns_capable(ns, CAP_SYS_ADMIN); } /* audit system wants to get cap info from files as well */ int get_vfs_caps_from_disk(struct mnt_idmap *idmap, const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps); int cap_convert_nscap(struct mnt_idmap *idmap, struct dentry *dentry, const void **ivalue, size_t size); #endif /* !_LINUX_CAPABILITY_H */ |
56 4 4 4 4 4 379 379 62 62 25 25 25 25 25 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 
517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 | // SPDX-License-Identifier: GPL-2.0-only /* * count the number of connections matching an arbitrary key. * * (C) 2017 Red Hat GmbH * Author: Florian Westphal <fw@strlen.de> * * split from xt_connlimit.c: * (c) 2000 Gerd Knorr <kraxel@bytesex.org> * Nov 2002: Martin Bene <martin.bene@icomedias.com>: * only ignore TIME_WAIT or gone connections * (C) CC Computer Consultants GmbH, 2007 */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/in.h> #include <linux/in6.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/jhash.h> #include <linux/slab.h> #include <linux/list.h> #include <linux/rbtree.h> #include <linux/module.h> #include <linux/random.h> #include <linux/skbuff.h> #include <linux/spinlock.h> #include <linux/netfilter/nf_conntrack_tcp.h> #include <linux/netfilter/x_tables.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_count.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_tuple.h> #include <net/netfilter/nf_conntrack_zones.h> #define CONNCOUNT_SLOTS 256U #define CONNCOUNT_GC_MAX_NODES 8 #define MAX_KEYLEN 5 /* we will save the tuples of all connections we care about */ struct nf_conncount_tuple { struct list_head node; struct nf_conntrack_tuple tuple; struct nf_conntrack_zone zone; int cpu; u32 jiffies32; }; struct nf_conncount_rb { struct rb_node node; struct nf_conncount_list list; u32 key[MAX_KEYLEN]; struct rcu_head rcu_head; }; static spinlock_t nf_conncount_locks[CONNCOUNT_SLOTS] __cacheline_aligned_in_smp; struct nf_conncount_data { unsigned 
int keylen; struct rb_root root[CONNCOUNT_SLOTS]; struct net *net; struct work_struct gc_work; unsigned long pending_trees[BITS_TO_LONGS(CONNCOUNT_SLOTS)]; unsigned int gc_tree; }; static u_int32_t conncount_rnd __read_mostly; static struct kmem_cache *conncount_rb_cachep __read_mostly; static struct kmem_cache *conncount_conn_cachep __read_mostly; static inline bool already_closed(const struct nf_conn *conn) { if (nf_ct_protonum(conn) == IPPROTO_TCP) return conn->proto.tcp.state == TCP_CONNTRACK_TIME_WAIT || conn->proto.tcp.state == TCP_CONNTRACK_CLOSE; else return false; } static int key_diff(const u32 *a, const u32 *b, unsigned int klen) { return memcmp(a, b, klen * sizeof(u32)); } static void conn_free(struct nf_conncount_list *list, struct nf_conncount_tuple *conn) { lockdep_assert_held(&list->list_lock); list->count--; list_del(&conn->node); kmem_cache_free(conncount_conn_cachep, conn); } static const struct nf_conntrack_tuple_hash * find_or_evict(struct net *net, struct nf_conncount_list *list, struct nf_conncount_tuple *conn) { const struct nf_conntrack_tuple_hash *found; unsigned long a, b; int cpu = raw_smp_processor_id(); u32 age; found = nf_conntrack_find_get(net, &conn->zone, &conn->tuple); if (found) return found; b = conn->jiffies32; a = (u32)jiffies; /* conn might have been added just before by another cpu and * might still be unconfirmed. In this case, nf_conntrack_find() * returns no result. Thus only evict if this cpu added the * stale entry or if the entry is older than two jiffies. 
*/ age = a - b; if (conn->cpu == cpu || age >= 2) { conn_free(list, conn); return ERR_PTR(-ENOENT); } return ERR_PTR(-EAGAIN); } static int __nf_conncount_add(struct net *net, struct nf_conncount_list *list, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_zone *zone) { const struct nf_conntrack_tuple_hash *found; struct nf_conncount_tuple *conn, *conn_n; struct nf_conn *found_ct; unsigned int collect = 0; if ((u32)jiffies == list->last_gc) goto add_new_node; /* check the saved connections */ list_for_each_entry_safe(conn, conn_n, &list->head, node) { if (collect > CONNCOUNT_GC_MAX_NODES) break; found = find_or_evict(net, list, conn); if (IS_ERR(found)) { /* Not found, but might be about to be confirmed */ if (PTR_ERR(found) == -EAGAIN) { if (nf_ct_tuple_equal(&conn->tuple, tuple) && nf_ct_zone_id(&conn->zone, conn->zone.dir) == nf_ct_zone_id(zone, zone->dir)) return 0; /* already exists */ } else { collect++; } continue; } found_ct = nf_ct_tuplehash_to_ctrack(found); if (nf_ct_tuple_equal(&conn->tuple, tuple) && nf_ct_zone_equal(found_ct, zone, zone->dir)) { /* * We should not see tuples twice unless someone hooks * this into a table without "-p tcp --syn". * * Attempt to avoid a re-add in this case. 
*/ nf_ct_put(found_ct); return 0; } else if (already_closed(found_ct)) { /* * we do not care about connections which are * closed already -> ditch it */ nf_ct_put(found_ct); conn_free(list, conn); collect++; continue; } nf_ct_put(found_ct); } add_new_node: if (WARN_ON_ONCE(list->count > INT_MAX)) return -EOVERFLOW; conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC); if (conn == NULL) return -ENOMEM; conn->tuple = *tuple; conn->zone = *zone; conn->cpu = raw_smp_processor_id(); conn->jiffies32 = (u32)jiffies; list_add_tail(&conn->node, &list->head); list->count++; list->last_gc = (u32)jiffies; return 0; } int nf_conncount_add(struct net *net, struct nf_conncount_list *list, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_zone *zone) { int ret; /* check the saved connections */ spin_lock_bh(&list->list_lock); ret = __nf_conncount_add(net, list, tuple, zone); spin_unlock_bh(&list->list_lock); return ret; } EXPORT_SYMBOL_GPL(nf_conncount_add); void nf_conncount_list_init(struct nf_conncount_list *list) { spin_lock_init(&list->list_lock); INIT_LIST_HEAD(&list->head); list->count = 0; list->last_gc = (u32)jiffies; } EXPORT_SYMBOL_GPL(nf_conncount_list_init); /* Return true if the list is empty. Must be called with BH disabled. 
*/ bool nf_conncount_gc_list(struct net *net, struct nf_conncount_list *list) { const struct nf_conntrack_tuple_hash *found; struct nf_conncount_tuple *conn, *conn_n; struct nf_conn *found_ct; unsigned int collected = 0; bool ret = false; /* don't bother if we just did GC */ if ((u32)jiffies == READ_ONCE(list->last_gc)) return false; /* don't bother if other cpu is already doing GC */ if (!spin_trylock(&list->list_lock)) return false; list_for_each_entry_safe(conn, conn_n, &list->head, node) { found = find_or_evict(net, list, conn); if (IS_ERR(found)) { if (PTR_ERR(found) == -ENOENT) collected++; continue; } found_ct = nf_ct_tuplehash_to_ctrack(found); if (already_closed(found_ct)) { /* * we do not care about connections which are * closed already -> ditch it */ nf_ct_put(found_ct); conn_free(list, conn); collected++; continue; } nf_ct_put(found_ct); if (collected > CONNCOUNT_GC_MAX_NODES) break; } if (!list->count) ret = true; list->last_gc = (u32)jiffies; spin_unlock(&list->list_lock); return ret; } EXPORT_SYMBOL_GPL(nf_conncount_gc_list); static void __tree_nodes_free(struct rcu_head *h) { struct nf_conncount_rb *rbconn; rbconn = container_of(h, struct nf_conncount_rb, rcu_head); kmem_cache_free(conncount_rb_cachep, rbconn); } /* caller must hold tree nf_conncount_locks[] lock */ static void tree_nodes_free(struct rb_root *root, struct nf_conncount_rb *gc_nodes[], unsigned int gc_count) { struct nf_conncount_rb *rbconn; while (gc_count) { rbconn = gc_nodes[--gc_count]; spin_lock(&rbconn->list.list_lock); if (!rbconn->list.count) { rb_erase(&rbconn->node, root); call_rcu(&rbconn->rcu_head, __tree_nodes_free); } spin_unlock(&rbconn->list.list_lock); } } static void schedule_gc_worker(struct nf_conncount_data *data, int tree) { set_bit(tree, data->pending_trees); schedule_work(&data->gc_work); } static unsigned int insert_tree(struct net *net, struct nf_conncount_data *data, struct rb_root *root, unsigned int hash, const u32 *key, const struct nf_conntrack_tuple 
*tuple, const struct nf_conntrack_zone *zone) { struct nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES]; struct rb_node **rbnode, *parent; struct nf_conncount_rb *rbconn; struct nf_conncount_tuple *conn; unsigned int count = 0, gc_count = 0; bool do_gc = true; spin_lock_bh(&nf_conncount_locks[hash]); restart: parent = NULL; rbnode = &(root->rb_node); while (*rbnode) { int diff; rbconn = rb_entry(*rbnode, struct nf_conncount_rb, node); parent = *rbnode; diff = key_diff(key, rbconn->key, data->keylen); if (diff < 0) { rbnode = &((*rbnode)->rb_left); } else if (diff > 0) { rbnode = &((*rbnode)->rb_right); } else { int ret; ret = nf_conncount_add(net, &rbconn->list, tuple, zone); if (ret) count = 0; /* hotdrop */ else count = rbconn->list.count; tree_nodes_free(root, gc_nodes, gc_count); goto out_unlock; } if (gc_count >= ARRAY_SIZE(gc_nodes)) continue; if (do_gc && nf_conncount_gc_list(net, &rbconn->list)) gc_nodes[gc_count++] = rbconn; } if (gc_count) { tree_nodes_free(root, gc_nodes, gc_count); schedule_gc_worker(data, hash); gc_count = 0; do_gc = false; goto restart; } /* expected case: match, insert new node */ rbconn = kmem_cache_alloc(conncount_rb_cachep, GFP_ATOMIC); if (rbconn == NULL) goto out_unlock; conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC); if (conn == NULL) { kmem_cache_free(conncount_rb_cachep, rbconn); goto out_unlock; } conn->tuple = *tuple; conn->zone = *zone; conn->cpu = raw_smp_processor_id(); conn->jiffies32 = (u32)jiffies; memcpy(rbconn->key, key, sizeof(u32) * data->keylen); nf_conncount_list_init(&rbconn->list); list_add(&conn->node, &rbconn->list.head); count = 1; rbconn->list.count = count; rb_link_node_rcu(&rbconn->node, parent, rbnode); rb_insert_color(&rbconn->node, root); out_unlock: spin_unlock_bh(&nf_conncount_locks[hash]); return count; } static unsigned int count_tree(struct net *net, struct nf_conncount_data *data, const u32 *key, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_zone *zone) { struct 
rb_root *root; struct rb_node *parent; struct nf_conncount_rb *rbconn; unsigned int hash; hash = jhash2(key, data->keylen, conncount_rnd) % CONNCOUNT_SLOTS; root = &data->root[hash]; parent = rcu_dereference_raw(root->rb_node); while (parent) { int diff; rbconn = rb_entry(parent, struct nf_conncount_rb, node); diff = key_diff(key, rbconn->key, data->keylen); if (diff < 0) { parent = rcu_dereference_raw(parent->rb_left); } else if (diff > 0) { parent = rcu_dereference_raw(parent->rb_right); } else { int ret; if (!tuple) { nf_conncount_gc_list(net, &rbconn->list); return rbconn->list.count; } spin_lock_bh(&rbconn->list.list_lock); /* Node might be about to be free'd. * We need to defer to insert_tree() in this case. */ if (rbconn->list.count == 0) { spin_unlock_bh(&rbconn->list.list_lock); break; } /* same source network -> be counted! */ ret = __nf_conncount_add(net, &rbconn->list, tuple, zone); spin_unlock_bh(&rbconn->list.list_lock); if (ret) return 0; /* hotdrop */ else return rbconn->list.count; } } if (!tuple) return 0; return insert_tree(net, data, root, hash, key, tuple, zone); } static void tree_gc_worker(struct work_struct *work) { struct nf_conncount_data *data = container_of(work, struct nf_conncount_data, gc_work); struct nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES], *rbconn; struct rb_root *root; struct rb_node *node; unsigned int tree, next_tree, gc_count = 0; tree = data->gc_tree % CONNCOUNT_SLOTS; root = &data->root[tree]; local_bh_disable(); rcu_read_lock(); for (node = rb_first(root); node != NULL; node = rb_next(node)) { rbconn = rb_entry(node, struct nf_conncount_rb, node); if (nf_conncount_gc_list(data->net, &rbconn->list)) gc_count++; } rcu_read_unlock(); local_bh_enable(); cond_resched(); spin_lock_bh(&nf_conncount_locks[tree]); if (gc_count < ARRAY_SIZE(gc_nodes)) goto next; /* do not bother */ gc_count = 0; node = rb_first(root); while (node != NULL) { rbconn = rb_entry(node, struct nf_conncount_rb, node); node = rb_next(node); if 
(rbconn->list.count > 0) continue; gc_nodes[gc_count++] = rbconn; if (gc_count >= ARRAY_SIZE(gc_nodes)) { tree_nodes_free(root, gc_nodes, gc_count); gc_count = 0; } } tree_nodes_free(root, gc_nodes, gc_count); next: clear_bit(tree, data->pending_trees); next_tree = (tree + 1) % CONNCOUNT_SLOTS; next_tree = find_next_bit(data->pending_trees, CONNCOUNT_SLOTS, next_tree); if (next_tree < CONNCOUNT_SLOTS) { data->gc_tree = next_tree; schedule_work(work); } spin_unlock_bh(&nf_conncount_locks[tree]); } /* Count and return number of conntrack entries in 'net' with particular 'key'. * If 'tuple' is not null, insert it into the accounting data structure. * Call with RCU read lock. */ unsigned int nf_conncount_count(struct net *net, struct nf_conncount_data *data, const u32 *key, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_zone *zone) { return count_tree(net, data, key, tuple, zone); } EXPORT_SYMBOL_GPL(nf_conncount_count); struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int keylen) { struct nf_conncount_data *data; int i; if (keylen % sizeof(u32) || keylen / sizeof(u32) > MAX_KEYLEN || keylen == 0) return ERR_PTR(-EINVAL); net_get_random_once(&conncount_rnd, sizeof(conncount_rnd)); data = kmalloc(sizeof(*data), GFP_KERNEL); if (!data) return ERR_PTR(-ENOMEM); for (i = 0; i < ARRAY_SIZE(data->root); ++i) data->root[i] = RB_ROOT; data->keylen = keylen / sizeof(u32); data->net = net; INIT_WORK(&data->gc_work, tree_gc_worker); return data; } EXPORT_SYMBOL_GPL(nf_conncount_init); void nf_conncount_cache_free(struct nf_conncount_list *list) { struct nf_conncount_tuple *conn, *conn_n; list_for_each_entry_safe(conn, conn_n, &list->head, node) kmem_cache_free(conncount_conn_cachep, conn); } EXPORT_SYMBOL_GPL(nf_conncount_cache_free); static void destroy_tree(struct rb_root *r) { struct nf_conncount_rb *rbconn; struct rb_node *node; while ((node = rb_first(r)) != NULL) { rbconn = rb_entry(node, struct nf_conncount_rb, node); rb_erase(node, 
r); nf_conncount_cache_free(&rbconn->list); kmem_cache_free(conncount_rb_cachep, rbconn); } } void nf_conncount_destroy(struct net *net, struct nf_conncount_data *data) { unsigned int i; cancel_work_sync(&data->gc_work); for (i = 0; i < ARRAY_SIZE(data->root); ++i) destroy_tree(&data->root[i]); kfree(data); } EXPORT_SYMBOL_GPL(nf_conncount_destroy); static int __init nf_conncount_modinit(void) { int i; for (i = 0; i < CONNCOUNT_SLOTS; ++i) spin_lock_init(&nf_conncount_locks[i]); conncount_conn_cachep = KMEM_CACHE(nf_conncount_tuple, 0); if (!conncount_conn_cachep) return -ENOMEM; conncount_rb_cachep = KMEM_CACHE(nf_conncount_rb, 0); if (!conncount_rb_cachep) { kmem_cache_destroy(conncount_conn_cachep); return -ENOMEM; } return 0; } static void __exit nf_conncount_modexit(void) { kmem_cache_destroy(conncount_conn_cachep); kmem_cache_destroy(conncount_rb_cachep); } module_init(nf_conncount_modinit); module_exit(nf_conncount_modexit); MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>"); MODULE_AUTHOR("Florian Westphal <fw@strlen.de>"); MODULE_DESCRIPTION("netfilter: count number of connections matching a key"); MODULE_LICENSE("GPL"); |
3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 
527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 | #ifndef _NET_FLOW_OFFLOAD_H #define _NET_FLOW_OFFLOAD_H #include <linux/kernel.h> #include <linux/list.h> #include <linux/netlink.h> #include <net/flow_dissector.h> struct flow_match { struct flow_dissector *dissector; void *mask; void *key; }; struct flow_match_meta { struct flow_dissector_key_meta *key, *mask; }; struct flow_match_basic { struct flow_dissector_key_basic *key, *mask; }; struct flow_match_control { struct flow_dissector_key_control *key, *mask; }; struct flow_match_eth_addrs { struct flow_dissector_key_eth_addrs *key, *mask; }; struct flow_match_vlan { struct flow_dissector_key_vlan *key, *mask; }; struct flow_match_arp { struct flow_dissector_key_arp *key, *mask; }; struct flow_match_ipv4_addrs { struct flow_dissector_key_ipv4_addrs *key, *mask; }; struct flow_match_ipv6_addrs { struct flow_dissector_key_ipv6_addrs *key, *mask; }; struct flow_match_ip { struct flow_dissector_key_ip *key, *mask; }; struct flow_match_ports { struct flow_dissector_key_ports *key, *mask; }; struct flow_match_ports_range { struct flow_dissector_key_ports_range *key, *mask; }; struct flow_match_icmp { struct 
flow_dissector_key_icmp *key, *mask; }; struct flow_match_tcp { struct flow_dissector_key_tcp *key, *mask; }; struct flow_match_ipsec { struct flow_dissector_key_ipsec *key, *mask; }; struct flow_match_mpls { struct flow_dissector_key_mpls *key, *mask; }; struct flow_match_enc_keyid { struct flow_dissector_key_keyid *key, *mask; }; struct flow_match_enc_opts { struct flow_dissector_key_enc_opts *key, *mask; }; struct flow_match_ct { struct flow_dissector_key_ct *key, *mask; }; struct flow_match_pppoe { struct flow_dissector_key_pppoe *key, *mask; }; struct flow_match_l2tpv3 { struct flow_dissector_key_l2tpv3 *key, *mask; }; struct flow_rule; void flow_rule_match_meta(const struct flow_rule *rule, struct flow_match_meta *out); void flow_rule_match_basic(const struct flow_rule *rule, struct flow_match_basic *out); void flow_rule_match_control(const struct flow_rule *rule, struct flow_match_control *out); void flow_rule_match_eth_addrs(const struct flow_rule *rule, struct flow_match_eth_addrs *out); void flow_rule_match_vlan(const struct flow_rule *rule, struct flow_match_vlan *out); void flow_rule_match_cvlan(const struct flow_rule *rule, struct flow_match_vlan *out); void flow_rule_match_arp(const struct flow_rule *rule, struct flow_match_arp *out); void flow_rule_match_ipv4_addrs(const struct flow_rule *rule, struct flow_match_ipv4_addrs *out); void flow_rule_match_ipv6_addrs(const struct flow_rule *rule, struct flow_match_ipv6_addrs *out); void flow_rule_match_ip(const struct flow_rule *rule, struct flow_match_ip *out); void flow_rule_match_ports(const struct flow_rule *rule, struct flow_match_ports *out); void flow_rule_match_ports_range(const struct flow_rule *rule, struct flow_match_ports_range *out); void flow_rule_match_tcp(const struct flow_rule *rule, struct flow_match_tcp *out); void flow_rule_match_ipsec(const struct flow_rule *rule, struct flow_match_ipsec *out); void flow_rule_match_icmp(const struct flow_rule *rule, struct flow_match_icmp *out); void 
flow_rule_match_mpls(const struct flow_rule *rule, struct flow_match_mpls *out); void flow_rule_match_enc_control(const struct flow_rule *rule, struct flow_match_control *out); void flow_rule_match_enc_ipv4_addrs(const struct flow_rule *rule, struct flow_match_ipv4_addrs *out); void flow_rule_match_enc_ipv6_addrs(const struct flow_rule *rule, struct flow_match_ipv6_addrs *out); void flow_rule_match_enc_ip(const struct flow_rule *rule, struct flow_match_ip *out); void flow_rule_match_enc_ports(const struct flow_rule *rule, struct flow_match_ports *out); void flow_rule_match_enc_keyid(const struct flow_rule *rule, struct flow_match_enc_keyid *out); void flow_rule_match_enc_opts(const struct flow_rule *rule, struct flow_match_enc_opts *out); void flow_rule_match_ct(const struct flow_rule *rule, struct flow_match_ct *out); void flow_rule_match_pppoe(const struct flow_rule *rule, struct flow_match_pppoe *out); void flow_rule_match_l2tpv3(const struct flow_rule *rule, struct flow_match_l2tpv3 *out); enum flow_action_id { FLOW_ACTION_ACCEPT = 0, FLOW_ACTION_DROP, FLOW_ACTION_TRAP, FLOW_ACTION_GOTO, FLOW_ACTION_REDIRECT, FLOW_ACTION_MIRRED, FLOW_ACTION_REDIRECT_INGRESS, FLOW_ACTION_MIRRED_INGRESS, FLOW_ACTION_VLAN_PUSH, FLOW_ACTION_VLAN_POP, FLOW_ACTION_VLAN_MANGLE, FLOW_ACTION_TUNNEL_ENCAP, FLOW_ACTION_TUNNEL_DECAP, FLOW_ACTION_MANGLE, FLOW_ACTION_ADD, FLOW_ACTION_CSUM, FLOW_ACTION_MARK, FLOW_ACTION_PTYPE, FLOW_ACTION_PRIORITY, FLOW_ACTION_RX_QUEUE_MAPPING, FLOW_ACTION_WAKE, FLOW_ACTION_QUEUE, FLOW_ACTION_SAMPLE, FLOW_ACTION_POLICE, FLOW_ACTION_CT, FLOW_ACTION_CT_METADATA, FLOW_ACTION_MPLS_PUSH, FLOW_ACTION_MPLS_POP, FLOW_ACTION_MPLS_MANGLE, FLOW_ACTION_GATE, FLOW_ACTION_PPPOE_PUSH, FLOW_ACTION_JUMP, FLOW_ACTION_PIPE, FLOW_ACTION_VLAN_PUSH_ETH, FLOW_ACTION_VLAN_POP_ETH, FLOW_ACTION_CONTINUE, NUM_FLOW_ACTIONS, }; /* This is mirroring enum pedit_header_type definition for easy mapping between * tc pedit action. 
Legacy TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK is mapped to * FLOW_ACT_MANGLE_UNSPEC, which is supported by no driver. */ enum flow_action_mangle_base { FLOW_ACT_MANGLE_UNSPEC = 0, FLOW_ACT_MANGLE_HDR_TYPE_ETH, FLOW_ACT_MANGLE_HDR_TYPE_IP4, FLOW_ACT_MANGLE_HDR_TYPE_IP6, FLOW_ACT_MANGLE_HDR_TYPE_TCP, FLOW_ACT_MANGLE_HDR_TYPE_UDP, }; enum flow_action_hw_stats_bit { FLOW_ACTION_HW_STATS_IMMEDIATE_BIT, FLOW_ACTION_HW_STATS_DELAYED_BIT, FLOW_ACTION_HW_STATS_DISABLED_BIT, FLOW_ACTION_HW_STATS_NUM_BITS }; enum flow_action_hw_stats { FLOW_ACTION_HW_STATS_IMMEDIATE = BIT(FLOW_ACTION_HW_STATS_IMMEDIATE_BIT), FLOW_ACTION_HW_STATS_DELAYED = BIT(FLOW_ACTION_HW_STATS_DELAYED_BIT), FLOW_ACTION_HW_STATS_ANY = FLOW_ACTION_HW_STATS_IMMEDIATE | FLOW_ACTION_HW_STATS_DELAYED, FLOW_ACTION_HW_STATS_DISABLED = BIT(FLOW_ACTION_HW_STATS_DISABLED_BIT), FLOW_ACTION_HW_STATS_DONT_CARE = BIT(FLOW_ACTION_HW_STATS_NUM_BITS) - 1, }; typedef void (*action_destr)(void *priv); struct flow_action_cookie { u32 cookie_len; u8 cookie[]; }; struct flow_action_cookie *flow_action_cookie_create(void *data, unsigned int len, gfp_t gfp); void flow_action_cookie_destroy(struct flow_action_cookie *cookie); struct flow_action_entry { enum flow_action_id id; u32 hw_index; unsigned long cookie; u64 miss_cookie; enum flow_action_hw_stats hw_stats; action_destr destructor; void *destructor_priv; union { u32 chain_index; /* FLOW_ACTION_GOTO */ struct net_device *dev; /* FLOW_ACTION_REDIRECT */ struct { /* FLOW_ACTION_VLAN */ u16 vid; __be16 proto; u8 prio; } vlan; struct { /* FLOW_ACTION_VLAN_PUSH_ETH */ unsigned char dst[ETH_ALEN]; unsigned char src[ETH_ALEN]; } vlan_push_eth; struct { /* FLOW_ACTION_MANGLE */ /* FLOW_ACTION_ADD */ enum flow_action_mangle_base htype; u32 offset; u32 mask; u32 val; } mangle; struct ip_tunnel_info *tunnel; /* FLOW_ACTION_TUNNEL_ENCAP */ u32 csum_flags; /* FLOW_ACTION_CSUM */ u32 mark; /* FLOW_ACTION_MARK */ u16 ptype; /* FLOW_ACTION_PTYPE */ u16 rx_queue; /* FLOW_ACTION_RX_QUEUE_MAPPING */ 
u32 priority; /* FLOW_ACTION_PRIORITY */ struct { /* FLOW_ACTION_QUEUE */ u32 ctx; u32 index; u8 vf; } queue; struct { /* FLOW_ACTION_SAMPLE */ struct psample_group *psample_group; u32 rate; u32 trunc_size; bool truncate; } sample; struct { /* FLOW_ACTION_POLICE */ u32 burst; u64 rate_bytes_ps; u64 peakrate_bytes_ps; u32 avrate; u16 overhead; u64 burst_pkt; u64 rate_pkt_ps; u32 mtu; struct { enum flow_action_id act_id; u32 extval; } exceed, notexceed; } police; struct { /* FLOW_ACTION_CT */ int action; u16 zone; struct nf_flowtable *flow_table; } ct; struct { unsigned long cookie; u32 mark; u32 labels[4]; bool orig_dir; } ct_metadata; struct { /* FLOW_ACTION_MPLS_PUSH */ u32 label; __be16 proto; u8 tc; u8 bos; u8 ttl; } mpls_push; struct { /* FLOW_ACTION_MPLS_POP */ __be16 proto; } mpls_pop; struct { /* FLOW_ACTION_MPLS_MANGLE */ u32 label; u8 tc; u8 bos; u8 ttl; } mpls_mangle; struct { s32 prio; u64 basetime; u64 cycletime; u64 cycletimeext; u32 num_entries; struct action_gate_entry *entries; } gate; struct { /* FLOW_ACTION_PPPOE_PUSH */ u16 sid; } pppoe; }; struct flow_action_cookie *user_cookie; /* user defined action cookie */ }; struct flow_action { unsigned int num_entries; struct flow_action_entry entries[] __counted_by(num_entries); }; static inline bool flow_action_has_entries(const struct flow_action *action) { return action->num_entries; } /** * flow_offload_has_one_action() - check if exactly one action is present * @action: tc filter flow offload action * * Return: true if exactly one action is present. 
*/ static inline bool flow_offload_has_one_action(const struct flow_action *action) { return action->num_entries == 1; } static inline bool flow_action_is_last_entry(const struct flow_action *action, const struct flow_action_entry *entry) { return entry == &action->entries[action->num_entries - 1]; } #define flow_action_for_each(__i, __act, __actions) \ for (__i = 0, __act = &(__actions)->entries[0]; \ __i < (__actions)->num_entries; \ __act = &(__actions)->entries[++__i]) static inline bool flow_action_mixed_hw_stats_check(const struct flow_action *action, struct netlink_ext_ack *extack) { const struct flow_action_entry *action_entry; u8 last_hw_stats; int i; if (flow_offload_has_one_action(action)) return true; flow_action_for_each(i, action_entry, action) { if (i && action_entry->hw_stats != last_hw_stats) { NL_SET_ERR_MSG_MOD(extack, "Mixing HW stats types for actions is not supported"); return false; } last_hw_stats = action_entry->hw_stats; } return true; } static inline const struct flow_action_entry * flow_action_first_entry_get(const struct flow_action *action) { WARN_ON(!flow_action_has_entries(action)); return &action->entries[0]; } static inline bool __flow_action_hw_stats_check(const struct flow_action *action, struct netlink_ext_ack *extack, bool check_allow_bit, enum flow_action_hw_stats_bit allow_bit) { const struct flow_action_entry *action_entry; if (!flow_action_has_entries(action)) return true; if (!flow_action_mixed_hw_stats_check(action, extack)) return false; action_entry = flow_action_first_entry_get(action); /* Zero is not a legal value for hw_stats, catch anyone passing it */ WARN_ON_ONCE(!action_entry->hw_stats); if (!check_allow_bit && ~action_entry->hw_stats & FLOW_ACTION_HW_STATS_ANY) { NL_SET_ERR_MSG_MOD(extack, "Driver supports only default HW stats type \"any\""); return false; } else if (check_allow_bit && !(action_entry->hw_stats & BIT(allow_bit))) { NL_SET_ERR_MSG_MOD(extack, "Driver does not support selected HW stats type"); 
return false; } return true; } static inline bool flow_action_hw_stats_check(const struct flow_action *action, struct netlink_ext_ack *extack, enum flow_action_hw_stats_bit allow_bit) { return __flow_action_hw_stats_check(action, extack, true, allow_bit); } static inline bool flow_action_basic_hw_stats_check(const struct flow_action *action, struct netlink_ext_ack *extack) { return __flow_action_hw_stats_check(action, extack, false, 0); } struct flow_rule { struct flow_match match; struct flow_action action; }; struct flow_rule *flow_rule_alloc(unsigned int num_actions); static inline bool flow_rule_match_key(const struct flow_rule *rule, enum flow_dissector_key_id key) { return dissector_uses_key(rule->match.dissector, key); } /** * flow_rule_is_supp_control_flags() - check for supported control flags * @supp_flags: control flags supported by driver * @ctrl_flags: control flags present in rule * @extack: The netlink extended ACK for reporting errors. * * Return: true if only supported control flags are set, false otherwise. */ static inline bool flow_rule_is_supp_control_flags(const u32 supp_flags, const u32 ctrl_flags, struct netlink_ext_ack *extack) { if (likely((ctrl_flags & ~supp_flags) == 0)) return true; NL_SET_ERR_MSG_FMT_MOD(extack, "Unsupported match on control.flags %#x", ctrl_flags); return false; } /** * flow_rule_is_supp_enc_control_flags() - check for supported control flags * @supp_enc_flags: encapsulation control flags supported by driver * @enc_ctrl_flags: encapsulation control flags present in rule * @extack: The netlink extended ACK for reporting errors. * * Return: true if only supported control flags are set, false otherwise. 
*/ static inline bool flow_rule_is_supp_enc_control_flags(const u32 supp_enc_flags, const u32 enc_ctrl_flags, struct netlink_ext_ack *extack) { if (likely((enc_ctrl_flags & ~supp_enc_flags) == 0)) return true; NL_SET_ERR_MSG_FMT_MOD(extack, "Unsupported match on enc_control.flags %#x", enc_ctrl_flags); return false; } /** * flow_rule_has_control_flags() - check for presence of any control flags * @ctrl_flags: control flags present in rule * @extack: The netlink extended ACK for reporting errors. * * Return: true if control flags are set, false otherwise. */ static inline bool flow_rule_has_control_flags(const u32 ctrl_flags, struct netlink_ext_ack *extack) { return !flow_rule_is_supp_control_flags(0, ctrl_flags, extack); } /** * flow_rule_has_enc_control_flags() - check for presence of any control flags * @enc_ctrl_flags: encapsulation control flags present in rule * @extack: The netlink extended ACK for reporting errors. * * Return: true if control flags are set, false otherwise. */ static inline bool flow_rule_has_enc_control_flags(const u32 enc_ctrl_flags, struct netlink_ext_ack *extack) { return !flow_rule_is_supp_enc_control_flags(0, enc_ctrl_flags, extack); } /** * flow_rule_match_has_control_flags() - match and check for any control flags * @rule: The flow_rule under evaluation. * @extack: The netlink extended ACK for reporting errors. * * Return: true if control flags are set, false otherwise. 
*/ static inline bool flow_rule_match_has_control_flags(struct flow_rule *rule, struct netlink_ext_ack *extack) { struct flow_match_control match; if (!flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_CONTROL)) return false; flow_rule_match_control(rule, &match); return flow_rule_has_control_flags(match.mask->flags, extack); } struct flow_stats { u64 pkts; u64 bytes; u64 drops; u64 lastused; enum flow_action_hw_stats used_hw_stats; bool used_hw_stats_valid; }; static inline void flow_stats_update(struct flow_stats *flow_stats, u64 bytes, u64 pkts, u64 drops, u64 lastused, enum flow_action_hw_stats used_hw_stats) { flow_stats->pkts += pkts; flow_stats->bytes += bytes; flow_stats->drops += drops; flow_stats->lastused = max_t(u64, flow_stats->lastused, lastused); /* The driver should pass value with a maximum of one bit set. * Passing FLOW_ACTION_HW_STATS_ANY is invalid. */ WARN_ON(used_hw_stats == FLOW_ACTION_HW_STATS_ANY); flow_stats->used_hw_stats |= used_hw_stats; flow_stats->used_hw_stats_valid = true; } enum flow_block_command { FLOW_BLOCK_BIND, FLOW_BLOCK_UNBIND, }; enum flow_block_binder_type { FLOW_BLOCK_BINDER_TYPE_UNSPEC, FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS, FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS, FLOW_BLOCK_BINDER_TYPE_RED_EARLY_DROP, FLOW_BLOCK_BINDER_TYPE_RED_MARK, }; struct flow_block { struct list_head cb_list; }; struct netlink_ext_ack; struct flow_block_offload { enum flow_block_command command; enum flow_block_binder_type binder_type; bool block_shared; bool unlocked_driver_cb; struct net *net; struct flow_block *block; struct list_head cb_list; struct list_head *driver_block_list; struct netlink_ext_ack *extack; struct Qdisc *sch; struct list_head *cb_list_head; }; enum tc_setup_type; typedef int flow_setup_cb_t(enum tc_setup_type type, void *type_data, void *cb_priv); struct flow_block_cb; struct flow_block_indr { struct list_head list; struct net_device *dev; struct Qdisc *sch; enum flow_block_binder_type binder_type; void *data; void *cb_priv; void 
(*cleanup)(struct flow_block_cb *block_cb); }; struct flow_block_cb { struct list_head driver_list; struct list_head list; flow_setup_cb_t *cb; void *cb_ident; void *cb_priv; void (*release)(void *cb_priv); struct flow_block_indr indr; unsigned int refcnt; }; struct flow_block_cb *flow_block_cb_alloc(flow_setup_cb_t *cb, void *cb_ident, void *cb_priv, void (*release)(void *cb_priv)); struct flow_block_cb *flow_indr_block_cb_alloc(flow_setup_cb_t *cb, void *cb_ident, void *cb_priv, void (*release)(void *cb_priv), struct flow_block_offload *bo, struct net_device *dev, struct Qdisc *sch, void *data, void *indr_cb_priv, void (*cleanup)(struct flow_block_cb *block_cb)); void flow_block_cb_free(struct flow_block_cb *block_cb); struct flow_block_cb *flow_block_cb_lookup(struct flow_block *block, flow_setup_cb_t *cb, void *cb_ident); void *flow_block_cb_priv(struct flow_block_cb *block_cb); void flow_block_cb_incref(struct flow_block_cb *block_cb); unsigned int flow_block_cb_decref(struct flow_block_cb *block_cb); static inline void flow_block_cb_add(struct flow_block_cb *block_cb, struct flow_block_offload *offload) { list_add_tail(&block_cb->list, &offload->cb_list); } static inline void flow_block_cb_remove(struct flow_block_cb *block_cb, struct flow_block_offload *offload) { list_move(&block_cb->list, &offload->cb_list); } static inline void flow_indr_block_cb_remove(struct flow_block_cb *block_cb, struct flow_block_offload *offload) { list_del(&block_cb->indr.list); list_move(&block_cb->list, &offload->cb_list); } bool flow_block_cb_is_busy(flow_setup_cb_t *cb, void *cb_ident, struct list_head *driver_block_list); int flow_block_cb_setup_simple(struct flow_block_offload *f, struct list_head *driver_list, flow_setup_cb_t *cb, void *cb_ident, void *cb_priv, bool ingress_only); enum flow_cls_command { FLOW_CLS_REPLACE, FLOW_CLS_DESTROY, FLOW_CLS_STATS, FLOW_CLS_TMPLT_CREATE, FLOW_CLS_TMPLT_DESTROY, }; struct flow_cls_common_offload { u32 chain_index; __be16 protocol; u32 
prio; bool skip_sw; struct netlink_ext_ack *extack; }; struct flow_cls_offload { struct flow_cls_common_offload common; enum flow_cls_command command; bool use_act_stats; unsigned long cookie; struct flow_rule *rule; struct flow_stats stats; u32 classid; }; enum offload_act_command { FLOW_ACT_REPLACE, FLOW_ACT_DESTROY, FLOW_ACT_STATS, }; struct flow_offload_action { struct netlink_ext_ack *extack; /* NULL in FLOW_ACT_STATS process*/ enum offload_act_command command; enum flow_action_id id; u32 index; unsigned long cookie; struct flow_stats stats; struct flow_action action; }; struct flow_offload_action *offload_action_alloc(unsigned int num_actions); static inline struct flow_rule * flow_cls_offload_flow_rule(struct flow_cls_offload *flow_cmd) { return flow_cmd->rule; } static inline void flow_block_init(struct flow_block *flow_block) { INIT_LIST_HEAD(&flow_block->cb_list); } typedef int flow_indr_block_bind_cb_t(struct net_device *dev, struct Qdisc *sch, void *cb_priv, enum tc_setup_type type, void *type_data, void *data, void (*cleanup)(struct flow_block_cb *block_cb)); int flow_indr_dev_register(flow_indr_block_bind_cb_t *cb, void *cb_priv); void flow_indr_dev_unregister(flow_indr_block_bind_cb_t *cb, void *cb_priv, void (*release)(void *cb_priv)); int flow_indr_dev_setup_offload(struct net_device *dev, struct Qdisc *sch, enum tc_setup_type type, void *data, struct flow_block_offload *bo, void (*cleanup)(struct flow_block_cb *block_cb)); bool flow_indr_dev_exists(void); #endif /* _NET_FLOW_OFFLOAD_H */ |
3 6 6 6 1 2 3 6 6 2 1 1 1 8 1 1 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 | // SPDX-License-Identifier: GPL-2.0-only /* * vsock sock_diag(7) module * * Copyright (C) 2017 Red Hat, Inc. * Author: Stefan Hajnoczi <stefanha@redhat.com> */ #include <linux/module.h> #include <linux/sock_diag.h> #include <linux/vm_sockets_diag.h> #include <net/af_vsock.h> static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, u32 portid, u32 seq, u32 flags) { struct vsock_sock *vsk = vsock_sk(sk); struct vsock_diag_msg *rep; struct nlmsghdr *nlh; nlh = nlmsg_put(skb, portid, seq, SOCK_DIAG_BY_FAMILY, sizeof(*rep), flags); if (!nlh) return -EMSGSIZE; rep = nlmsg_data(nlh); rep->vdiag_family = AF_VSOCK; /* Lock order dictates that sk_lock is acquired before * vsock_table_lock, so we cannot lock here. Simply don't take * sk_lock; sk is guaranteed to stay alive since vsock_table_lock is * held. 
*/ rep->vdiag_type = sk->sk_type; rep->vdiag_state = sk->sk_state; rep->vdiag_shutdown = sk->sk_shutdown; rep->vdiag_src_cid = vsk->local_addr.svm_cid; rep->vdiag_src_port = vsk->local_addr.svm_port; rep->vdiag_dst_cid = vsk->remote_addr.svm_cid; rep->vdiag_dst_port = vsk->remote_addr.svm_port; rep->vdiag_ino = sock_i_ino(sk); sock_diag_save_cookie(sk, rep->vdiag_cookie); return 0; } static int vsock_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct vsock_diag_req *req; struct vsock_sock *vsk; unsigned int bucket; unsigned int last_i; unsigned int table; struct net *net; unsigned int i; req = nlmsg_data(cb->nlh); net = sock_net(skb->sk); /* State saved between calls: */ table = cb->args[0]; bucket = cb->args[1]; i = last_i = cb->args[2]; /* TODO VMCI pending sockets? */ spin_lock_bh(&vsock_table_lock); /* Bind table (locally created sockets) */ if (table == 0) { while (bucket < ARRAY_SIZE(vsock_bind_table)) { struct list_head *head = &vsock_bind_table[bucket]; i = 0; list_for_each_entry(vsk, head, bound_table) { struct sock *sk = sk_vsock(vsk); if (!net_eq(sock_net(sk), net)) continue; if (i < last_i) goto next_bind; if (!(req->vdiag_states & (1 << sk->sk_state))) goto next_bind; if (sk_diag_fill(sk, skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI) < 0) goto done; next_bind: i++; } last_i = 0; bucket++; } table++; bucket = 0; } /* Connected table (accepted connections) */ while (bucket < ARRAY_SIZE(vsock_connected_table)) { struct list_head *head = &vsock_connected_table[bucket]; i = 0; list_for_each_entry(vsk, head, connected_table) { struct sock *sk = sk_vsock(vsk); /* Skip sockets we've already seen above */ if (__vsock_in_bound_table(vsk)) continue; if (!net_eq(sock_net(sk), net)) continue; if (i < last_i) goto next_connected; if (!(req->vdiag_states & (1 << sk->sk_state))) goto next_connected; if (sk_diag_fill(sk, skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI) < 0) goto done; next_connected: i++; } last_i 
= 0; bucket++; } done: spin_unlock_bh(&vsock_table_lock); cb->args[0] = table; cb->args[1] = bucket; cb->args[2] = i; return skb->len; } static int vsock_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) { int hdrlen = sizeof(struct vsock_diag_req); struct net *net = sock_net(skb->sk); if (nlmsg_len(h) < hdrlen) return -EINVAL; if (h->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = vsock_diag_dump, }; return netlink_dump_start(net->diag_nlsk, skb, h, &c); } return -EOPNOTSUPP; } static const struct sock_diag_handler vsock_diag_handler = { .owner = THIS_MODULE, .family = AF_VSOCK, .dump = vsock_diag_handler_dump, }; static int __init vsock_diag_init(void) { return sock_diag_register(&vsock_diag_handler); } static void __exit vsock_diag_exit(void) { sock_diag_unregister(&vsock_diag_handler); } module_init(vsock_diag_init); module_exit(vsock_diag_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("VMware Virtual Sockets monitoring via SOCK_DIAG"); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 40 /* AF_VSOCK */); |
469 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 | // SPDX-License-Identifier: GPL-2.0+ /* * linux/net/sunrpc/gss_rpc_upcall.c * * Copyright (C) 2012 Simo Sorce <simo@redhat.com> */ #include <linux/types.h> #include <linux/un.h> #include <linux/sunrpc/svcauth.h> #include "gss_rpc_upcall.h" #define GSSPROXY_SOCK_PATHNAME "/var/run/gssproxy.sock" #define GSSPROXY_PROGRAM (400112u) #define GSSPROXY_VERS_1 (1u) /* * Encoding/Decoding functions */ enum { GSSX_NULL = 0, /* Unused */ GSSX_INDICATE_MECHS = 1, GSSX_GET_CALL_CONTEXT = 2, 
GSSX_IMPORT_AND_CANON_NAME = 3, GSSX_EXPORT_CRED = 4, GSSX_IMPORT_CRED = 5, GSSX_ACQUIRE_CRED = 6, GSSX_STORE_CRED = 7, GSSX_INIT_SEC_CONTEXT = 8, GSSX_ACCEPT_SEC_CONTEXT = 9, GSSX_RELEASE_HANDLE = 10, GSSX_GET_MIC = 11, GSSX_VERIFY = 12, GSSX_WRAP = 13, GSSX_UNWRAP = 14, GSSX_WRAP_SIZE_LIMIT = 15, }; #define PROC(proc, name) \ [GSSX_##proc] = { \ .p_proc = GSSX_##proc, \ .p_encode = gssx_enc_##name, \ .p_decode = gssx_dec_##name, \ .p_arglen = GSSX_ARG_##name##_sz, \ .p_replen = GSSX_RES_##name##_sz, \ .p_statidx = GSSX_##proc, \ .p_name = #proc, \ } static const struct rpc_procinfo gssp_procedures[] = { PROC(INDICATE_MECHS, indicate_mechs), PROC(GET_CALL_CONTEXT, get_call_context), PROC(IMPORT_AND_CANON_NAME, import_and_canon_name), PROC(EXPORT_CRED, export_cred), PROC(IMPORT_CRED, import_cred), PROC(ACQUIRE_CRED, acquire_cred), PROC(STORE_CRED, store_cred), PROC(INIT_SEC_CONTEXT, init_sec_context), PROC(ACCEPT_SEC_CONTEXT, accept_sec_context), PROC(RELEASE_HANDLE, release_handle), PROC(GET_MIC, get_mic), PROC(VERIFY, verify), PROC(WRAP, wrap), PROC(UNWRAP, unwrap), PROC(WRAP_SIZE_LIMIT, wrap_size_limit), }; /* * Common transport functions */ static const struct rpc_program gssp_program; static int gssp_rpc_create(struct net *net, struct rpc_clnt **_clnt) { static const struct sockaddr_un gssp_localaddr = { .sun_family = AF_LOCAL, .sun_path = GSSPROXY_SOCK_PATHNAME, }; struct rpc_create_args args = { .net = net, .protocol = XPRT_TRANSPORT_LOCAL, .address = (struct sockaddr *)&gssp_localaddr, .addrsize = sizeof(gssp_localaddr), .servername = "localhost", .program = &gssp_program, .version = GSSPROXY_VERS_1, .authflavor = RPC_AUTH_NULL, /* * Note we want connection to be done in the caller's * filesystem namespace. 
We therefore turn off the idle * timeout, which would result in reconnections being * done without the correct namespace: */ .flags = RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_CONNECTED | RPC_CLNT_CREATE_NO_IDLE_TIMEOUT }; struct rpc_clnt *clnt; int result = 0; clnt = rpc_create(&args); if (IS_ERR(clnt)) { dprintk("RPC: failed to create AF_LOCAL gssproxy " "client (errno %ld).\n", PTR_ERR(clnt)); result = PTR_ERR(clnt); *_clnt = NULL; goto out; } dprintk("RPC: created new gssp local client (gssp_local_clnt: " "%p)\n", clnt); *_clnt = clnt; out: return result; } void init_gssp_clnt(struct sunrpc_net *sn) { mutex_init(&sn->gssp_lock); sn->gssp_clnt = NULL; } int set_gssp_clnt(struct net *net) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); struct rpc_clnt *clnt; int ret; mutex_lock(&sn->gssp_lock); ret = gssp_rpc_create(net, &clnt); if (!ret) { if (sn->gssp_clnt) rpc_shutdown_client(sn->gssp_clnt); sn->gssp_clnt = clnt; } mutex_unlock(&sn->gssp_lock); return ret; } void clear_gssp_clnt(struct sunrpc_net *sn) { mutex_lock(&sn->gssp_lock); if (sn->gssp_clnt) { rpc_shutdown_client(sn->gssp_clnt); sn->gssp_clnt = NULL; } mutex_unlock(&sn->gssp_lock); } static struct rpc_clnt *get_gssp_clnt(struct sunrpc_net *sn) { struct rpc_clnt *clnt; mutex_lock(&sn->gssp_lock); clnt = sn->gssp_clnt; if (clnt) refcount_inc(&clnt->cl_count); mutex_unlock(&sn->gssp_lock); return clnt; } static int gssp_call(struct net *net, struct rpc_message *msg) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); struct rpc_clnt *clnt; int status; clnt = get_gssp_clnt(sn); if (!clnt) return -EIO; status = rpc_call_sync(clnt, msg, 0); if (status < 0) { dprintk("gssp: rpc_call returned error %d\n", -status); switch (status) { case -EPROTONOSUPPORT: status = -EINVAL; break; case -ECONNREFUSED: case -ETIMEDOUT: case -ENOTCONN: status = -EAGAIN; break; case -ERESTARTSYS: if (signalled ()) status = -EINTR; break; default: break; } } rpc_release_client(clnt); return status; } static void 
gssp_free_receive_pages(struct gssx_arg_accept_sec_context *arg) { unsigned int i; for (i = 0; i < arg->npages && arg->pages[i]; i++) __free_page(arg->pages[i]); kfree(arg->pages); } static int gssp_alloc_receive_pages(struct gssx_arg_accept_sec_context *arg) { unsigned int i; arg->npages = DIV_ROUND_UP(NGROUPS_MAX * 4, PAGE_SIZE); arg->pages = kcalloc(arg->npages, sizeof(struct page *), GFP_KERNEL); if (!arg->pages) return -ENOMEM; for (i = 0; i < arg->npages; i++) { arg->pages[i] = alloc_page(GFP_KERNEL); if (!arg->pages[i]) { gssp_free_receive_pages(arg); return -ENOMEM; } } return 0; } static char *gssp_stringify(struct xdr_netobj *netobj) { return kmemdup_nul(netobj->data, netobj->len, GFP_KERNEL); } static void gssp_hostbased_service(char **principal) { char *c; if (!*principal) return; /* terminate and remove realm part */ c = strchr(*principal, '@'); if (c) { *c = '\0'; /* change service-hostname delimiter */ c = strchr(*principal, '/'); if (c) *c = '@'; } if (!c) { /* not a service principal */ kfree(*principal); *principal = NULL; } } /* * Public functions */ /* numbers somewhat arbitrary but large enough for current needs */ #define GSSX_MAX_OUT_HANDLE 128 #define GSSX_MAX_SRC_PRINC 256 #define GSSX_KMEMBUF (GSSX_max_output_handle_sz + \ GSSX_max_oid_sz + \ GSSX_max_princ_sz + \ sizeof(struct svc_cred)) int gssp_accept_sec_context_upcall(struct net *net, struct gssp_upcall_data *data) { struct gssx_ctx ctxh = { .state = data->in_handle }; struct gssx_arg_accept_sec_context arg = { .input_token = data->in_token, }; struct gssx_ctx rctxh = { /* * pass in the max length we expect for each of these * buffers but let the xdr code kmalloc them: */ .exported_context_token.len = GSSX_max_output_handle_sz, .mech.len = GSS_OID_MAX_LEN, .targ_name.display_name.len = GSSX_max_princ_sz, .src_name.display_name.len = GSSX_max_princ_sz }; struct gssx_res_accept_sec_context res = { .context_handle = &rctxh, .output_token = &data->out_token }; struct rpc_message msg = { 
.rpc_proc = &gssp_procedures[GSSX_ACCEPT_SEC_CONTEXT], .rpc_argp = &arg, .rpc_resp = &res, .rpc_cred = NULL, /* FIXME ? */ }; struct xdr_netobj client_name = { 0 , NULL }; struct xdr_netobj target_name = { 0, NULL }; int ret; if (data->in_handle.len != 0) arg.context_handle = &ctxh; res.output_token->len = GSSX_max_output_token_sz; ret = gssp_alloc_receive_pages(&arg); if (ret) return ret; ret = gssp_call(net, &msg); gssp_free_receive_pages(&arg); /* we need to fetch all data even in case of error so * that we can free special strctures is they have been allocated */ data->major_status = res.status.major_status; data->minor_status = res.status.minor_status; if (res.context_handle) { data->out_handle = rctxh.exported_context_token; data->mech_oid.len = rctxh.mech.len; if (rctxh.mech.data) { memcpy(data->mech_oid.data, rctxh.mech.data, data->mech_oid.len); kfree(rctxh.mech.data); } client_name = rctxh.src_name.display_name; target_name = rctxh.targ_name.display_name; } if (res.options.count == 1) { gssx_buffer *value = &res.options.data[0].value; /* Currently we only decode CREDS_VALUE, if we add * anything else we'll have to loop and match on the * option name */ if (value->len == 1) { /* steal group info from struct svc_cred */ data->creds = *(struct svc_cred *)value->data; data->found_creds = 1; } /* whether we use it or not, free data */ kfree(value->data); } if (res.options.count != 0) { kfree(res.options.data); } /* convert to GSS_NT_HOSTBASED_SERVICE form and set into creds */ if (data->found_creds) { if (client_name.data) { data->creds.cr_raw_principal = gssp_stringify(&client_name); data->creds.cr_principal = gssp_stringify(&client_name); gssp_hostbased_service(&data->creds.cr_principal); } if (target_name.data) { data->creds.cr_targ_princ = gssp_stringify(&target_name); gssp_hostbased_service(&data->creds.cr_targ_princ); } } kfree(client_name.data); kfree(target_name.data); return ret; } void gssp_free_upcall_data(struct gssp_upcall_data *data) { 
kfree(data->in_handle.data); kfree(data->out_handle.data); kfree(data->out_token.data); free_svc_cred(&data->creds); } /* * Initialization stuff */ static unsigned int gssp_version1_counts[ARRAY_SIZE(gssp_procedures)]; static const struct rpc_version gssp_version1 = { .number = GSSPROXY_VERS_1, .nrprocs = ARRAY_SIZE(gssp_procedures), .procs = gssp_procedures, .counts = gssp_version1_counts, }; static const struct rpc_version *gssp_version[] = { NULL, &gssp_version1, }; static struct rpc_stat gssp_stats; static const struct rpc_program gssp_program = { .name = "gssproxy", .number = GSSPROXY_PROGRAM, .nrvers = ARRAY_SIZE(gssp_version), .version = gssp_version, .stats = &gssp_stats, }; |
6 6 6 18 18 18 18 18 18 31 31 31 3 28 3 31 14 62 53 2 62 62 4 56 53 2 55 55 9 9 9 6 6 6 4069 4002 124 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 
502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2007-2012 Siemens AG * * Written by: * Dmitry Eremin-Solenikov <dbaryshkov@gmail.com> * Sergey Lapin <slapin@ossfans.org> * Maxim Gorbachyov <maxim.gorbachev@siemens.com> * Alexander Smirnov <alex.bluesman.smirnov@gmail.com> */ #include <linux/netdevice.h> #include <linux/module.h> #include <linux/if_arp.h> #include <linux/ieee802154.h> #include <net/nl802154.h> #include <net/mac802154.h> #include <net/ieee802154_netdev.h> #include <net/cfg802154.h> #include "ieee802154_i.h" #include "driver-ops.h" int mac802154_wpan_update_llsec(struct net_device *dev) { struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); struct ieee802154_mlme_ops *ops = ieee802154_mlme_ops(dev); struct wpan_dev *wpan_dev = &sdata->wpan_dev; int rc = 0; if (ops->llsec) { struct ieee802154_llsec_params params; int changed = 0; params.pan_id = wpan_dev->pan_id; changed |= IEEE802154_LLSEC_PARAM_PAN_ID; params.hwaddr = wpan_dev->extended_addr; changed |= 
IEEE802154_LLSEC_PARAM_HWADDR; rc = ops->llsec->set_params(dev, ¶ms, changed); } return rc; } static int mac802154_wpan_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); struct wpan_dev *wpan_dev = &sdata->wpan_dev; struct sockaddr_ieee802154 *sa = (struct sockaddr_ieee802154 *)&ifr->ifr_addr; int err = -ENOIOCTLCMD; if (cmd != SIOCGIFADDR && cmd != SIOCSIFADDR) return err; rtnl_lock(); switch (cmd) { case SIOCGIFADDR: { u16 pan_id, short_addr; pan_id = le16_to_cpu(wpan_dev->pan_id); short_addr = le16_to_cpu(wpan_dev->short_addr); if (pan_id == IEEE802154_PANID_BROADCAST || short_addr == IEEE802154_ADDR_BROADCAST) { err = -EADDRNOTAVAIL; break; } sa->family = AF_IEEE802154; sa->addr.addr_type = IEEE802154_ADDR_SHORT; sa->addr.pan_id = pan_id; sa->addr.short_addr = short_addr; err = 0; break; } case SIOCSIFADDR: if (netif_running(dev)) { rtnl_unlock(); return -EBUSY; } dev_warn(&dev->dev, "Using DEBUGing ioctl SIOCSIFADDR isn't recommended!\n"); if (sa->family != AF_IEEE802154 || sa->addr.addr_type != IEEE802154_ADDR_SHORT || sa->addr.pan_id == IEEE802154_PANID_BROADCAST || sa->addr.short_addr == IEEE802154_ADDR_BROADCAST || sa->addr.short_addr == IEEE802154_ADDR_UNDEF) { err = -EINVAL; break; } wpan_dev->pan_id = cpu_to_le16(sa->addr.pan_id); wpan_dev->short_addr = cpu_to_le16(sa->addr.short_addr); err = mac802154_wpan_update_llsec(dev); break; } rtnl_unlock(); return err; } static int mac802154_wpan_mac_addr(struct net_device *dev, void *p) { struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); struct sockaddr *addr = p; __le64 extended_addr; if (netif_running(dev)) return -EBUSY; /* lowpan need to be down for update * SLAAC address after ifup */ if (sdata->wpan_dev.lowpan_dev) { if (netif_running(sdata->wpan_dev.lowpan_dev)) return -EBUSY; } ieee802154_be64_to_le64(&extended_addr, addr->sa_data); if (!ieee802154_is_valid_extended_unicast_addr(extended_addr)) return 
-EINVAL; dev_addr_set(dev, addr->sa_data); sdata->wpan_dev.extended_addr = extended_addr; /* update lowpan interface mac address when * wpan mac has been changed */ if (sdata->wpan_dev.lowpan_dev) dev_addr_set(sdata->wpan_dev.lowpan_dev, dev->dev_addr); return mac802154_wpan_update_llsec(dev); } static int ieee802154_setup_hw(struct ieee802154_sub_if_data *sdata) { struct ieee802154_local *local = sdata->local; struct wpan_dev *wpan_dev = &sdata->wpan_dev; int ret; sdata->required_filtering = sdata->iface_default_filtering; if (local->hw.flags & IEEE802154_HW_AFILT) { local->addr_filt.pan_id = wpan_dev->pan_id; local->addr_filt.ieee_addr = wpan_dev->extended_addr; local->addr_filt.short_addr = wpan_dev->short_addr; } if (local->hw.flags & IEEE802154_HW_LBT) { ret = drv_set_lbt_mode(local, wpan_dev->lbt); if (ret < 0) return ret; } if (local->hw.flags & IEEE802154_HW_CSMA_PARAMS) { ret = drv_set_csma_params(local, wpan_dev->min_be, wpan_dev->max_be, wpan_dev->csma_retries); if (ret < 0) return ret; } if (local->hw.flags & IEEE802154_HW_FRAME_RETRIES) { ret = drv_set_max_frame_retries(local, wpan_dev->frame_retries); if (ret < 0) return ret; } return 0; } static int mac802154_slave_open(struct net_device *dev) { struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); struct ieee802154_local *local = sdata->local; int res; ASSERT_RTNL(); set_bit(SDATA_STATE_RUNNING, &sdata->state); if (!local->open_count) { res = ieee802154_setup_hw(sdata); if (res) goto err; res = drv_start(local, sdata->required_filtering, &local->addr_filt); if (res) goto err; } local->open_count++; netif_start_queue(dev); return 0; err: /* might already be clear but that doesn't matter */ clear_bit(SDATA_STATE_RUNNING, &sdata->state); return res; } static int ieee802154_check_mac_settings(struct ieee802154_local *local, struct ieee802154_sub_if_data *sdata, struct ieee802154_sub_if_data *nsdata) { struct wpan_dev *nwpan_dev = &nsdata->wpan_dev; struct wpan_dev *wpan_dev = 
&sdata->wpan_dev; ASSERT_RTNL(); if (sdata->iface_default_filtering != nsdata->iface_default_filtering) return -EBUSY; if (local->hw.flags & IEEE802154_HW_AFILT) { if (wpan_dev->pan_id != nwpan_dev->pan_id || wpan_dev->short_addr != nwpan_dev->short_addr || wpan_dev->extended_addr != nwpan_dev->extended_addr) return -EBUSY; } if (local->hw.flags & IEEE802154_HW_CSMA_PARAMS) { if (wpan_dev->min_be != nwpan_dev->min_be || wpan_dev->max_be != nwpan_dev->max_be || wpan_dev->csma_retries != nwpan_dev->csma_retries) return -EBUSY; } if (local->hw.flags & IEEE802154_HW_FRAME_RETRIES) { if (wpan_dev->frame_retries != nwpan_dev->frame_retries) return -EBUSY; } if (local->hw.flags & IEEE802154_HW_LBT) { if (wpan_dev->lbt != nwpan_dev->lbt) return -EBUSY; } return 0; } static int ieee802154_check_concurrent_iface(struct ieee802154_sub_if_data *sdata, enum nl802154_iftype iftype) { struct ieee802154_local *local = sdata->local; struct ieee802154_sub_if_data *nsdata; /* we hold the RTNL here so can safely walk the list */ list_for_each_entry(nsdata, &local->interfaces, list) { if (nsdata != sdata && ieee802154_sdata_running(nsdata)) { int ret; /* TODO currently we don't support multiple node/coord * types we need to run skb_clone at rx path. Check if * there exist really an use case if we need to support * multiple node/coord types at the same time. */ if (sdata->wpan_dev.iftype != NL802154_IFTYPE_MONITOR && nsdata->wpan_dev.iftype != NL802154_IFTYPE_MONITOR) return -EBUSY; /* check all phy mac sublayer settings are the same. * We have only one phy, different values makes trouble. 
*/ ret = ieee802154_check_mac_settings(local, sdata, nsdata); if (ret < 0) return ret; } } return 0; } static int mac802154_wpan_open(struct net_device *dev) { int rc; struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); struct wpan_dev *wpan_dev = &sdata->wpan_dev; rc = ieee802154_check_concurrent_iface(sdata, wpan_dev->iftype); if (rc < 0) return rc; return mac802154_slave_open(dev); } static int mac802154_slave_close(struct net_device *dev) { struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); struct ieee802154_local *local = sdata->local; ASSERT_RTNL(); if (mac802154_is_scanning(local)) mac802154_abort_scan_locked(local, sdata); if (mac802154_is_beaconing(local)) mac802154_stop_beacons_locked(local, sdata); netif_stop_queue(dev); local->open_count--; clear_bit(SDATA_STATE_RUNNING, &sdata->state); if (!local->open_count) ieee802154_stop_device(local); return 0; } static int mac802154_set_header_security(struct ieee802154_sub_if_data *sdata, struct ieee802154_hdr *hdr, const struct ieee802154_mac_cb *cb) { struct ieee802154_llsec_params params; u8 level; mac802154_llsec_get_params(&sdata->sec, ¶ms); if (!params.enabled && cb->secen_override && cb->secen) return -EINVAL; if (!params.enabled || (cb->secen_override && !cb->secen) || !params.out_level) return 0; if (cb->seclevel_override && !cb->seclevel) return -EINVAL; level = cb->seclevel_override ? 
cb->seclevel : params.out_level; hdr->fc.security_enabled = 1; hdr->sec.level = level; hdr->sec.key_id_mode = params.out_key.mode; if (params.out_key.mode == IEEE802154_SCF_KEY_SHORT_INDEX) hdr->sec.short_src = params.out_key.short_source; else if (params.out_key.mode == IEEE802154_SCF_KEY_HW_INDEX) hdr->sec.extended_src = params.out_key.extended_source; hdr->sec.key_id = params.out_key.id; return 0; } static int ieee802154_header_create(struct sk_buff *skb, struct net_device *dev, const struct ieee802154_addr *daddr, const struct ieee802154_addr *saddr, unsigned len) { struct ieee802154_hdr hdr; struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); struct wpan_dev *wpan_dev = &sdata->wpan_dev; struct ieee802154_mac_cb *cb = mac_cb(skb); int hlen; if (!daddr) return -EINVAL; memset(&hdr.fc, 0, sizeof(hdr.fc)); hdr.fc.type = cb->type; hdr.fc.security_enabled = cb->secen; hdr.fc.ack_request = cb->ackreq; hdr.seq = atomic_inc_return(&dev->ieee802154_ptr->dsn) & 0xFF; if (mac802154_set_header_security(sdata, &hdr, cb) < 0) return -EINVAL; if (!saddr) { if (wpan_dev->short_addr == cpu_to_le16(IEEE802154_ADDR_BROADCAST) || wpan_dev->short_addr == cpu_to_le16(IEEE802154_ADDR_UNDEF) || wpan_dev->pan_id == cpu_to_le16(IEEE802154_PANID_BROADCAST)) { hdr.source.mode = IEEE802154_ADDR_LONG; hdr.source.extended_addr = wpan_dev->extended_addr; } else { hdr.source.mode = IEEE802154_ADDR_SHORT; hdr.source.short_addr = wpan_dev->short_addr; } hdr.source.pan_id = wpan_dev->pan_id; } else { hdr.source = *(const struct ieee802154_addr *)saddr; } hdr.dest = *(const struct ieee802154_addr *)daddr; hlen = ieee802154_hdr_push(skb, &hdr); if (hlen < 0) return -EINVAL; skb_reset_mac_header(skb); skb->mac_len = hlen; if (len > ieee802154_max_payload(&hdr)) return -EMSGSIZE; return hlen; } static const struct wpan_dev_header_ops ieee802154_header_ops = { .create = ieee802154_header_create, }; /* This header create functionality assumes a 8 byte array for * source and 
destination pointer at maximum. To adapt this for * the 802.15.4 dataframe header we use extended address handling * here only and intra pan connection. fc fields are mostly fallback * handling. For provide dev_hard_header for dgram sockets. */ static int mac802154_header_create(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, unsigned len) { struct ieee802154_hdr hdr; struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); struct wpan_dev *wpan_dev = &sdata->wpan_dev; struct ieee802154_mac_cb cb = { }; int hlen; if (!daddr) return -EINVAL; memset(&hdr.fc, 0, sizeof(hdr.fc)); hdr.fc.type = IEEE802154_FC_TYPE_DATA; hdr.fc.ack_request = wpan_dev->ackreq; hdr.seq = atomic_inc_return(&dev->ieee802154_ptr->dsn) & 0xFF; /* TODO currently a workaround to give zero cb block to set * security parameters defaults according MIB. */ if (mac802154_set_header_security(sdata, &hdr, &cb) < 0) return -EINVAL; hdr.dest.pan_id = wpan_dev->pan_id; hdr.dest.mode = IEEE802154_ADDR_LONG; ieee802154_be64_to_le64(&hdr.dest.extended_addr, daddr); hdr.source.pan_id = hdr.dest.pan_id; hdr.source.mode = IEEE802154_ADDR_LONG; if (!saddr) hdr.source.extended_addr = wpan_dev->extended_addr; else ieee802154_be64_to_le64(&hdr.source.extended_addr, saddr); hlen = ieee802154_hdr_push(skb, &hdr); if (hlen < 0) return -EINVAL; skb_reset_mac_header(skb); skb->mac_len = hlen; if (len > ieee802154_max_payload(&hdr)) return -EMSGSIZE; return hlen; } static int mac802154_header_parse(const struct sk_buff *skb, unsigned char *haddr) { struct ieee802154_hdr hdr; if (ieee802154_hdr_peek_addrs(skb, &hdr) < 0) { pr_debug("malformed packet\n"); return 0; } if (hdr.source.mode == IEEE802154_ADDR_LONG) { ieee802154_le64_to_be64(haddr, &hdr.source.extended_addr); return IEEE802154_EXTENDED_ADDR_LEN; } return 0; } static const struct header_ops mac802154_header_ops = { .create = mac802154_header_create, .parse = mac802154_header_parse, }; static 
const struct net_device_ops mac802154_wpan_ops = { .ndo_open = mac802154_wpan_open, .ndo_stop = mac802154_slave_close, .ndo_start_xmit = ieee802154_subif_start_xmit, .ndo_do_ioctl = mac802154_wpan_ioctl, .ndo_set_mac_address = mac802154_wpan_mac_addr, }; static const struct net_device_ops mac802154_monitor_ops = { .ndo_open = mac802154_wpan_open, .ndo_stop = mac802154_slave_close, .ndo_start_xmit = ieee802154_monitor_start_xmit, }; static void mac802154_wpan_free(struct net_device *dev) { struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); mac802154_llsec_destroy(&sdata->sec); } static void ieee802154_if_setup(struct net_device *dev) { dev->addr_len = IEEE802154_EXTENDED_ADDR_LEN; memset(dev->broadcast, 0xff, IEEE802154_EXTENDED_ADDR_LEN); /* Let hard_header_len set to IEEE802154_MIN_HEADER_LEN. AF_PACKET * will not send frames without any payload, but ack frames * has no payload, so substract one that we can send a 3 bytes * frame. The xmit callback assumes at least a hard header where two * bytes fc and sequence field are set. */ dev->hard_header_len = IEEE802154_MIN_HEADER_LEN - 1; /* The auth_tag header is for security and places in private payload * room of mac frame which stucks between payload and FCS field. */ dev->needed_tailroom = IEEE802154_MAX_AUTH_TAG_LEN + IEEE802154_FCS_LEN; /* The mtu size is the payload without mac header in this case. * We have a dynamic length header with a minimum header length * which is hard_header_len. In this case we let mtu to the size * of maximum payload which is IEEE802154_MTU - IEEE802154_FCS_LEN - * hard_header_len. The FCS which is set by hardware or ndo_start_xmit * and the minimum mac header which can be evaluated inside driver * layer. The rest of mac header will be part of payload if greater * than hard_header_len. 
*/ dev->mtu = IEEE802154_MTU - IEEE802154_FCS_LEN - dev->hard_header_len; dev->tx_queue_len = 300; dev->flags = IFF_NOARP | IFF_BROADCAST; } static int ieee802154_setup_sdata(struct ieee802154_sub_if_data *sdata, enum nl802154_iftype type) { struct wpan_dev *wpan_dev = &sdata->wpan_dev; int ret; u8 tmp; /* set some type-dependent values */ sdata->wpan_dev.iftype = type; get_random_bytes(&tmp, sizeof(tmp)); atomic_set(&wpan_dev->bsn, tmp); get_random_bytes(&tmp, sizeof(tmp)); atomic_set(&wpan_dev->dsn, tmp); /* defaults per 802.15.4-2011 */ wpan_dev->min_be = 3; wpan_dev->max_be = 5; wpan_dev->csma_retries = 4; wpan_dev->frame_retries = 3; wpan_dev->pan_id = cpu_to_le16(IEEE802154_PANID_BROADCAST); wpan_dev->short_addr = cpu_to_le16(IEEE802154_ADDR_BROADCAST); switch (type) { case NL802154_IFTYPE_COORD: case NL802154_IFTYPE_NODE: ieee802154_be64_to_le64(&wpan_dev->extended_addr, sdata->dev->dev_addr); sdata->dev->header_ops = &mac802154_header_ops; sdata->dev->needs_free_netdev = true; sdata->dev->priv_destructor = mac802154_wpan_free; sdata->dev->netdev_ops = &mac802154_wpan_ops; sdata->dev->ml_priv = &mac802154_mlme_wpan; sdata->iface_default_filtering = IEEE802154_FILTERING_4_FRAME_FIELDS; wpan_dev->header_ops = &ieee802154_header_ops; mutex_init(&sdata->sec_mtx); mac802154_llsec_init(&sdata->sec); ret = mac802154_wpan_update_llsec(sdata->dev); if (ret < 0) return ret; break; case NL802154_IFTYPE_MONITOR: sdata->dev->needs_free_netdev = true; sdata->dev->netdev_ops = &mac802154_monitor_ops; sdata->iface_default_filtering = IEEE802154_FILTERING_NONE; break; default: BUG(); } return 0; } struct net_device * ieee802154_if_add(struct ieee802154_local *local, const char *name, unsigned char name_assign_type, enum nl802154_iftype type, __le64 extended_addr) { u8 addr[IEEE802154_EXTENDED_ADDR_LEN]; struct net_device *ndev = NULL; struct ieee802154_sub_if_data *sdata = NULL; int ret; ASSERT_RTNL(); ndev = alloc_netdev(sizeof(*sdata), name, name_assign_type, 
ieee802154_if_setup); if (!ndev) return ERR_PTR(-ENOMEM); ndev->needed_headroom = local->hw.extra_tx_headroom + IEEE802154_MAX_HEADER_LEN; ret = dev_alloc_name(ndev, ndev->name); if (ret < 0) goto err; ieee802154_le64_to_be64(ndev->perm_addr, &local->hw.phy->perm_extended_addr); switch (type) { case NL802154_IFTYPE_COORD: case NL802154_IFTYPE_NODE: ndev->type = ARPHRD_IEEE802154; if (ieee802154_is_valid_extended_unicast_addr(extended_addr)) { ieee802154_le64_to_be64(addr, &extended_addr); dev_addr_set(ndev, addr); } else { dev_addr_set(ndev, ndev->perm_addr); } break; case NL802154_IFTYPE_MONITOR: ndev->type = ARPHRD_IEEE802154_MONITOR; break; default: ret = -EINVAL; goto err; } /* TODO check this */ SET_NETDEV_DEV(ndev, &local->phy->dev); dev_net_set(ndev, wpan_phy_net(local->hw.phy)); sdata = netdev_priv(ndev); ndev->ieee802154_ptr = &sdata->wpan_dev; memcpy(sdata->name, ndev->name, IFNAMSIZ); sdata->dev = ndev; sdata->wpan_dev.wpan_phy = local->hw.phy; sdata->local = local; INIT_LIST_HEAD(&sdata->wpan_dev.list); /* setup type-dependent data */ ret = ieee802154_setup_sdata(sdata, type); if (ret) goto err; ret = register_netdevice(ndev); if (ret < 0) goto err; mutex_lock(&local->iflist_mtx); list_add_tail_rcu(&sdata->list, &local->interfaces); mutex_unlock(&local->iflist_mtx); return ndev; err: free_netdev(ndev); return ERR_PTR(ret); } void ieee802154_if_remove(struct ieee802154_sub_if_data *sdata) { ASSERT_RTNL(); mutex_lock(&sdata->local->iflist_mtx); if (list_empty(&sdata->local->interfaces)) { mutex_unlock(&sdata->local->iflist_mtx); return; } list_del_rcu(&sdata->list); mutex_unlock(&sdata->local->iflist_mtx); synchronize_rcu(); unregister_netdevice(sdata->dev); } void ieee802154_remove_interfaces(struct ieee802154_local *local) { struct ieee802154_sub_if_data *sdata, *tmp; mutex_lock(&local->iflist_mtx); list_for_each_entry_safe(sdata, tmp, &local->interfaces, list) { list_del(&sdata->list); unregister_netdevice(sdata->dev); } 
mutex_unlock(&local->iflist_mtx); } static int netdev_notify(struct notifier_block *nb, unsigned long state, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct ieee802154_sub_if_data *sdata; if (state != NETDEV_CHANGENAME) return NOTIFY_DONE; if (!dev->ieee802154_ptr || !dev->ieee802154_ptr->wpan_phy) return NOTIFY_DONE; if (dev->ieee802154_ptr->wpan_phy->privid != mac802154_wpan_phy_privid) return NOTIFY_DONE; sdata = IEEE802154_DEV_TO_SUB_IF(dev); memcpy(sdata->name, dev->name, IFNAMSIZ); return NOTIFY_OK; } static struct notifier_block mac802154_netdev_notifier = { .notifier_call = netdev_notify, }; int ieee802154_iface_init(void) { return register_netdevice_notifier(&mac802154_netdev_notifier); } void ieee802154_iface_exit(void) { unregister_netdevice_notifier(&mac802154_netdev_notifier); } |
1531 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SCHED_TASK_STACK_H #define _LINUX_SCHED_TASK_STACK_H /* * task->stack (kernel stack) handling interfaces: */ #include <linux/sched.h> #include <linux/magic.h> #include <linux/refcount.h> #include <linux/kasan.h> #ifdef CONFIG_THREAD_INFO_IN_TASK /* * When accessing the stack of a non-current task that might exit, use * try_get_task_stack() instead. task_stack_page will return a pointer * that could get freed out from under you. */ static __always_inline void *task_stack_page(const struct task_struct *task) { return task->stack; } #define setup_thread_stack(new,old) do { } while(0) static __always_inline unsigned long *end_of_stack(const struct task_struct *task) { #ifdef CONFIG_STACK_GROWSUP return (unsigned long *)((unsigned long)task->stack + THREAD_SIZE) - 1; #else return task->stack; #endif } #else #define task_stack_page(task) ((void *)(task)->stack) static inline void setup_thread_stack(struct task_struct *p, struct task_struct *org) { *task_thread_info(p) = *task_thread_info(org); task_thread_info(p)->task = p; } /* * Return the address of the last usable long on the stack. * * When the stack grows down, this is just above the thread * info struct. Going any lower will corrupt the threadinfo. * * When the stack grows up, this is the highest address. * Beyond that position, we corrupt data on the next page. 
*/ static inline unsigned long *end_of_stack(struct task_struct *p) { #ifdef CONFIG_STACK_GROWSUP return (unsigned long *)((unsigned long)task_thread_info(p) + THREAD_SIZE) - 1; #else return (unsigned long *)(task_thread_info(p) + 1); #endif } #endif #ifdef CONFIG_THREAD_INFO_IN_TASK static inline void *try_get_task_stack(struct task_struct *tsk) { return refcount_inc_not_zero(&tsk->stack_refcount) ? task_stack_page(tsk) : NULL; } extern void put_task_stack(struct task_struct *tsk); #else static inline void *try_get_task_stack(struct task_struct *tsk) { return task_stack_page(tsk); } static inline void put_task_stack(struct task_struct *tsk) {} #endif void exit_task_stack_account(struct task_struct *tsk); #define task_stack_end_corrupted(task) \ (*(end_of_stack(task)) != STACK_END_MAGIC) static inline int object_is_on_stack(const void *obj) { void *stack = task_stack_page(current); obj = kasan_reset_tag(obj); return (obj >= stack) && (obj < (stack + THREAD_SIZE)); } extern void thread_stack_cache_init(void); #ifdef CONFIG_DEBUG_STACK_USAGE unsigned long stack_not_used(struct task_struct *p); #else static inline unsigned long stack_not_used(struct task_struct *p) { return 0; } #endif extern void set_task_stack_end_magic(struct task_struct *tsk); #ifndef __HAVE_ARCH_KSTACK_END static inline int kstack_end(void *addr) { /* Reliable end of stack detection: * Some APM bios versions misalign the stack */ return !(((unsigned long)addr+sizeof(void*)-1) & (THREAD_SIZE-sizeof(void*))); } #endif #endif /* _LINUX_SCHED_TASK_STACK_H */ |
/*
 * struct serio - state of a single serio port.
 *
 * The embedded struct device is what to_serio_port() maps back from, and
 * it is where serio_get_drvdata()/serio_set_drvdata() keep the per-port
 * driver data.
 */
struct serio {
	/* NOTE(review): presumably private data of the port provider - confirm */
	void *port_data;

	char name[32];
	char phys[32];
	char firmware_id[128];

	bool manual_bind;

	struct serio_device_id id;

	/*
	 * Protects critical sections from port's interrupt handler;
	 * serio_pause_rx() takes it with spin_lock_irq() and
	 * serio_continue_rx() releases it.
	 */
	spinlock_t lock;

	/* Optional: serio_write() returns -1 when this is NULL */
	int (*write)(struct serio *, unsigned char);
	int (*open)(struct serio *);
	void (*close)(struct serio *);
	int (*start)(struct serio *);
	void (*stop)(struct serio *);

	struct serio *parent;
	/* Entry in parent->children list */
	struct list_head child_node;
	/* Head of the list that the children's child_node entries sit on */
	struct list_head children;
	/* Level of nesting in serio hierarchy */
	unsigned int depth;

	/*
	 * serio->drv is accessed from interrupt handlers; when modifying
	 * caller should acquire serio->drv_mutex and serio->lock.
	 */
	struct serio_driver *drv;
	/* Protects serio->drv so attributes can pin current driver */
	struct mutex drv_mutex;

	struct device dev;

	struct list_head node;

	/*
	 * For use by PS/2 layer when several ports share hardware and
	 * may get indigestion when exposed to concurrent access (i8042).
	 */
	struct mutex *ps2_cmd_mutex;
};
/*
 * Hand one byte to the port's write callback and return its result;
 * returns -1 when the port has no write callback.
 */
static inline int serio_write(struct serio *serio, unsigned char data)
{
	if (!serio->write)
		return -1;

	return serio->write(serio, data);
}
82 165 204 2 3 8 8 6 5 5 231 13 5 27 165 135 32 9 8 138 83 145 64 275 275 275 12 6 5 274 267 111 275 6 6 4 6 90 8 196 104 87 111 77 187 169 17 11 8 17 187 76 197 1 82 28 115 33 275 85 150 82 245 196 1 133 132 9 125 117 7 7 266 11 6 138 138 138 134 7 25 45 5 4 5 4 3 6 2 3 5 2 4 24 2 16 4 6 7 1 28 120 29 1 51 39 78 35 93 34 59 63 17 72 8 41 39 78 2 77 3 75 5 74 6 77 3 77 3 78 2 66 14 71 9 80 93 118 1 77 40 119 119 1 83 68 83 1 5 79 83 74 5 5 4 6 10 83 1 1 1 1 1 4 5 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 
409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 
909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 
1327 1328 1329 1330 1331 1332 | // SPDX-License-Identifier: GPL-2.0-only /* * net/sched/sch_netem.c Network emulator * * Many of the algorithms and ideas for this came from * NIST Net which is not copyrighted. * * Authors: Stephen Hemminger <shemminger@osdl.org> * Catalin(ux aka Dino) BOIE <catab at umbrella dot ro> */ #include <linux/mm.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/errno.h> #include <linux/skbuff.h> #include <linux/vmalloc.h> #include <linux/prandom.h> #include <linux/rtnetlink.h> #include <linux/reciprocal_div.h> #include <linux/rbtree.h> #include <net/gso.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/inet_ecn.h> #define VERSION "1.3" /* Network Emulation Queuing algorithm. ==================================== Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based Network Emulation Tool [2] Luigi Rizzo, DummyNet for FreeBSD ---------------------------------------------------------------- This started out as a simple way to delay outgoing packets to test TCP but has grown to include most of the functionality of a full blown network emulator like NISTnet. It can delay packets and add random jitter (and correlation). The random distribution can be loaded from a table as well to provide normal, Pareto, or experimental curves. Packet loss, duplication, and reordering can also be emulated. This qdisc does not do classification that can be handled in layering other disciplines. It does not need to do bandwidth control either since that can be handled by using token bucket or other rate control. Correlated Loss Generator models Added generation of correlated loss according to the "Gilbert-Elliot" model, a 4-state markov model. References: [1] NetemCLG Home http://netgroup.uniroma2.it/NetemCLG [2] S. Salsano, F. Ludovici, A. 
/* Private per-qdisc state of netem. */
struct netem_sched_data {
	/* internal t(ime)fifo qdisc uses t_root and sch->limit */
	struct rb_root t_root;

	/* a linear queue; reduces rbtree rebalancing when jitter is low */
	struct sk_buff	*t_head;
	struct sk_buff	*t_tail;

	/* packets held in the tfifo; tfifo_enqueue() bumps it for both the
	 * linear queue and the rbtree
	 */
	u32 t_len;

	/* optional qdisc for classful handling (NULL at netem init) */
	struct Qdisc	*qdisc;

	struct qdisc_watchdog watchdog;

	/* handed to tabledist() as mean (latency) and sigma (jitter) */
	s64 latency;
	s64 jitter;

	/* random drop threshold for CLG_RANDOM: 0 => none, ~0 => all */
	u32 loss;
	/* non-zero: a loss event first tries INET_ECN_set_ce() instead of dropping */
	u32 ecn;
	u32 limit;
	/* packets sent since the last reordered one; compared against gap */
	u32 counter;
	/* 0 disables reordering */
	u32 gap;
	/* thresholds compared against get_crandom() in netem_enqueue() */
	u32 duplicate;
	u32 reorder;
	u32 corrupt;
	/* non-zero enables rate emulation; divisor in packet_time_ns() */
	u64 rate;
	/* added to every packet length in packet_time_ns() */
	s32 packet_overhead;
	/* non-zero: lengths are rounded up to whole cells in packet_time_ns() */
	u32 cell_size;
	struct reciprocal_value cell_size_reciprocal;
	/* added per cell in packet_time_ns() */
	s32 cell_overhead;

	/* state of get_crandom(): last result and correlation (0 = none) */
	struct crndstate {
		u32 last;
		u32 rho;
	} delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor;

	/* prng_state feeds every prandom_u32_state() call of this qdisc */
	struct prng {
		u64 seed;
		struct rnd_state prng_state;
	} prng;

	/* optional table for tabledist(); NULL selects the uniform default */
	struct disttable *delay_dist;

	/* selects the generator used by loss_event() */
	enum  {
		CLG_RANDOM,
		CLG_4_STATES,
		CLG_GILB_ELL,
	} loss_model;

	/* values taken by clg.state in loss_4state() */
	enum {
		TX_IN_GAP_PERIOD = 1,
		TX_IN_BURST_PERIOD,
		LOST_IN_GAP_PERIOD,
		LOST_IN_BURST_PERIOD,
	} _4_state_model;

	/* values taken by clg.state in loss_gilb_ell() */
	enum {
		GOOD_STATE = 1,
		BAD_STATE,
	} GE_state_model;

	/* Correlated Loss Generation models */
	struct clgstate {
		/* state of the Markov chain */
		u8 state;

		/* 4-states and Gilbert-Elliot models */
		u32 a1;	/* p13 for 4-states or p for GE */
		u32 a2;	/* p31 for 4-states or r for GE */
		u32 a3;	/* p32 for 4-states or h for GE */
		u32 a4;	/* p14 for 4-states or 1-k for GE */
		u32 a5; /* p23 used only in 4-states */
	} clg;

	struct tc_netem_slot slot_config;
	struct slotstate {
		u64 slot_next;
		s32 packets_left;
		s32 bytes_left;
	} slot;

	struct disttable *slot_dist;
};
* * As skb->rbnode uses same storage than skb->next, skb->prev and skb->tstamp, * and skb->next & skb->prev are scratch space for a qdisc, * we save skb->tstamp value in skb->cb[] before destroying it. */ struct netem_skb_cb { u64 time_to_send; }; static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb) { /* we assume we can use skb next/prev/tstamp as storage for rb_node */ qdisc_cb_private_validate(skb, sizeof(struct netem_skb_cb)); return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data; } /* init_crandom - initialize correlated random number generator * Use entropy source for initial seed. */ static void init_crandom(struct crndstate *state, unsigned long rho) { state->rho = rho; state->last = get_random_u32(); } /* get_crandom - correlated random number generator * Next number depends on last value. * rho is scaled to avoid floating point. */ static u32 get_crandom(struct crndstate *state, struct prng *p) { u64 value, rho; unsigned long answer; struct rnd_state *s = &p->prng_state; if (!state || state->rho == 0) /* no correlation */ return prandom_u32_state(s); value = prandom_u32_state(s); rho = (u64)state->rho + 1; answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32; state->last = answer; return answer; } /* loss_4state - 4-state model loss generator * Generates losses according to the 4-state Markov chain adopted in * the GI (General and Intuitive) loss model. */ static bool loss_4state(struct netem_sched_data *q) { struct clgstate *clg = &q->clg; u32 rnd = prandom_u32_state(&q->prng.prng_state); /* * Makes a comparison between rnd and the transition * probabilities outgoing from the current state, then decides the * next state and if the next packet has to be transmitted or lost. 
* The four states correspond to: * TX_IN_GAP_PERIOD => successfully transmitted packets within a gap period * LOST_IN_GAP_PERIOD => isolated losses within a gap period * LOST_IN_BURST_PERIOD => lost packets within a burst period * TX_IN_BURST_PERIOD => successfully transmitted packets within a burst period */ switch (clg->state) { case TX_IN_GAP_PERIOD: if (rnd < clg->a4) { clg->state = LOST_IN_GAP_PERIOD; return true; } else if (clg->a4 < rnd && rnd < clg->a1 + clg->a4) { clg->state = LOST_IN_BURST_PERIOD; return true; } else if (clg->a1 + clg->a4 < rnd) { clg->state = TX_IN_GAP_PERIOD; } break; case TX_IN_BURST_PERIOD: if (rnd < clg->a5) { clg->state = LOST_IN_BURST_PERIOD; return true; } else { clg->state = TX_IN_BURST_PERIOD; } break; case LOST_IN_BURST_PERIOD: if (rnd < clg->a3) clg->state = TX_IN_BURST_PERIOD; else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) { clg->state = TX_IN_GAP_PERIOD; } else if (clg->a2 + clg->a3 < rnd) { clg->state = LOST_IN_BURST_PERIOD; return true; } break; case LOST_IN_GAP_PERIOD: clg->state = TX_IN_GAP_PERIOD; break; } return false; } /* loss_gilb_ell - Gilbert-Elliot model loss generator * Generates losses according to the Gilbert-Elliot loss model or * its special cases (Gilbert or Simple Gilbert) * * Makes a comparison between random number and the transition * probabilities outgoing from the current state, then decides the * next state. A second random number is extracted and the comparison * with the loss probability of the current state decides if the next * packet will be transmitted or lost. 
*/ static bool loss_gilb_ell(struct netem_sched_data *q) { struct clgstate *clg = &q->clg; struct rnd_state *s = &q->prng.prng_state; switch (clg->state) { case GOOD_STATE: if (prandom_u32_state(s) < clg->a1) clg->state = BAD_STATE; if (prandom_u32_state(s) < clg->a4) return true; break; case BAD_STATE: if (prandom_u32_state(s) < clg->a2) clg->state = GOOD_STATE; if (prandom_u32_state(s) > clg->a3) return true; } return false; } static bool loss_event(struct netem_sched_data *q) { switch (q->loss_model) { case CLG_RANDOM: /* Random packet drop 0 => none, ~0 => all */ return q->loss && q->loss >= get_crandom(&q->loss_cor, &q->prng); case CLG_4_STATES: /* 4state loss model algorithm (used also for GI model) * Extracts a value from the markov 4 state loss generator, * if it is 1 drops a packet and if needed writes the event in * the kernel logs */ return loss_4state(q); case CLG_GILB_ELL: /* Gilbert-Elliot loss model algorithm * Extracts a value from the Gilbert-Elliot loss generator, * if it is 1 drops a packet and if needed writes the event in * the kernel logs */ return loss_gilb_ell(q); } return false; /* not reached */ } /* tabledist - return a pseudo-randomly distributed value with mean mu and * std deviation sigma. Uses table lookup to approximate the desired * distribution, and a uniformly-distributed pseudo-random source. 
*/ static s64 tabledist(s64 mu, s32 sigma, struct crndstate *state, struct prng *prng, const struct disttable *dist) { s64 x; long t; u32 rnd; if (sigma == 0) return mu; rnd = get_crandom(state, prng); /* default uniform distribution */ if (dist == NULL) return ((rnd % (2 * (u32)sigma)) + mu) - sigma; t = dist->table[rnd % dist->size]; x = (sigma % NETEM_DIST_SCALE) * t; if (x >= 0) x += NETEM_DIST_SCALE/2; else x -= NETEM_DIST_SCALE/2; return x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu; } static u64 packet_time_ns(u64 len, const struct netem_sched_data *q) { len += q->packet_overhead; if (q->cell_size) { u32 cells = reciprocal_divide(len, q->cell_size_reciprocal); if (len > cells * q->cell_size) /* extra cell needed for remainder */ cells++; len = cells * (q->cell_size + q->cell_overhead); } return div64_u64(len * NSEC_PER_SEC, q->rate); } static void tfifo_reset(struct Qdisc *sch) { struct netem_sched_data *q = qdisc_priv(sch); struct rb_node *p = rb_first(&q->t_root); while (p) { struct sk_buff *skb = rb_to_skb(p); p = rb_next(p); rb_erase(&skb->rbnode, &q->t_root); rtnl_kfree_skbs(skb, skb); } rtnl_kfree_skbs(q->t_head, q->t_tail); q->t_head = NULL; q->t_tail = NULL; q->t_len = 0; } static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch) { struct netem_sched_data *q = qdisc_priv(sch); u64 tnext = netem_skb_cb(nskb)->time_to_send; if (!q->t_tail || tnext >= netem_skb_cb(q->t_tail)->time_to_send) { if (q->t_tail) q->t_tail->next = nskb; else q->t_head = nskb; q->t_tail = nskb; } else { struct rb_node **p = &q->t_root.rb_node, *parent = NULL; while (*p) { struct sk_buff *skb; parent = *p; skb = rb_to_skb(parent); if (tnext >= netem_skb_cb(skb)->time_to_send) p = &parent->rb_right; else p = &parent->rb_left; } rb_link_node(&nskb->rbnode, parent, p); rb_insert_color(&nskb->rbnode, &q->t_root); } q->t_len++; sch->q.qlen++; } /* netem can't properly corrupt a megapacket (like we get from GSO), so instead * when we statistically choose to 
corrupt one, we instead segment it, returning * the first packet to be corrupted, and re-enqueue the remaining frames */ static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { struct sk_buff *segs; netdev_features_t features = netif_skb_features(skb); segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); if (IS_ERR_OR_NULL(segs)) { qdisc_drop(skb, sch, to_free); return NULL; } consume_skb(skb); return segs; } /* * Insert one skb into qdisc. * Note: parent depends on return value to account for queue length. * NET_XMIT_DROP: queue length didn't change. * NET_XMIT_SUCCESS: one skb was queued. */ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { struct netem_sched_data *q = qdisc_priv(sch); /* We don't fill cb now as skb_unshare() may invalidate it */ struct netem_skb_cb *cb; struct sk_buff *skb2 = NULL; struct sk_buff *segs = NULL; unsigned int prev_len = qdisc_pkt_len(skb); int count = 1; /* Do not fool qdisc_drop_all() */ skb->prev = NULL; /* Random duplication */ if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor, &q->prng)) ++count; /* Drop packet? */ if (loss_event(q)) { if (q->ecn && INET_ECN_set_ce(skb)) qdisc_qstats_drop(sch); /* mark packet */ else --count; } if (count == 0) { qdisc_qstats_drop(sch); __qdisc_drop(skb, to_free); return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; } /* If a delay is expected, orphan the skb. (orphaning usually takes * place at TX completion time, so _before_ the link transit delay) */ if (q->latency || q->jitter || q->rate) skb_orphan_partial(skb); /* * If we need to duplicate packet, then clone it before * original is modified. */ if (count > 1) skb2 = skb_clone(skb, GFP_ATOMIC); /* * Randomized packet corruption. * Make copy if needed since we are modifying * If packet is going to be hardware checksummed, then * do it now in software before we mangle it. 
*/ if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor, &q->prng)) { if (skb_is_gso(skb)) { skb = netem_segment(skb, sch, to_free); if (!skb) goto finish_segs; segs = skb->next; skb_mark_not_on_list(skb); qdisc_skb_cb(skb)->pkt_len = skb->len; } skb = skb_unshare(skb, GFP_ATOMIC); if (unlikely(!skb)) { qdisc_qstats_drop(sch); goto finish_segs; } if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_help(skb)) { qdisc_drop(skb, sch, to_free); skb = NULL; goto finish_segs; } skb->data[get_random_u32_below(skb_headlen(skb))] ^= 1<<get_random_u32_below(8); } if (unlikely(q->t_len >= sch->limit)) { /* re-link segs, so that qdisc_drop_all() frees them all */ skb->next = segs; qdisc_drop_all(skb, sch, to_free); if (skb2) __qdisc_drop(skb2, to_free); return NET_XMIT_DROP; } /* * If doing duplication then re-insert at top of the * qdisc tree, since parent queuer expects that only one * skb will be queued. */ if (skb2) { struct Qdisc *rootq = qdisc_root_bh(sch); u32 dupsave = q->duplicate; /* prevent duplicating a dup... 
*/ q->duplicate = 0; rootq->enqueue(skb2, rootq, to_free); q->duplicate = dupsave; skb2 = NULL; } qdisc_qstats_backlog_inc(sch, skb); cb = netem_skb_cb(skb); if (q->gap == 0 || /* not doing reordering */ q->counter < q->gap - 1 || /* inside last reordering gap */ q->reorder < get_crandom(&q->reorder_cor, &q->prng)) { u64 now; s64 delay; delay = tabledist(q->latency, q->jitter, &q->delay_cor, &q->prng, q->delay_dist); now = ktime_get_ns(); if (q->rate) { struct netem_skb_cb *last = NULL; if (sch->q.tail) last = netem_skb_cb(sch->q.tail); if (q->t_root.rb_node) { struct sk_buff *t_skb; struct netem_skb_cb *t_last; t_skb = skb_rb_last(&q->t_root); t_last = netem_skb_cb(t_skb); if (!last || t_last->time_to_send > last->time_to_send) last = t_last; } if (q->t_tail) { struct netem_skb_cb *t_last = netem_skb_cb(q->t_tail); if (!last || t_last->time_to_send > last->time_to_send) last = t_last; } if (last) { /* * Last packet in queue is reference point (now), * calculate this time bonus and subtract * from delay. */ delay -= last->time_to_send - now; delay = max_t(s64, 0, delay); now = last->time_to_send; } delay += packet_time_ns(qdisc_pkt_len(skb), q); } cb->time_to_send = now + delay; ++q->counter; tfifo_enqueue(skb, sch); } else { /* * Do re-ordering by putting one out of N packets at the front * of the queue. */ cb->time_to_send = ktime_get_ns(); q->counter = 0; __qdisc_enqueue_head(skb, &sch->q); sch->qstats.requeues++; } finish_segs: if (skb2) __qdisc_drop(skb2, to_free); if (segs) { unsigned int len, last_len; int rc, nb; len = skb ? skb->len : 0; nb = skb ? 
1 : 0; while (segs) { skb2 = segs->next; skb_mark_not_on_list(segs); qdisc_skb_cb(segs)->pkt_len = segs->len; last_len = segs->len; rc = qdisc_enqueue(segs, sch, to_free); if (rc != NET_XMIT_SUCCESS) { if (net_xmit_drop_count(rc)) qdisc_qstats_drop(sch); } else { nb++; len += last_len; } segs = skb2; } /* Parent qdiscs accounted for 1 skb of size @prev_len */ qdisc_tree_reduce_backlog(sch, -(nb - 1), -(len - prev_len)); } else if (!skb) { return NET_XMIT_DROP; } return NET_XMIT_SUCCESS; } /* Delay the next round with a new future slot with a * correct number of bytes and packets. */ static void get_slot_next(struct netem_sched_data *q, u64 now) { s64 next_delay; if (!q->slot_dist) next_delay = q->slot_config.min_delay + (get_random_u32() * (q->slot_config.max_delay - q->slot_config.min_delay) >> 32); else next_delay = tabledist(q->slot_config.dist_delay, (s32)(q->slot_config.dist_jitter), NULL, &q->prng, q->slot_dist); q->slot.slot_next = now + next_delay; q->slot.packets_left = q->slot_config.max_packets; q->slot.bytes_left = q->slot_config.max_bytes; } static struct sk_buff *netem_peek(struct netem_sched_data *q) { struct sk_buff *skb = skb_rb_first(&q->t_root); u64 t1, t2; if (!skb) return q->t_head; if (!q->t_head) return skb; t1 = netem_skb_cb(skb)->time_to_send; t2 = netem_skb_cb(q->t_head)->time_to_send; if (t1 < t2) return skb; return q->t_head; } static void netem_erase_head(struct netem_sched_data *q, struct sk_buff *skb) { if (skb == q->t_head) { q->t_head = skb->next; if (!q->t_head) q->t_tail = NULL; } else { rb_erase(&skb->rbnode, &q->t_root); } } static struct sk_buff *netem_dequeue(struct Qdisc *sch) { struct netem_sched_data *q = qdisc_priv(sch); struct sk_buff *skb; tfifo_dequeue: skb = __qdisc_dequeue_head(&sch->q); if (skb) { deliver: qdisc_qstats_backlog_dec(sch, skb); qdisc_bstats_update(sch, skb); return skb; } skb = netem_peek(q); if (skb) { u64 time_to_send; u64 now = ktime_get_ns(); /* if more time remaining? 
*/ time_to_send = netem_skb_cb(skb)->time_to_send; if (q->slot.slot_next && q->slot.slot_next < time_to_send) get_slot_next(q, now); if (time_to_send <= now && q->slot.slot_next <= now) { netem_erase_head(q, skb); q->t_len--; skb->next = NULL; skb->prev = NULL; /* skb->dev shares skb->rbnode area, * we need to restore its value. */ skb->dev = qdisc_dev(sch); if (q->slot.slot_next) { q->slot.packets_left--; q->slot.bytes_left -= qdisc_pkt_len(skb); if (q->slot.packets_left <= 0 || q->slot.bytes_left <= 0) get_slot_next(q, now); } if (q->qdisc) { unsigned int pkt_len = qdisc_pkt_len(skb); struct sk_buff *to_free = NULL; int err; err = qdisc_enqueue(skb, q->qdisc, &to_free); kfree_skb_list(to_free); if (err != NET_XMIT_SUCCESS) { if (net_xmit_drop_count(err)) qdisc_qstats_drop(sch); sch->qstats.backlog -= pkt_len; sch->q.qlen--; qdisc_tree_reduce_backlog(sch, 1, pkt_len); } goto tfifo_dequeue; } sch->q.qlen--; goto deliver; } if (q->qdisc) { skb = q->qdisc->ops->dequeue(q->qdisc); if (skb) { sch->q.qlen--; goto deliver; } } qdisc_watchdog_schedule_ns(&q->watchdog, max(time_to_send, q->slot.slot_next)); } if (q->qdisc) { skb = q->qdisc->ops->dequeue(q->qdisc); if (skb) { sch->q.qlen--; goto deliver; } } return NULL; } static void netem_reset(struct Qdisc *sch) { struct netem_sched_data *q = qdisc_priv(sch); qdisc_reset_queue(sch); tfifo_reset(sch); if (q->qdisc) qdisc_reset(q->qdisc); qdisc_watchdog_cancel(&q->watchdog); } static void dist_free(struct disttable *d) { kvfree(d); } /* * Distribution data is a variable size payload containing * signed 16 bit values. 
*/ static int get_dist_table(struct disttable **tbl, const struct nlattr *attr) { size_t n = nla_len(attr)/sizeof(__s16); const __s16 *data = nla_data(attr); struct disttable *d; int i; if (!n || n > NETEM_DIST_MAX) return -EINVAL; d = kvmalloc(struct_size(d, table, n), GFP_KERNEL); if (!d) return -ENOMEM; d->size = n; for (i = 0; i < n; i++) d->table[i] = data[i]; *tbl = d; return 0; } static void get_slot(struct netem_sched_data *q, const struct nlattr *attr) { const struct tc_netem_slot *c = nla_data(attr); q->slot_config = *c; if (q->slot_config.max_packets == 0) q->slot_config.max_packets = INT_MAX; if (q->slot_config.max_bytes == 0) q->slot_config.max_bytes = INT_MAX; /* capping dist_jitter to the range acceptable by tabledist() */ q->slot_config.dist_jitter = min_t(__s64, INT_MAX, abs(q->slot_config.dist_jitter)); q->slot.packets_left = q->slot_config.max_packets; q->slot.bytes_left = q->slot_config.max_bytes; if (q->slot_config.min_delay | q->slot_config.max_delay | q->slot_config.dist_jitter) q->slot.slot_next = ktime_get_ns(); else q->slot.slot_next = 0; } static void get_correlation(struct netem_sched_data *q, const struct nlattr *attr) { const struct tc_netem_corr *c = nla_data(attr); init_crandom(&q->delay_cor, c->delay_corr); init_crandom(&q->loss_cor, c->loss_corr); init_crandom(&q->dup_cor, c->dup_corr); } static void get_reorder(struct netem_sched_data *q, const struct nlattr *attr) { const struct tc_netem_reorder *r = nla_data(attr); q->reorder = r->probability; init_crandom(&q->reorder_cor, r->correlation); } static void get_corrupt(struct netem_sched_data *q, const struct nlattr *attr) { const struct tc_netem_corrupt *r = nla_data(attr); q->corrupt = r->probability; init_crandom(&q->corrupt_cor, r->correlation); } static void get_rate(struct netem_sched_data *q, const struct nlattr *attr) { const struct tc_netem_rate *r = nla_data(attr); q->rate = r->rate; q->packet_overhead = r->packet_overhead; q->cell_size = r->cell_size; q->cell_overhead = 
r->cell_overhead; if (q->cell_size) q->cell_size_reciprocal = reciprocal_value(q->cell_size); else q->cell_size_reciprocal = (struct reciprocal_value) { 0 }; } static int get_loss_clg(struct netem_sched_data *q, const struct nlattr *attr) { const struct nlattr *la; int rem; nla_for_each_nested(la, attr, rem) { u16 type = nla_type(la); switch (type) { case NETEM_LOSS_GI: { const struct tc_netem_gimodel *gi = nla_data(la); if (nla_len(la) < sizeof(struct tc_netem_gimodel)) { pr_info("netem: incorrect gi model size\n"); return -EINVAL; } q->loss_model = CLG_4_STATES; q->clg.state = TX_IN_GAP_PERIOD; q->clg.a1 = gi->p13; q->clg.a2 = gi->p31; q->clg.a3 = gi->p32; q->clg.a4 = gi->p14; q->clg.a5 = gi->p23; break; } case NETEM_LOSS_GE: { const struct tc_netem_gemodel *ge = nla_data(la); if (nla_len(la) < sizeof(struct tc_netem_gemodel)) { pr_info("netem: incorrect ge model size\n"); return -EINVAL; } q->loss_model = CLG_GILB_ELL; q->clg.state = GOOD_STATE; q->clg.a1 = ge->p; q->clg.a2 = ge->r; q->clg.a3 = ge->h; q->clg.a4 = ge->k1; break; } default: pr_info("netem: unknown loss type %u\n", type); return -EINVAL; } } return 0; } static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = { [TCA_NETEM_CORR] = { .len = sizeof(struct tc_netem_corr) }, [TCA_NETEM_REORDER] = { .len = sizeof(struct tc_netem_reorder) }, [TCA_NETEM_CORRUPT] = { .len = sizeof(struct tc_netem_corrupt) }, [TCA_NETEM_RATE] = { .len = sizeof(struct tc_netem_rate) }, [TCA_NETEM_LOSS] = { .type = NLA_NESTED }, [TCA_NETEM_ECN] = { .type = NLA_U32 }, [TCA_NETEM_RATE64] = { .type = NLA_U64 }, [TCA_NETEM_LATENCY64] = { .type = NLA_S64 }, [TCA_NETEM_JITTER64] = { .type = NLA_S64 }, [TCA_NETEM_SLOT] = { .len = sizeof(struct tc_netem_slot) }, [TCA_NETEM_PRNG_SEED] = { .type = NLA_U64 }, }; static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla, const struct nla_policy *policy, int len) { int nested_len = nla_len(nla) - NLA_ALIGN(len); if (nested_len < 0) { pr_info("netem: invalid 
attributes len %d\n", nested_len); return -EINVAL; } if (nested_len >= nla_attr_size(0)) return nla_parse_deprecated(tb, maxtype, nla_data(nla) + NLA_ALIGN(len), nested_len, policy, NULL); memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1)); return 0; } /* Parse netlink message to set options */ static int netem_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct netem_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_NETEM_MAX + 1]; struct disttable *delay_dist = NULL; struct disttable *slot_dist = NULL; struct tc_netem_qopt *qopt; struct clgstate old_clg; int old_loss_model = CLG_RANDOM; int ret; qopt = nla_data(opt); ret = parse_attr(tb, TCA_NETEM_MAX, opt, netem_policy, sizeof(*qopt)); if (ret < 0) return ret; if (tb[TCA_NETEM_DELAY_DIST]) { ret = get_dist_table(&delay_dist, tb[TCA_NETEM_DELAY_DIST]); if (ret) goto table_free; } if (tb[TCA_NETEM_SLOT_DIST]) { ret = get_dist_table(&slot_dist, tb[TCA_NETEM_SLOT_DIST]); if (ret) goto table_free; } sch_tree_lock(sch); /* backup q->clg and q->loss_model */ old_clg = q->clg; old_loss_model = q->loss_model; if (tb[TCA_NETEM_LOSS]) { ret = get_loss_clg(q, tb[TCA_NETEM_LOSS]); if (ret) { q->loss_model = old_loss_model; q->clg = old_clg; goto unlock; } } else { q->loss_model = CLG_RANDOM; } if (delay_dist) swap(q->delay_dist, delay_dist); if (slot_dist) swap(q->slot_dist, slot_dist); sch->limit = qopt->limit; q->latency = PSCHED_TICKS2NS(qopt->latency); q->jitter = PSCHED_TICKS2NS(qopt->jitter); q->limit = qopt->limit; q->gap = qopt->gap; q->counter = 0; q->loss = qopt->loss; q->duplicate = qopt->duplicate; /* for compatibility with earlier versions. 
* if gap is set, need to assume 100% probability */ if (q->gap) q->reorder = ~0; if (tb[TCA_NETEM_CORR]) get_correlation(q, tb[TCA_NETEM_CORR]); if (tb[TCA_NETEM_REORDER]) get_reorder(q, tb[TCA_NETEM_REORDER]); if (tb[TCA_NETEM_CORRUPT]) get_corrupt(q, tb[TCA_NETEM_CORRUPT]); if (tb[TCA_NETEM_RATE]) get_rate(q, tb[TCA_NETEM_RATE]); if (tb[TCA_NETEM_RATE64]) q->rate = max_t(u64, q->rate, nla_get_u64(tb[TCA_NETEM_RATE64])); if (tb[TCA_NETEM_LATENCY64]) q->latency = nla_get_s64(tb[TCA_NETEM_LATENCY64]); if (tb[TCA_NETEM_JITTER64]) q->jitter = nla_get_s64(tb[TCA_NETEM_JITTER64]); if (tb[TCA_NETEM_ECN]) q->ecn = nla_get_u32(tb[TCA_NETEM_ECN]); if (tb[TCA_NETEM_SLOT]) get_slot(q, tb[TCA_NETEM_SLOT]); /* capping jitter to the range acceptable by tabledist() */ q->jitter = min_t(s64, abs(q->jitter), INT_MAX); if (tb[TCA_NETEM_PRNG_SEED]) q->prng.seed = nla_get_u64(tb[TCA_NETEM_PRNG_SEED]); else q->prng.seed = get_random_u64(); prandom_seed_state(&q->prng.prng_state, q->prng.seed); unlock: sch_tree_unlock(sch); table_free: dist_free(delay_dist); dist_free(slot_dist); return ret; } static int netem_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct netem_sched_data *q = qdisc_priv(sch); int ret; qdisc_watchdog_init(&q->watchdog, sch); if (!opt) return -EINVAL; q->loss_model = CLG_RANDOM; ret = netem_change(sch, opt, extack); if (ret) pr_info("netem: change failed\n"); return ret; } static void netem_destroy(struct Qdisc *sch) { struct netem_sched_data *q = qdisc_priv(sch); qdisc_watchdog_cancel(&q->watchdog); if (q->qdisc) qdisc_put(q->qdisc); dist_free(q->delay_dist); dist_free(q->slot_dist); } static int dump_loss_model(const struct netem_sched_data *q, struct sk_buff *skb) { struct nlattr *nest; nest = nla_nest_start_noflag(skb, TCA_NETEM_LOSS); if (nest == NULL) goto nla_put_failure; switch (q->loss_model) { case CLG_RANDOM: /* legacy loss model */ nla_nest_cancel(skb, nest); return 0; /* no data */ case CLG_4_STATES: { struct 
tc_netem_gimodel gi = { .p13 = q->clg.a1, .p31 = q->clg.a2, .p32 = q->clg.a3, .p14 = q->clg.a4, .p23 = q->clg.a5, }; if (nla_put(skb, NETEM_LOSS_GI, sizeof(gi), &gi)) goto nla_put_failure; break; } case CLG_GILB_ELL: { struct tc_netem_gemodel ge = { .p = q->clg.a1, .r = q->clg.a2, .h = q->clg.a3, .k1 = q->clg.a4, }; if (nla_put(skb, NETEM_LOSS_GE, sizeof(ge), &ge)) goto nla_put_failure; break; } } nla_nest_end(skb, nest); return 0; nla_put_failure: nla_nest_cancel(skb, nest); return -1; } static int netem_dump(struct Qdisc *sch, struct sk_buff *skb) { const struct netem_sched_data *q = qdisc_priv(sch); struct nlattr *nla = (struct nlattr *) skb_tail_pointer(skb); struct tc_netem_qopt qopt; struct tc_netem_corr cor; struct tc_netem_reorder reorder; struct tc_netem_corrupt corrupt; struct tc_netem_rate rate; struct tc_netem_slot slot; qopt.latency = min_t(psched_time_t, PSCHED_NS2TICKS(q->latency), UINT_MAX); qopt.jitter = min_t(psched_time_t, PSCHED_NS2TICKS(q->jitter), UINT_MAX); qopt.limit = q->limit; qopt.loss = q->loss; qopt.gap = q->gap; qopt.duplicate = q->duplicate; if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt)) goto nla_put_failure; if (nla_put(skb, TCA_NETEM_LATENCY64, sizeof(q->latency), &q->latency)) goto nla_put_failure; if (nla_put(skb, TCA_NETEM_JITTER64, sizeof(q->jitter), &q->jitter)) goto nla_put_failure; cor.delay_corr = q->delay_cor.rho; cor.loss_corr = q->loss_cor.rho; cor.dup_corr = q->dup_cor.rho; if (nla_put(skb, TCA_NETEM_CORR, sizeof(cor), &cor)) goto nla_put_failure; reorder.probability = q->reorder; reorder.correlation = q->reorder_cor.rho; if (nla_put(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder)) goto nla_put_failure; corrupt.probability = q->corrupt; corrupt.correlation = q->corrupt_cor.rho; if (nla_put(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt)) goto nla_put_failure; if (q->rate >= (1ULL << 32)) { if (nla_put_u64_64bit(skb, TCA_NETEM_RATE64, q->rate, TCA_NETEM_PAD)) goto nla_put_failure; rate.rate = ~0U; } else { 
rate.rate = q->rate; } rate.packet_overhead = q->packet_overhead; rate.cell_size = q->cell_size; rate.cell_overhead = q->cell_overhead; if (nla_put(skb, TCA_NETEM_RATE, sizeof(rate), &rate)) goto nla_put_failure; if (q->ecn && nla_put_u32(skb, TCA_NETEM_ECN, q->ecn)) goto nla_put_failure; if (dump_loss_model(q, skb) != 0) goto nla_put_failure; if (q->slot_config.min_delay | q->slot_config.max_delay | q->slot_config.dist_jitter) { slot = q->slot_config; if (slot.max_packets == INT_MAX) slot.max_packets = 0; if (slot.max_bytes == INT_MAX) slot.max_bytes = 0; if (nla_put(skb, TCA_NETEM_SLOT, sizeof(slot), &slot)) goto nla_put_failure; } if (nla_put_u64_64bit(skb, TCA_NETEM_PRNG_SEED, q->prng.seed, TCA_NETEM_PAD)) goto nla_put_failure; return nla_nest_end(skb, nla); nla_put_failure: nlmsg_trim(skb, nla); return -1; } static int netem_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb, struct tcmsg *tcm) { struct netem_sched_data *q = qdisc_priv(sch); if (cl != 1 || !q->qdisc) /* only one class */ return -ENOENT; tcm->tcm_handle |= TC_H_MIN(1); tcm->tcm_info = q->qdisc->handle; return 0; } static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, struct Qdisc **old, struct netlink_ext_ack *extack) { struct netem_sched_data *q = qdisc_priv(sch); *old = qdisc_replace(sch, new, &q->qdisc); return 0; } static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg) { struct netem_sched_data *q = qdisc_priv(sch); return q->qdisc; } static unsigned long netem_find(struct Qdisc *sch, u32 classid) { return 1; } static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker) { if (!walker->stop) { if (!tc_qdisc_stats_dump(sch, 1, walker)) return; } } static const struct Qdisc_class_ops netem_class_ops = { .graft = netem_graft, .leaf = netem_leaf, .find = netem_find, .walk = netem_walk, .dump = netem_dump_class, }; static struct Qdisc_ops netem_qdisc_ops __read_mostly = { .id = "netem", .cl_ops = &netem_class_ops, .priv_size = 
sizeof(struct netem_sched_data), .enqueue = netem_enqueue, .dequeue = netem_dequeue, .peek = qdisc_peek_dequeued, .init = netem_init, .reset = netem_reset, .destroy = netem_destroy, .change = netem_change, .dump = netem_dump, .owner = THIS_MODULE, }; MODULE_ALIAS_NET_SCH("netem"); static int __init netem_module_init(void) { pr_info("netem: version " VERSION "\n"); return register_qdisc(&netem_qdisc_ops); } static void __exit netem_module_exit(void) { unregister_qdisc(&netem_qdisc_ops); } module_init(netem_module_init) module_exit(netem_module_exit) MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Network characteristics emulator qdisc"); |
115 115 74 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 
524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 
1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Sysfs attributes of bridge * Linux ethernet bridge * * Authors: * Stephen Hemminger <shemminger@osdl.org> */ #include <linux/capability.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/if_bridge.h> #include <linux/rtnetlink.h> #include <linux/spinlock.h> #include <linux/times.h> #include <linux/sched/signal.h> #include "br_private.h" /* IMPORTANT: new bridge options must be added with netlink support only * please do not add new sysfs entries */ #define to_bridge(cd) ((struct net_bridge *)netdev_priv(to_net_dev(cd))) /* * Common code for storing bridge parameters. */ static ssize_t store_bridge_parm(struct device *d, const char *buf, size_t len, int (*set)(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack)) { struct net_bridge *br = to_bridge(d); struct netlink_ext_ack extack = {0}; unsigned long val; int err; if (!ns_capable(dev_net(br->dev)->user_ns, CAP_NET_ADMIN)) return -EPERM; err = kstrtoul(buf, 0, &val); if (err != 0) return err; if (!rtnl_trylock()) return restart_syscall(); err = (*set)(br, val, &extack); if (!err) netdev_state_change(br->dev); if (extack._msg) { if (err) br_err(br, "%s\n", extack._msg); else br_warn(br, "%s\n", extack._msg); } rtnl_unlock(); return err ? 
err : len; } static ssize_t forward_delay_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%lu\n", jiffies_to_clock_t(br->forward_delay)); } static int set_forward_delay(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { return br_set_forward_delay(br, val); } static ssize_t forward_delay_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_forward_delay); } static DEVICE_ATTR_RW(forward_delay); static ssize_t hello_time_show(struct device *d, struct device_attribute *attr, char *buf) { return sprintf(buf, "%lu\n", jiffies_to_clock_t(to_bridge(d)->hello_time)); } static int set_hello_time(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { return br_set_hello_time(br, val); } static ssize_t hello_time_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_hello_time); } static DEVICE_ATTR_RW(hello_time); static ssize_t max_age_show(struct device *d, struct device_attribute *attr, char *buf) { return sprintf(buf, "%lu\n", jiffies_to_clock_t(to_bridge(d)->max_age)); } static int set_max_age(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { return br_set_max_age(br, val); } static ssize_t max_age_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_max_age); } static DEVICE_ATTR_RW(max_age); static ssize_t ageing_time_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%lu\n", jiffies_to_clock_t(br->ageing_time)); } static int set_ageing_time(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { return br_set_ageing_time(br, val); } static ssize_t ageing_time_store(struct device *d, struct device_attribute 
*attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_ageing_time); } static DEVICE_ATTR_RW(ageing_time); static ssize_t stp_state_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%d\n", br->stp_enabled); } static int set_stp_state(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { return br_stp_set_enabled(br, val, extack); } static ssize_t stp_state_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_stp_state); } static DEVICE_ATTR_RW(stp_state); static ssize_t group_fwd_mask_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%#x\n", br->group_fwd_mask); } static int set_group_fwd_mask(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { if (val & BR_GROUPFWD_RESTRICTED) return -EINVAL; br->group_fwd_mask = val; return 0; } static ssize_t group_fwd_mask_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_group_fwd_mask); } static DEVICE_ATTR_RW(group_fwd_mask); static ssize_t priority_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%d\n", (br->bridge_id.prio[0] << 8) | br->bridge_id.prio[1]); } static int set_priority(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { br_stp_set_bridge_priority(br, (u16) val); return 0; } static ssize_t priority_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_priority); } static DEVICE_ATTR_RW(priority); static ssize_t root_id_show(struct device *d, struct device_attribute *attr, char *buf) { return br_show_bridge_id(buf, &to_bridge(d)->designated_root); } static 
DEVICE_ATTR_RO(root_id); static ssize_t bridge_id_show(struct device *d, struct device_attribute *attr, char *buf) { return br_show_bridge_id(buf, &to_bridge(d)->bridge_id); } static DEVICE_ATTR_RO(bridge_id); static ssize_t root_port_show(struct device *d, struct device_attribute *attr, char *buf) { return sprintf(buf, "%d\n", to_bridge(d)->root_port); } static DEVICE_ATTR_RO(root_port); static ssize_t root_path_cost_show(struct device *d, struct device_attribute *attr, char *buf) { return sprintf(buf, "%d\n", to_bridge(d)->root_path_cost); } static DEVICE_ATTR_RO(root_path_cost); static ssize_t topology_change_show(struct device *d, struct device_attribute *attr, char *buf) { return sprintf(buf, "%d\n", to_bridge(d)->topology_change); } static DEVICE_ATTR_RO(topology_change); static ssize_t topology_change_detected_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%d\n", br->topology_change_detected); } static DEVICE_ATTR_RO(topology_change_detected); static ssize_t hello_timer_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%ld\n", br_timer_value(&br->hello_timer)); } static DEVICE_ATTR_RO(hello_timer); static ssize_t tcn_timer_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%ld\n", br_timer_value(&br->tcn_timer)); } static DEVICE_ATTR_RO(tcn_timer); static ssize_t topology_change_timer_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%ld\n", br_timer_value(&br->topology_change_timer)); } static DEVICE_ATTR_RO(topology_change_timer); static ssize_t gc_timer_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%ld\n", br_timer_value(&br->gc_work.timer)); } static DEVICE_ATTR_RO(gc_timer); 
static ssize_t group_addr_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%pM\n", br->group_addr); } static ssize_t group_addr_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { struct net_bridge *br = to_bridge(d); u8 new_addr[6]; if (!ns_capable(dev_net(br->dev)->user_ns, CAP_NET_ADMIN)) return -EPERM; if (!mac_pton(buf, new_addr)) return -EINVAL; if (!is_link_local_ether_addr(new_addr)) return -EINVAL; if (new_addr[5] == 1 || /* 802.3x Pause address */ new_addr[5] == 2 || /* 802.3ad Slow protocols */ new_addr[5] == 3) /* 802.1X PAE address */ return -EINVAL; if (!rtnl_trylock()) return restart_syscall(); spin_lock_bh(&br->lock); ether_addr_copy(br->group_addr, new_addr); spin_unlock_bh(&br->lock); br_opt_toggle(br, BROPT_GROUP_ADDR_SET, true); br_recalculate_fwd_mask(br); netdev_state_change(br->dev); rtnl_unlock(); return len; } static DEVICE_ATTR_RW(group_addr); static int set_flush(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { struct net_bridge_fdb_flush_desc desc = { .flags_mask = BIT(BR_FDB_STATIC) }; br_fdb_flush(br, &desc); return 0; } static ssize_t flush_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_flush); } static DEVICE_ATTR_WO(flush); static ssize_t no_linklocal_learn_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%d\n", br_boolopt_get(br, BR_BOOLOPT_NO_LL_LEARN)); } static int set_no_linklocal_learn(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { return br_boolopt_toggle(br, BR_BOOLOPT_NO_LL_LEARN, !!val, extack); } static ssize_t no_linklocal_learn_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_no_linklocal_learn); } static 
DEVICE_ATTR_RW(no_linklocal_learn); #ifdef CONFIG_BRIDGE_IGMP_SNOOPING static ssize_t multicast_router_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%d\n", br->multicast_ctx.multicast_router); } static int set_multicast_router(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { return br_multicast_set_router(&br->multicast_ctx, val); } static ssize_t multicast_router_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_multicast_router); } static DEVICE_ATTR_RW(multicast_router); static ssize_t multicast_snooping_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%d\n", br_opt_get(br, BROPT_MULTICAST_ENABLED)); } static ssize_t multicast_snooping_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, br_multicast_toggle); } static DEVICE_ATTR_RW(multicast_snooping); static ssize_t multicast_query_use_ifaddr_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%d\n", br_opt_get(br, BROPT_MULTICAST_QUERY_USE_IFADDR)); } static int set_query_use_ifaddr(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { br_opt_toggle(br, BROPT_MULTICAST_QUERY_USE_IFADDR, !!val); return 0; } static ssize_t multicast_query_use_ifaddr_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_query_use_ifaddr); } static DEVICE_ATTR_RW(multicast_query_use_ifaddr); static ssize_t multicast_querier_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%d\n", br->multicast_ctx.multicast_querier); } static int set_multicast_querier(struct 
net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { return br_multicast_set_querier(&br->multicast_ctx, val); } static ssize_t multicast_querier_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_multicast_querier); } static DEVICE_ATTR_RW(multicast_querier); static ssize_t hash_elasticity_show(struct device *d, struct device_attribute *attr, char *buf) { return sprintf(buf, "%u\n", RHT_ELASTICITY); } static int set_elasticity(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { /* 16 is RHT_ELASTICITY */ NL_SET_ERR_MSG_MOD(extack, "the hash_elasticity option has been deprecated and is always 16"); return 0; } static ssize_t hash_elasticity_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_elasticity); } static DEVICE_ATTR_RW(hash_elasticity); static ssize_t hash_max_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%u\n", br->hash_max); } static int set_hash_max(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { br->hash_max = val; return 0; } static ssize_t hash_max_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_hash_max); } static DEVICE_ATTR_RW(hash_max); static ssize_t multicast_igmp_version_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%u\n", br->multicast_ctx.multicast_igmp_version); } static int set_multicast_igmp_version(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { return br_multicast_set_igmp_version(&br->multicast_ctx, val); } static ssize_t multicast_igmp_version_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, 
buf, len, set_multicast_igmp_version); } static DEVICE_ATTR_RW(multicast_igmp_version); static ssize_t multicast_last_member_count_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%u\n", br->multicast_ctx.multicast_last_member_count); } static int set_last_member_count(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { br->multicast_ctx.multicast_last_member_count = val; return 0; } static ssize_t multicast_last_member_count_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_last_member_count); } static DEVICE_ATTR_RW(multicast_last_member_count); static ssize_t multicast_startup_query_count_show( struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%u\n", br->multicast_ctx.multicast_startup_query_count); } static int set_startup_query_count(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { br->multicast_ctx.multicast_startup_query_count = val; return 0; } static ssize_t multicast_startup_query_count_store( struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_startup_query_count); } static DEVICE_ATTR_RW(multicast_startup_query_count); static ssize_t multicast_last_member_interval_show( struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%lu\n", jiffies_to_clock_t(br->multicast_ctx.multicast_last_member_interval)); } static int set_last_member_interval(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { br->multicast_ctx.multicast_last_member_interval = clock_t_to_jiffies(val); return 0; } static ssize_t multicast_last_member_interval_store( struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return 
store_bridge_parm(d, buf, len, set_last_member_interval); } static DEVICE_ATTR_RW(multicast_last_member_interval); static ssize_t multicast_membership_interval_show( struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%lu\n", jiffies_to_clock_t(br->multicast_ctx.multicast_membership_interval)); } static int set_membership_interval(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { br->multicast_ctx.multicast_membership_interval = clock_t_to_jiffies(val); return 0; } static ssize_t multicast_membership_interval_store( struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_membership_interval); } static DEVICE_ATTR_RW(multicast_membership_interval); static ssize_t multicast_querier_interval_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%lu\n", jiffies_to_clock_t(br->multicast_ctx.multicast_querier_interval)); } static int set_querier_interval(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { br->multicast_ctx.multicast_querier_interval = clock_t_to_jiffies(val); return 0; } static ssize_t multicast_querier_interval_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_querier_interval); } static DEVICE_ATTR_RW(multicast_querier_interval); static ssize_t multicast_query_interval_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%lu\n", jiffies_to_clock_t(br->multicast_ctx.multicast_query_interval)); } static int set_query_interval(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { br_multicast_set_query_intvl(&br->multicast_ctx, val); return 0; } static ssize_t multicast_query_interval_store(struct device *d, struct device_attribute 
*attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_query_interval); } static DEVICE_ATTR_RW(multicast_query_interval); static ssize_t multicast_query_response_interval_show( struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf( buf, "%lu\n", jiffies_to_clock_t(br->multicast_ctx.multicast_query_response_interval)); } static int set_query_response_interval(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { br->multicast_ctx.multicast_query_response_interval = clock_t_to_jiffies(val); return 0; } static ssize_t multicast_query_response_interval_store( struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_query_response_interval); } static DEVICE_ATTR_RW(multicast_query_response_interval); static ssize_t multicast_startup_query_interval_show( struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf( buf, "%lu\n", jiffies_to_clock_t(br->multicast_ctx.multicast_startup_query_interval)); } static int set_startup_query_interval(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { br_multicast_set_startup_query_intvl(&br->multicast_ctx, val); return 0; } static ssize_t multicast_startup_query_interval_store( struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_startup_query_interval); } static DEVICE_ATTR_RW(multicast_startup_query_interval); static ssize_t multicast_stats_enabled_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%d\n", br_opt_get(br, BROPT_MULTICAST_STATS_ENABLED)); } static int set_stats_enabled(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { br_opt_toggle(br, BROPT_MULTICAST_STATS_ENABLED, !!val); return 0; } static ssize_t 
multicast_stats_enabled_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_stats_enabled); } static DEVICE_ATTR_RW(multicast_stats_enabled); #if IS_ENABLED(CONFIG_IPV6) static ssize_t multicast_mld_version_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%u\n", br->multicast_ctx.multicast_mld_version); } static int set_multicast_mld_version(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { return br_multicast_set_mld_version(&br->multicast_ctx, val); } static ssize_t multicast_mld_version_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_multicast_mld_version); } static DEVICE_ATTR_RW(multicast_mld_version); #endif #endif #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) static ssize_t nf_call_iptables_show( struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%u\n", br_opt_get(br, BROPT_NF_CALL_IPTABLES)); } static int set_nf_call_iptables(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { br_opt_toggle(br, BROPT_NF_CALL_IPTABLES, !!val); return 0; } static ssize_t nf_call_iptables_store( struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_nf_call_iptables); } static DEVICE_ATTR_RW(nf_call_iptables); static ssize_t nf_call_ip6tables_show( struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%u\n", br_opt_get(br, BROPT_NF_CALL_IP6TABLES)); } static int set_nf_call_ip6tables(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { br_opt_toggle(br, BROPT_NF_CALL_IP6TABLES, !!val); return 0; } static ssize_t nf_call_ip6tables_store( struct device *d, struct device_attribute 
*attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_nf_call_ip6tables); } static DEVICE_ATTR_RW(nf_call_ip6tables); static ssize_t nf_call_arptables_show( struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%u\n", br_opt_get(br, BROPT_NF_CALL_ARPTABLES)); } static int set_nf_call_arptables(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { br_opt_toggle(br, BROPT_NF_CALL_ARPTABLES, !!val); return 0; } static ssize_t nf_call_arptables_store( struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_nf_call_arptables); } static DEVICE_ATTR_RW(nf_call_arptables); #endif #ifdef CONFIG_BRIDGE_VLAN_FILTERING static ssize_t vlan_filtering_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%d\n", br_opt_get(br, BROPT_VLAN_ENABLED)); } static ssize_t vlan_filtering_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, br_vlan_filter_toggle); } static DEVICE_ATTR_RW(vlan_filtering); static ssize_t vlan_protocol_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%#06x\n", ntohs(br->vlan_proto)); } static ssize_t vlan_protocol_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, br_vlan_set_proto); } static DEVICE_ATTR_RW(vlan_protocol); static ssize_t default_pvid_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%d\n", br->default_pvid); } static ssize_t default_pvid_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, br_vlan_set_default_pvid); } static 
DEVICE_ATTR_RW(default_pvid); static ssize_t vlan_stats_enabled_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%u\n", br_opt_get(br, BROPT_VLAN_STATS_ENABLED)); } static int set_vlan_stats_enabled(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { return br_vlan_set_stats(br, val); } static ssize_t vlan_stats_enabled_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_vlan_stats_enabled); } static DEVICE_ATTR_RW(vlan_stats_enabled); static ssize_t vlan_stats_per_port_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%u\n", br_opt_get(br, BROPT_VLAN_STATS_PER_PORT)); } static int set_vlan_stats_per_port(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { return br_vlan_set_stats_per_port(br, val); } static ssize_t vlan_stats_per_port_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { return store_bridge_parm(d, buf, len, set_vlan_stats_per_port); } static DEVICE_ATTR_RW(vlan_stats_per_port); #endif static struct attribute *bridge_attrs[] = { &dev_attr_forward_delay.attr, &dev_attr_hello_time.attr, &dev_attr_max_age.attr, &dev_attr_ageing_time.attr, &dev_attr_stp_state.attr, &dev_attr_group_fwd_mask.attr, &dev_attr_priority.attr, &dev_attr_bridge_id.attr, &dev_attr_root_id.attr, &dev_attr_root_path_cost.attr, &dev_attr_root_port.attr, &dev_attr_topology_change.attr, &dev_attr_topology_change_detected.attr, &dev_attr_hello_timer.attr, &dev_attr_tcn_timer.attr, &dev_attr_topology_change_timer.attr, &dev_attr_gc_timer.attr, &dev_attr_group_addr.attr, &dev_attr_flush.attr, &dev_attr_no_linklocal_learn.attr, #ifdef CONFIG_BRIDGE_IGMP_SNOOPING &dev_attr_multicast_router.attr, &dev_attr_multicast_snooping.attr, &dev_attr_multicast_querier.attr, 
&dev_attr_multicast_query_use_ifaddr.attr, &dev_attr_hash_elasticity.attr, &dev_attr_hash_max.attr, &dev_attr_multicast_last_member_count.attr, &dev_attr_multicast_startup_query_count.attr, &dev_attr_multicast_last_member_interval.attr, &dev_attr_multicast_membership_interval.attr, &dev_attr_multicast_querier_interval.attr, &dev_attr_multicast_query_interval.attr, &dev_attr_multicast_query_response_interval.attr, &dev_attr_multicast_startup_query_interval.attr, &dev_attr_multicast_stats_enabled.attr, &dev_attr_multicast_igmp_version.attr, #if IS_ENABLED(CONFIG_IPV6) &dev_attr_multicast_mld_version.attr, #endif #endif #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) &dev_attr_nf_call_iptables.attr, &dev_attr_nf_call_ip6tables.attr, &dev_attr_nf_call_arptables.attr, #endif #ifdef CONFIG_BRIDGE_VLAN_FILTERING &dev_attr_vlan_filtering.attr, &dev_attr_vlan_protocol.attr, &dev_attr_default_pvid.attr, &dev_attr_vlan_stats_enabled.attr, &dev_attr_vlan_stats_per_port.attr, #endif NULL }; static const struct attribute_group bridge_group = { .name = SYSFS_BRIDGE_ATTR, .attrs = bridge_attrs, }; /* * Export the forwarding information table as a binary file * The records are struct __fdb_entry. * * Returns the number of bytes read. */ static ssize_t brforward_read(struct file *filp, struct kobject *kobj, const struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { struct device *dev = kobj_to_dev(kobj); struct net_bridge *br = to_bridge(dev); int n; /* must read whole records */ if (off % sizeof(struct __fdb_entry) != 0) return -EINVAL; n = br_fdb_fillbuf(br, buf, count / sizeof(struct __fdb_entry), off / sizeof(struct __fdb_entry)); if (n > 0) n *= sizeof(struct __fdb_entry); return n; } static const struct bin_attribute bridge_forward = { .attr = { .name = SYSFS_BRIDGE_FDB, .mode = 0444, }, .read_new = brforward_read, }; /* * Add entries in sysfs onto the existing network class device * for the bridge. * Adds a attribute group "bridge" containing tuning parameters. 
* Binary attribute containing the forward table * Sub directory to hold links to interfaces. * * Note: the ifobj exists only to be a subdirectory * to hold links. The ifobj exists in same data structure * as it's parent the bridge so reference counting works. */ int br_sysfs_addbr(struct net_device *dev) { struct kobject *brobj = &dev->dev.kobj; struct net_bridge *br = netdev_priv(dev); int err; err = sysfs_create_group(brobj, &bridge_group); if (err) { pr_info("%s: can't create group %s/%s\n", __func__, dev->name, bridge_group.name); goto out1; } err = sysfs_create_bin_file(brobj, &bridge_forward); if (err) { pr_info("%s: can't create attribute file %s/%s\n", __func__, dev->name, bridge_forward.attr.name); goto out2; } br->ifobj = kobject_create_and_add(SYSFS_BRIDGE_PORT_SUBDIR, brobj); if (!br->ifobj) { pr_info("%s: can't add kobject (directory) %s/%s\n", __func__, dev->name, SYSFS_BRIDGE_PORT_SUBDIR); err = -ENOMEM; goto out3; } return 0; out3: sysfs_remove_bin_file(&dev->dev.kobj, &bridge_forward); out2: sysfs_remove_group(&dev->dev.kobj, &bridge_group); out1: return err; } void br_sysfs_delbr(struct net_device *dev) { struct kobject *kobj = &dev->dev.kobj; struct net_bridge *br = netdev_priv(dev); kobject_put(br->ifobj); sysfs_remove_bin_file(kobj, &bridge_forward); sysfs_remove_group(kobj, &bridge_group); } |
1 175 1853 527 25900 576 6 554 15 4 508 5811 3216 3229 27177 25301 25410 5047 596 600 604 604 2749 2749 47 175 176 2135 2135 2137 2137 185 154 95 303 4 304 280 301 1253 1250 1084 185 1216 504 552 52 69 90 181 231 55 178 560 558 1048 1048 17 17 4 5 59 5 232 17 156 58 16 15 14 6 11 16 140 140 340 16 199 140 123 408 259 9 49 136 138 138 17 130 137 137 14 14 3 49 49 223 64 14 161 49 5 63 20945 36 2289 5821 3801 7 1727 724 2338 2320 16 69 20958 17 20958 25578 21042 77 5453 8 134 1911 258 5336 25252 2662 25 23187 65 64 1 123 123 123 70 1 123 1 1 1 1 1 1 167 168 69 159 292 168 1 123 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 
381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 
881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 
1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 
1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 | // SPDX-License-Identifier: GPL-2.0-only #include <linux/export.h> #include <linux/bvec.h> #include <linux/fault-inject-usercopy.h> #include <linux/uio.h> #include <linux/pagemap.h> #include <linux/highmem.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/splice.h> #include <linux/compat.h> #include <linux/scatterlist.h> #include <linux/instrumented.h> #include <linux/iov_iter.h> static __always_inline size_t copy_to_user_iter(void __user *iter_to, size_t progress, size_t len, void *from, void *priv2) { if (should_fail_usercopy()) return len; if (access_ok(iter_to, len)) { from += progress; instrument_copy_to_user(iter_to, from, len); len = raw_copy_to_user(iter_to, from, len); } return len; } static __always_inline size_t copy_to_user_iter_nofault(void __user *iter_to, size_t progress, size_t len, void *from, void 
*priv2) { ssize_t res; if (should_fail_usercopy()) return len; from += progress; res = copy_to_user_nofault(iter_to, from, len); return res < 0 ? len : res; } static __always_inline size_t copy_from_user_iter(void __user *iter_from, size_t progress, size_t len, void *to, void *priv2) { size_t res = len; if (should_fail_usercopy()) return len; if (access_ok(iter_from, len)) { to += progress; instrument_copy_from_user_before(to, iter_from, len); res = raw_copy_from_user(to, iter_from, len); instrument_copy_from_user_after(to, iter_from, len, res); } return res; } static __always_inline size_t memcpy_to_iter(void *iter_to, size_t progress, size_t len, void *from, void *priv2) { memcpy(iter_to, from + progress, len); return 0; } static __always_inline size_t memcpy_from_iter(void *iter_from, size_t progress, size_t len, void *to, void *priv2) { memcpy(to + progress, iter_from, len); return 0; } /* * fault_in_iov_iter_readable - fault in iov iterator for reading * @i: iterator * @size: maximum length * * Fault in one or more iovecs of the given iov_iter, to a maximum length of * @size. For each iovec, fault in each page that constitutes the iovec. * * Returns the number of bytes not faulted in (like copy_to_user() and * copy_from_user()). * * Always returns 0 for non-userspace iterators. 
*/
size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t size)
{
	const struct iovec *seg;
	size_t want, beyond, off;

	/* Single user buffer: one fault-in call covers the whole request. */
	if (iter_is_ubuf(i)) {
		size_t missed;

		want = min(size, iov_iter_count(i));
		missed = fault_in_readable(i->ubuf + i->iov_offset, want);
		return size - (want - missed);
	}

	/* Only ubuf and iovec iterators are handled; everything else reports 0. */
	if (!iter_is_iovec(i))
		return 0;

	want = min(size, iov_iter_count(i));
	/* Bytes requested beyond the iterator's count are never faulted in. */
	beyond = size - want;

	seg = iter_iov(i);
	off = i->iov_offset;	/* only the first segment is entered part-way */
	while (want) {
		size_t chunk = min(want, seg->iov_len - off);

		/* Nothing left in this segment: just step to the next one. */
		if (likely(chunk)) {
			size_t missed = fault_in_readable(seg->iov_base + off,
							  chunk);

			want -= chunk - missed;
			/* Stop at the first segment that was not fully faulted in. */
			if (missed)
				break;
		}
		seg++;
		off = 0;
	}
	return want + beyond;
}
EXPORT_SYMBOL(fault_in_iov_iter_readable);

/*
 * fault_in_iov_iter_writeable - fault in iov iterator for writing
 * @i: iterator
 * @size: maximum length
 *
 * Faults in the iterator using get_user_pages(), i.e., without triggering
 * hardware page faults. This is primarily useful when we already know that
 * some or all of the pages in @i aren't in memory.
 *
 * Returns the number of bytes not faulted in, like copy_to_user() and
 * copy_from_user().
 *
 * Always returns 0 for non-user-space iterators.
*/ size_t fault_in_iov_iter_writeable(const struct iov_iter *i, size_t size) { if (iter_is_ubuf(i)) { size_t n = min(size, iov_iter_count(i)); n -= fault_in_safe_writeable(i->ubuf + i->iov_offset, n); return size - n; } else if (iter_is_iovec(i)) { size_t count = min(size, iov_iter_count(i)); const struct iovec *p; size_t skip; size -= count; for (p = iter_iov(i), skip = i->iov_offset; count; p++, skip = 0) { size_t len = min(count, p->iov_len - skip); size_t ret; if (unlikely(!len)) continue; ret = fault_in_safe_writeable(p->iov_base + skip, len); count -= len - ret; if (ret) break; } return count + size; } return 0; } EXPORT_SYMBOL(fault_in_iov_iter_writeable); void iov_iter_init(struct iov_iter *i, unsigned int direction, const struct iovec *iov, unsigned long nr_segs, size_t count) { WARN_ON(direction & ~(READ | WRITE)); *i = (struct iov_iter) { .iter_type = ITER_IOVEC, .nofault = false, .data_source = direction, .__iov = iov, .nr_segs = nr_segs, .iov_offset = 0, .count = count }; } EXPORT_SYMBOL(iov_iter_init); size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i) { if (WARN_ON_ONCE(i->data_source)) return 0; if (user_backed_iter(i)) might_fault(); return iterate_and_advance(i, bytes, (void *)addr, copy_to_user_iter, memcpy_to_iter); } EXPORT_SYMBOL(_copy_to_iter); #ifdef CONFIG_ARCH_HAS_COPY_MC static __always_inline size_t copy_to_user_iter_mc(void __user *iter_to, size_t progress, size_t len, void *from, void *priv2) { if (access_ok(iter_to, len)) { from += progress; instrument_copy_to_user(iter_to, from, len); len = copy_mc_to_user(iter_to, from, len); } return len; } static __always_inline size_t memcpy_to_iter_mc(void *iter_to, size_t progress, size_t len, void *from, void *priv2) { return copy_mc_to_kernel(iter_to, from + progress, len); } /** * _copy_mc_to_iter - copy to iter with source memory error exception handling * @addr: source kernel address * @bytes: total transfer length * @i: destination iterator * * The pmem driver 
deploys this for the dax operation * (dax_copy_to_iter()) for dax reads (bypass page-cache and the * block-layer). Upon #MC read(2) aborts and returns EIO or the bytes * successfully copied. * * The main differences between this and typical _copy_to_iter(). * * * Typical tail/residue handling after a fault retries the copy * byte-by-byte until the fault happens again. Re-triggering machine * checks is potentially fatal so the implementation uses source * alignment and poison alignment assumptions to avoid re-triggering * hardware exceptions. * * * ITER_KVEC and ITER_BVEC can return short copies. Compare to * copy_to_iter() where only ITER_IOVEC attempts might return a short copy. * * Return: number of bytes copied (may be %0) */ size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i) { if (WARN_ON_ONCE(i->data_source)) return 0; if (user_backed_iter(i)) might_fault(); return iterate_and_advance(i, bytes, (void *)addr, copy_to_user_iter_mc, memcpy_to_iter_mc); } EXPORT_SYMBOL_GPL(_copy_mc_to_iter); #endif /* CONFIG_ARCH_HAS_COPY_MC */ static __always_inline size_t __copy_from_iter(void *addr, size_t bytes, struct iov_iter *i) { return iterate_and_advance(i, bytes, addr, copy_from_user_iter, memcpy_from_iter); } size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i) { if (WARN_ON_ONCE(!i->data_source)) return 0; if (user_backed_iter(i)) might_fault(); return __copy_from_iter(addr, bytes, i); } EXPORT_SYMBOL(_copy_from_iter); static __always_inline size_t copy_from_user_iter_nocache(void __user *iter_from, size_t progress, size_t len, void *to, void *priv2) { return __copy_from_user_inatomic_nocache(to + progress, iter_from, len); } size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i) { if (WARN_ON_ONCE(!i->data_source)) return 0; return iterate_and_advance(i, bytes, addr, copy_from_user_iter_nocache, memcpy_from_iter); } EXPORT_SYMBOL(_copy_from_iter_nocache); #ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE static 
__always_inline size_t copy_from_user_iter_flushcache(void __user *iter_from, size_t progress, size_t len, void *to, void *priv2) { return __copy_from_user_flushcache(to + progress, iter_from, len); } static __always_inline size_t memcpy_from_iter_flushcache(void *iter_from, size_t progress, size_t len, void *to, void *priv2) { memcpy_flushcache(to + progress, iter_from, len); return 0; } /** * _copy_from_iter_flushcache - write destination through cpu cache * @addr: destination kernel address * @bytes: total transfer length * @i: source iterator * * The pmem driver arranges for filesystem-dax to use this facility via * dax_copy_from_iter() for ensuring that writes to persistent memory * are flushed through the CPU cache. It is differentiated from * _copy_from_iter_nocache() in that guarantees all data is flushed for * all iterator types. The _copy_from_iter_nocache() only attempts to * bypass the cache for the ITER_IOVEC case, and on some archs may use * instructions that strand dirty-data in the cache. * * Return: number of bytes copied (may be %0) */ size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i) { if (WARN_ON_ONCE(!i->data_source)) return 0; return iterate_and_advance(i, bytes, addr, copy_from_user_iter_flushcache, memcpy_from_iter_flushcache); } EXPORT_SYMBOL_GPL(_copy_from_iter_flushcache); #endif static inline bool page_copy_sane(struct page *page, size_t offset, size_t n) { struct page *head; size_t v = n + offset; /* * The general case needs to access the page order in order * to compute the page size. * However, we mostly deal with order-0 pages and thus can * avoid a possible cache line miss for requests that fit all * page orders. 
*/ if (n <= v && v <= PAGE_SIZE) return true; head = compound_head(page); v += (page - head) << PAGE_SHIFT; if (WARN_ON(n > v || v > page_size(head))) return false; return true; } size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, struct iov_iter *i) { size_t res = 0; if (!page_copy_sane(page, offset, bytes)) return 0; if (WARN_ON_ONCE(i->data_source)) return 0; page += offset / PAGE_SIZE; // first subpage offset %= PAGE_SIZE; while (1) { void *kaddr = kmap_local_page(page); size_t n = min(bytes, (size_t)PAGE_SIZE - offset); n = _copy_to_iter(kaddr + offset, n, i); kunmap_local(kaddr); res += n; bytes -= n; if (!bytes || !n) break; offset += n; if (offset == PAGE_SIZE) { page++; offset = 0; } } return res; } EXPORT_SYMBOL(copy_page_to_iter); size_t copy_page_to_iter_nofault(struct page *page, unsigned offset, size_t bytes, struct iov_iter *i) { size_t res = 0; if (!page_copy_sane(page, offset, bytes)) return 0; if (WARN_ON_ONCE(i->data_source)) return 0; page += offset / PAGE_SIZE; // first subpage offset %= PAGE_SIZE; while (1) { void *kaddr = kmap_local_page(page); size_t n = min(bytes, (size_t)PAGE_SIZE - offset); n = iterate_and_advance(i, n, kaddr + offset, copy_to_user_iter_nofault, memcpy_to_iter); kunmap_local(kaddr); res += n; bytes -= n; if (!bytes || !n) break; offset += n; if (offset == PAGE_SIZE) { page++; offset = 0; } } return res; } EXPORT_SYMBOL(copy_page_to_iter_nofault); size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes, struct iov_iter *i) { size_t res = 0; if (!page_copy_sane(page, offset, bytes)) return 0; page += offset / PAGE_SIZE; // first subpage offset %= PAGE_SIZE; while (1) { void *kaddr = kmap_local_page(page); size_t n = min(bytes, (size_t)PAGE_SIZE - offset); n = _copy_from_iter(kaddr + offset, n, i); kunmap_local(kaddr); res += n; bytes -= n; if (!bytes || !n) break; offset += n; if (offset == PAGE_SIZE) { page++; offset = 0; } } return res; } EXPORT_SYMBOL(copy_page_from_iter); static 
__always_inline size_t zero_to_user_iter(void __user *iter_to, size_t progress, size_t len, void *priv, void *priv2) { return clear_user(iter_to, len); } static __always_inline size_t zero_to_iter(void *iter_to, size_t progress, size_t len, void *priv, void *priv2) { memset(iter_to, 0, len); return 0; } size_t iov_iter_zero(size_t bytes, struct iov_iter *i) { return iterate_and_advance(i, bytes, NULL, zero_to_user_iter, zero_to_iter); } EXPORT_SYMBOL(iov_iter_zero); size_t copy_page_from_iter_atomic(struct page *page, size_t offset, size_t bytes, struct iov_iter *i) { size_t n, copied = 0; bool uses_kmap = IS_ENABLED(CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP) || PageHighMem(page); if (!page_copy_sane(page, offset, bytes)) return 0; if (WARN_ON_ONCE(!i->data_source)) return 0; do { char *p; n = bytes - copied; if (uses_kmap) { page += offset / PAGE_SIZE; offset %= PAGE_SIZE; n = min_t(size_t, n, PAGE_SIZE - offset); } p = kmap_atomic(page) + offset; n = __copy_from_iter(p, n, i); kunmap_atomic(p); copied += n; offset += n; } while (uses_kmap && copied != bytes && n > 0); return copied; } EXPORT_SYMBOL(copy_page_from_iter_atomic); static void iov_iter_bvec_advance(struct iov_iter *i, size_t size) { const struct bio_vec *bvec, *end; if (!i->count) return; i->count -= size; size += i->iov_offset; for (bvec = i->bvec, end = bvec + i->nr_segs; bvec < end; bvec++) { if (likely(size < bvec->bv_len)) break; size -= bvec->bv_len; } i->iov_offset = size; i->nr_segs -= bvec - i->bvec; i->bvec = bvec; } static void iov_iter_iovec_advance(struct iov_iter *i, size_t size) { const struct iovec *iov, *end; if (!i->count) return; i->count -= size; size += i->iov_offset; // from beginning of current segment for (iov = iter_iov(i), end = iov + i->nr_segs; iov < end; iov++) { if (likely(size < iov->iov_len)) break; size -= iov->iov_len; } i->iov_offset = size; i->nr_segs -= iov - iter_iov(i); i->__iov = iov; } static void iov_iter_folioq_advance(struct iov_iter *i, size_t size) { const struct 
folio_queue *folioq = i->folioq; unsigned int slot = i->folioq_slot; if (!i->count) return; i->count -= size; if (slot >= folioq_nr_slots(folioq)) { folioq = folioq->next; slot = 0; } size += i->iov_offset; /* From beginning of current segment. */ do { size_t fsize = folioq_folio_size(folioq, slot); if (likely(size < fsize)) break; size -= fsize; slot++; if (slot >= folioq_nr_slots(folioq) && folioq->next) { folioq = folioq->next; slot = 0; } } while (size); i->iov_offset = size; i->folioq_slot = slot; i->folioq = folioq; } void iov_iter_advance(struct iov_iter *i, size_t size) { if (unlikely(i->count < size)) size = i->count; if (likely(iter_is_ubuf(i)) || unlikely(iov_iter_is_xarray(i))) { i->iov_offset += size; i->count -= size; } else if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) { /* iovec and kvec have identical layouts */ iov_iter_iovec_advance(i, size); } else if (iov_iter_is_bvec(i)) { iov_iter_bvec_advance(i, size); } else if (iov_iter_is_folioq(i)) { iov_iter_folioq_advance(i, size); } else if (iov_iter_is_discard(i)) { i->count -= size; } } EXPORT_SYMBOL(iov_iter_advance); static void iov_iter_folioq_revert(struct iov_iter *i, size_t unroll) { const struct folio_queue *folioq = i->folioq; unsigned int slot = i->folioq_slot; for (;;) { size_t fsize; if (slot == 0) { folioq = folioq->prev; slot = folioq_nr_slots(folioq); } slot--; fsize = folioq_folio_size(folioq, slot); if (unroll <= fsize) { i->iov_offset = fsize - unroll; break; } unroll -= fsize; } i->folioq_slot = slot; i->folioq = folioq; } void iov_iter_revert(struct iov_iter *i, size_t unroll) { if (!unroll) return; if (WARN_ON(unroll > MAX_RW_COUNT)) return; i->count += unroll; if (unlikely(iov_iter_is_discard(i))) return; if (unroll <= i->iov_offset) { i->iov_offset -= unroll; return; } unroll -= i->iov_offset; if (iov_iter_is_xarray(i) || iter_is_ubuf(i)) { BUG(); /* We should never go beyond the start of the specified * range since we might then be straying into pages that * aren't 
pinned. */ } else if (iov_iter_is_bvec(i)) { const struct bio_vec *bvec = i->bvec; while (1) { size_t n = (--bvec)->bv_len; i->nr_segs++; if (unroll <= n) { i->bvec = bvec; i->iov_offset = n - unroll; return; } unroll -= n; } } else if (iov_iter_is_folioq(i)) { i->iov_offset = 0; iov_iter_folioq_revert(i, unroll); } else { /* same logics for iovec and kvec */ const struct iovec *iov = iter_iov(i); while (1) { size_t n = (--iov)->iov_len; i->nr_segs++; if (unroll <= n) { i->__iov = iov; i->iov_offset = n - unroll; return; } unroll -= n; } } } EXPORT_SYMBOL(iov_iter_revert); /* * Return the count of just the current iov_iter segment. */ size_t iov_iter_single_seg_count(const struct iov_iter *i) { if (i->nr_segs > 1) { if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) return min(i->count, iter_iov(i)->iov_len - i->iov_offset); if (iov_iter_is_bvec(i)) return min(i->count, i->bvec->bv_len - i->iov_offset); } if (unlikely(iov_iter_is_folioq(i))) return !i->count ? 0 : umin(folioq_folio_size(i->folioq, i->folioq_slot), i->count); return i->count; } EXPORT_SYMBOL(iov_iter_single_seg_count); void iov_iter_kvec(struct iov_iter *i, unsigned int direction, const struct kvec *kvec, unsigned long nr_segs, size_t count) { WARN_ON(direction & ~(READ | WRITE)); *i = (struct iov_iter){ .iter_type = ITER_KVEC, .data_source = direction, .kvec = kvec, .nr_segs = nr_segs, .iov_offset = 0, .count = count }; } EXPORT_SYMBOL(iov_iter_kvec); void iov_iter_bvec(struct iov_iter *i, unsigned int direction, const struct bio_vec *bvec, unsigned long nr_segs, size_t count) { WARN_ON(direction & ~(READ | WRITE)); *i = (struct iov_iter){ .iter_type = ITER_BVEC, .data_source = direction, .bvec = bvec, .nr_segs = nr_segs, .iov_offset = 0, .count = count }; } EXPORT_SYMBOL(iov_iter_bvec); /** * iov_iter_folio_queue - Initialise an I/O iterator to use the folios in a folio queue * @i: The iterator to initialise. * @direction: The direction of the transfer. 
* @folioq: The starting point in the folio queue. * @first_slot: The first slot in the folio queue to use * @offset: The offset into the folio in the first slot to start at * @count: The size of the I/O buffer in bytes. * * Set up an I/O iterator to either draw data out of the pages attached to an * inode or to inject data into those pages. The pages *must* be prevented * from evaporation, either by taking a ref on them or locking them by the * caller. */ void iov_iter_folio_queue(struct iov_iter *i, unsigned int direction, const struct folio_queue *folioq, unsigned int first_slot, unsigned int offset, size_t count) { BUG_ON(direction & ~1); *i = (struct iov_iter) { .iter_type = ITER_FOLIOQ, .data_source = direction, .folioq = folioq, .folioq_slot = first_slot, .count = count, .iov_offset = offset, }; } EXPORT_SYMBOL(iov_iter_folio_queue); /** * iov_iter_xarray - Initialise an I/O iterator to use the pages in an xarray * @i: The iterator to initialise. * @direction: The direction of the transfer. * @xarray: The xarray to access. * @start: The start file position. * @count: The size of the I/O buffer in bytes. * * Set up an I/O iterator to either draw data out of the pages attached to an * inode or to inject data into those pages. The pages *must* be prevented * from evaporation, either by taking a ref on them or locking them by the * caller. */ void iov_iter_xarray(struct iov_iter *i, unsigned int direction, struct xarray *xarray, loff_t start, size_t count) { BUG_ON(direction & ~1); *i = (struct iov_iter) { .iter_type = ITER_XARRAY, .data_source = direction, .xarray = xarray, .xarray_start = start, .count = count, .iov_offset = 0 }; } EXPORT_SYMBOL(iov_iter_xarray); /** * iov_iter_discard - Initialise an I/O iterator that discards data * @i: The iterator to initialise. * @direction: The direction of the transfer. * @count: The size of the I/O buffer in bytes. * * Set up an I/O iterator that just discards everything that's written to it. 
* It's only available as a READ iterator. */ void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count) { BUG_ON(direction != READ); *i = (struct iov_iter){ .iter_type = ITER_DISCARD, .data_source = false, .count = count, .iov_offset = 0 }; } EXPORT_SYMBOL(iov_iter_discard); static bool iov_iter_aligned_iovec(const struct iov_iter *i, unsigned addr_mask, unsigned len_mask) { const struct iovec *iov = iter_iov(i); size_t size = i->count; size_t skip = i->iov_offset; do { size_t len = iov->iov_len - skip; if (len > size) len = size; if (len & len_mask) return false; if ((unsigned long)(iov->iov_base + skip) & addr_mask) return false; iov++; size -= len; skip = 0; } while (size); return true; } static bool iov_iter_aligned_bvec(const struct iov_iter *i, unsigned addr_mask, unsigned len_mask) { const struct bio_vec *bvec = i->bvec; unsigned skip = i->iov_offset; size_t size = i->count; do { size_t len = bvec->bv_len; if (len > size) len = size; if (len & len_mask) return false; if ((unsigned long)(bvec->bv_offset + skip) & addr_mask) return false; bvec++; size -= len; skip = 0; } while (size); return true; } /** * iov_iter_is_aligned() - Check if the addresses and lengths of each segments * are aligned to the parameters. 
* * @i: &struct iov_iter to restore * @addr_mask: bit mask to check against the iov element's addresses * @len_mask: bit mask to check against the iov element's lengths * * Return: false if any addresses or lengths intersect with the provided masks */ bool iov_iter_is_aligned(const struct iov_iter *i, unsigned addr_mask, unsigned len_mask) { if (likely(iter_is_ubuf(i))) { if (i->count & len_mask) return false; if ((unsigned long)(i->ubuf + i->iov_offset) & addr_mask) return false; return true; } if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) return iov_iter_aligned_iovec(i, addr_mask, len_mask); if (iov_iter_is_bvec(i)) return iov_iter_aligned_bvec(i, addr_mask, len_mask); /* With both xarray and folioq types, we're dealing with whole folios. */ if (iov_iter_is_xarray(i)) { if (i->count & len_mask) return false; if ((i->xarray_start + i->iov_offset) & addr_mask) return false; } if (iov_iter_is_folioq(i)) { if (i->count & len_mask) return false; if (i->iov_offset & addr_mask) return false; } return true; } EXPORT_SYMBOL_GPL(iov_iter_is_aligned); static unsigned long iov_iter_alignment_iovec(const struct iov_iter *i) { const struct iovec *iov = iter_iov(i); unsigned long res = 0; size_t size = i->count; size_t skip = i->iov_offset; do { size_t len = iov->iov_len - skip; if (len) { res |= (unsigned long)iov->iov_base + skip; if (len > size) len = size; res |= len; size -= len; } iov++; skip = 0; } while (size); return res; } static unsigned long iov_iter_alignment_bvec(const struct iov_iter *i) { const struct bio_vec *bvec = i->bvec; unsigned res = 0; size_t size = i->count; unsigned skip = i->iov_offset; do { size_t len = bvec->bv_len - skip; res |= (unsigned long)bvec->bv_offset + skip; if (len > size) len = size; res |= len; bvec++; size -= len; skip = 0; } while (size); return res; } unsigned long iov_iter_alignment(const struct iov_iter *i) { if (likely(iter_is_ubuf(i))) { size_t size = i->count; if (size) return ((unsigned long)i->ubuf + i->iov_offset) | 
size; return 0; } /* iovec and kvec have identical layouts */ if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) return iov_iter_alignment_iovec(i); if (iov_iter_is_bvec(i)) return iov_iter_alignment_bvec(i); /* With both xarray and folioq types, we're dealing with whole folios. */ if (iov_iter_is_folioq(i)) return i->iov_offset | i->count; if (iov_iter_is_xarray(i)) return (i->xarray_start + i->iov_offset) | i->count; return 0; } EXPORT_SYMBOL(iov_iter_alignment); unsigned long iov_iter_gap_alignment(const struct iov_iter *i) { unsigned long res = 0; unsigned long v = 0; size_t size = i->count; unsigned k; if (iter_is_ubuf(i)) return 0; if (WARN_ON(!iter_is_iovec(i))) return ~0U; for (k = 0; k < i->nr_segs; k++) { const struct iovec *iov = iter_iov(i) + k; if (iov->iov_len) { unsigned long base = (unsigned long)iov->iov_base; if (v) // if not the first one res |= base | v; // this start | previous end v = base + iov->iov_len; if (size <= iov->iov_len) break; size -= iov->iov_len; } } return res; } EXPORT_SYMBOL(iov_iter_gap_alignment); static int want_pages_array(struct page ***res, size_t size, size_t start, unsigned int maxpages) { unsigned int count = DIV_ROUND_UP(size + start, PAGE_SIZE); if (count > maxpages) count = maxpages; WARN_ON(!count); // caller should've prevented that if (!*res) { *res = kvmalloc_array(count, sizeof(struct page *), GFP_KERNEL); if (!*res) return 0; } return count; } static ssize_t iter_folioq_get_pages(struct iov_iter *iter, struct page ***ppages, size_t maxsize, unsigned maxpages, size_t *_start_offset) { const struct folio_queue *folioq = iter->folioq; struct page **pages; unsigned int slot = iter->folioq_slot; size_t extracted = 0, count = iter->count, iov_offset = iter->iov_offset; if (slot >= folioq_nr_slots(folioq)) { folioq = folioq->next; slot = 0; if (WARN_ON(iov_offset != 0)) return -EIO; } maxpages = want_pages_array(ppages, maxsize, iov_offset & ~PAGE_MASK, maxpages); if (!maxpages) return -ENOMEM; *_start_offset = 
iov_offset & ~PAGE_MASK; pages = *ppages; for (;;) { struct folio *folio = folioq_folio(folioq, slot); size_t offset = iov_offset, fsize = folioq_folio_size(folioq, slot); size_t part = PAGE_SIZE - offset % PAGE_SIZE; if (offset < fsize) { part = umin(part, umin(maxsize - extracted, fsize - offset)); count -= part; iov_offset += part; extracted += part; *pages = folio_page(folio, offset / PAGE_SIZE); get_page(*pages); pages++; maxpages--; } if (maxpages == 0 || extracted >= maxsize) break; if (iov_offset >= fsize) { iov_offset = 0; slot++; if (slot == folioq_nr_slots(folioq) && folioq->next) { folioq = folioq->next; slot = 0; } } } iter->count = count; iter->iov_offset = iov_offset; iter->folioq = folioq; iter->folioq_slot = slot; return extracted; } static ssize_t iter_xarray_populate_pages(struct page **pages, struct xarray *xa, pgoff_t index, unsigned int nr_pages) { XA_STATE(xas, xa, index); struct page *page; unsigned int ret = 0; rcu_read_lock(); for (page = xas_load(&xas); page; page = xas_next(&xas)) { if (xas_retry(&xas, page)) continue; /* Has the page moved or been split? 
*/ if (unlikely(page != xas_reload(&xas))) { xas_reset(&xas); continue; } pages[ret] = find_subpage(page, xas.xa_index); get_page(pages[ret]); if (++ret == nr_pages) break; } rcu_read_unlock(); return ret; } static ssize_t iter_xarray_get_pages(struct iov_iter *i, struct page ***pages, size_t maxsize, unsigned maxpages, size_t *_start_offset) { unsigned nr, offset, count; pgoff_t index; loff_t pos; pos = i->xarray_start + i->iov_offset; index = pos >> PAGE_SHIFT; offset = pos & ~PAGE_MASK; *_start_offset = offset; count = want_pages_array(pages, maxsize, offset, maxpages); if (!count) return -ENOMEM; nr = iter_xarray_populate_pages(*pages, i->xarray, index, count); if (nr == 0) return 0; maxsize = min_t(size_t, nr * PAGE_SIZE - offset, maxsize); i->iov_offset += maxsize; i->count -= maxsize; return maxsize; } /* must be done on non-empty ITER_UBUF or ITER_IOVEC one */ static unsigned long first_iovec_segment(const struct iov_iter *i, size_t *size) { size_t skip; long k; if (iter_is_ubuf(i)) return (unsigned long)i->ubuf + i->iov_offset; for (k = 0, skip = i->iov_offset; k < i->nr_segs; k++, skip = 0) { const struct iovec *iov = iter_iov(i) + k; size_t len = iov->iov_len - skip; if (unlikely(!len)) continue; if (*size > len) *size = len; return (unsigned long)iov->iov_base + skip; } BUG(); // if it had been empty, we wouldn't get called } /* must be done on non-empty ITER_BVEC one */ static struct page *first_bvec_segment(const struct iov_iter *i, size_t *size, size_t *start) { struct page *page; size_t skip = i->iov_offset, len; len = i->bvec->bv_len - skip; if (*size > len) *size = len; skip += i->bvec->bv_offset; page = i->bvec->bv_page + skip / PAGE_SIZE; *start = skip % PAGE_SIZE; return page; } static ssize_t __iov_iter_get_pages_alloc(struct iov_iter *i, struct page ***pages, size_t maxsize, unsigned int maxpages, size_t *start) { unsigned int n, gup_flags = 0; if (maxsize > i->count) maxsize = i->count; if (!maxsize) return 0; if (maxsize > MAX_RW_COUNT) 
maxsize = MAX_RW_COUNT; if (likely(user_backed_iter(i))) { unsigned long addr; int res; if (iov_iter_rw(i) != WRITE) gup_flags |= FOLL_WRITE; if (i->nofault) gup_flags |= FOLL_NOFAULT; addr = first_iovec_segment(i, &maxsize); *start = addr % PAGE_SIZE; addr &= PAGE_MASK; n = want_pages_array(pages, maxsize, *start, maxpages); if (!n) return -ENOMEM; res = get_user_pages_fast(addr, n, gup_flags, *pages); if (unlikely(res <= 0)) return res; maxsize = min_t(size_t, maxsize, res * PAGE_SIZE - *start); iov_iter_advance(i, maxsize); return maxsize; } if (iov_iter_is_bvec(i)) { struct page **p; struct page *page; page = first_bvec_segment(i, &maxsize, start); n = want_pages_array(pages, maxsize, *start, maxpages); if (!n) return -ENOMEM; p = *pages; for (int k = 0; k < n; k++) { struct folio *folio = page_folio(page + k); p[k] = page + k; if (!folio_test_slab(folio)) folio_get(folio); } maxsize = min_t(size_t, maxsize, n * PAGE_SIZE - *start); i->count -= maxsize; i->iov_offset += maxsize; if (i->iov_offset == i->bvec->bv_len) { i->iov_offset = 0; i->bvec++; i->nr_segs--; } return maxsize; } if (iov_iter_is_folioq(i)) return iter_folioq_get_pages(i, pages, maxsize, maxpages, start); if (iov_iter_is_xarray(i)) return iter_xarray_get_pages(i, pages, maxsize, maxpages, start); return -EFAULT; } ssize_t iov_iter_get_pages2(struct iov_iter *i, struct page **pages, size_t maxsize, unsigned maxpages, size_t *start) { if (!maxpages) return 0; BUG_ON(!pages); return __iov_iter_get_pages_alloc(i, &pages, maxsize, maxpages, start); } EXPORT_SYMBOL(iov_iter_get_pages2); ssize_t iov_iter_get_pages_alloc2(struct iov_iter *i, struct page ***pages, size_t maxsize, size_t *start) { ssize_t len; *pages = NULL; len = __iov_iter_get_pages_alloc(i, pages, maxsize, ~0U, start); if (len <= 0) { kvfree(*pages); *pages = NULL; } return len; } EXPORT_SYMBOL(iov_iter_get_pages_alloc2); static int iov_npages(const struct iov_iter *i, int maxpages) { size_t skip = i->iov_offset, size = i->count; 
const struct iovec *p; int npages = 0; for (p = iter_iov(i); size; skip = 0, p++) { unsigned offs = offset_in_page(p->iov_base + skip); size_t len = min(p->iov_len - skip, size); if (len) { size -= len; npages += DIV_ROUND_UP(offs + len, PAGE_SIZE); if (unlikely(npages > maxpages)) return maxpages; } } return npages; } static int bvec_npages(const struct iov_iter *i, int maxpages) { size_t skip = i->iov_offset, size = i->count; const struct bio_vec *p; int npages = 0; for (p = i->bvec; size; skip = 0, p++) { unsigned offs = (p->bv_offset + skip) % PAGE_SIZE; size_t len = min(p->bv_len - skip, size); size -= len; npages += DIV_ROUND_UP(offs + len, PAGE_SIZE); if (unlikely(npages > maxpages)) return maxpages; } return npages; } int iov_iter_npages(const struct iov_iter *i, int maxpages) { if (unlikely(!i->count)) return 0; if (likely(iter_is_ubuf(i))) { unsigned offs = offset_in_page(i->ubuf + i->iov_offset); int npages = DIV_ROUND_UP(offs + i->count, PAGE_SIZE); return min(npages, maxpages); } /* iovec and kvec have identical layouts */ if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) return iov_npages(i, maxpages); if (iov_iter_is_bvec(i)) return bvec_npages(i, maxpages); if (iov_iter_is_folioq(i)) { unsigned offset = i->iov_offset % PAGE_SIZE; int npages = DIV_ROUND_UP(offset + i->count, PAGE_SIZE); return min(npages, maxpages); } if (iov_iter_is_xarray(i)) { unsigned offset = (i->xarray_start + i->iov_offset) % PAGE_SIZE; int npages = DIV_ROUND_UP(offset + i->count, PAGE_SIZE); return min(npages, maxpages); } return 0; } EXPORT_SYMBOL(iov_iter_npages); const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags) { *new = *old; if (iov_iter_is_bvec(new)) return new->bvec = kmemdup(new->bvec, new->nr_segs * sizeof(struct bio_vec), flags); else if (iov_iter_is_kvec(new) || iter_is_iovec(new)) /* iovec and kvec have identical layout */ return new->__iov = kmemdup(new->__iov, new->nr_segs * sizeof(struct iovec), flags); return NULL; } 
EXPORT_SYMBOL(dup_iter);

/*
 * Copy @nr_segs compat-layout iovec entries from the user array @uvec into
 * the native array @iov.
 *
 * Returns 0 on success, -EINVAL if an entry's length is negative when read
 * as compat_ssize_t, and -EFAULT if user_access_begin() rejects the range or
 * a user read branches to uaccess_end (ret still holds its initial -EFAULT).
 */
static __noclone int copy_compat_iovec_from_user(struct iovec *iov,
		const struct iovec __user *uvec, u32 nr_segs)
{
	const struct compat_iovec __user *uiov =
		(const struct compat_iovec __user *)uvec;
	int ret = -EFAULT;
	u32 i;

	if (!user_access_begin(uiov, nr_segs * sizeof(*uiov)))
		return -EFAULT;

	for (i = 0; i < nr_segs; i++) {
		compat_uptr_t buf;
		compat_ssize_t len;

		unsafe_get_user(len, &uiov[i].iov_len, uaccess_end);
		unsafe_get_user(buf, &uiov[i].iov_base, uaccess_end);

		/* check for compat_size_t not fitting in compat_ssize_t .. */
		if (len < 0) {
			ret = -EINVAL;
			goto uaccess_end;
		}
		iov[i].iov_base = compat_ptr(buf);
		iov[i].iov_len = len;
	}

	ret = 0;
uaccess_end:
	/* Every path that passed user_access_begin() must end the section. */
	user_access_end();
	return ret;
}

/*
 * Copy @nr_segs native iovec entries from the user array @uiov into @iov.
 *
 * The do/while loop processes at least one entry, so @nr_segs must be
 * non-zero; the callers in this file pass 1 or return early on zero.
 *
 * Returns 0 on success, -EINVAL if an entry's length is negative when read
 * as ssize_t, and -EFAULT if user_access_begin() rejects the range or a user
 * read branches to uaccess_end (ret still holds its initial -EFAULT).
 */
static __noclone int copy_iovec_from_user(struct iovec *iov,
		const struct iovec __user *uiov, unsigned long nr_segs)
{
	int ret = -EFAULT;

	if (!user_access_begin(uiov, nr_segs * sizeof(*uiov)))
		return -EFAULT;

	do {
		void __user *buf;
		ssize_t len;

		unsafe_get_user(len, &uiov->iov_len, uaccess_end);
		unsafe_get_user(buf, &uiov->iov_base, uaccess_end);

		/* check for size_t not fitting in ssize_t .. */
		if (unlikely(len < 0)) {
			ret = -EINVAL;
			goto uaccess_end;
		}
		iov->iov_base = buf;
		iov->iov_len = len;

		uiov++; iov++;
	} while (--nr_segs);

	ret = 0;
uaccess_end:
	/* Every path that passed user_access_begin() must end the section. */
	user_access_end();
	return ret;
}

/*
 * Bring a user iovec array into kernel memory.
 *
 * @uvec:      user array to copy from
 * @nr_segs:   number of entries in @uvec
 * @fast_segs: capacity of @fast_iov
 * @fast_iov:  caller-provided array used when @nr_segs fits in it
 * @compat:    true to read @uvec using the compat iovec layout
 *
 * Returns @fast_iov when @nr_segs is zero or fits in @fast_segs, otherwise a
 * kmalloc_array() allocation the caller must kfree().  On failure an ERR_PTR
 * is returned (-EINVAL for more than UIO_MAXIOV segments, -ENOMEM, or the
 * copy helper's error) and any array allocated here has already been freed.
 */
struct iovec *iovec_from_user(const struct iovec __user *uvec,
		unsigned long nr_segs, unsigned long fast_segs,
		struct iovec *fast_iov, bool compat)
{
	struct iovec *iov = fast_iov;
	int ret;

	/*
	 * SuS says "The readv() function *may* fail if the iovcnt argument was
	 * less than or equal to 0, or greater than {IOV_MAX}.  Linux has
	 * traditionally returned zero for zero segments, so...
	 */
	if (nr_segs == 0)
		return iov;
	if (nr_segs > UIO_MAXIOV)
		return ERR_PTR(-EINVAL);
	if (nr_segs > fast_segs) {
		iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL);
		if (!iov)
			return ERR_PTR(-ENOMEM);
	}

	if (unlikely(compat))
		ret = copy_compat_iovec_from_user(iov, uvec, nr_segs);
	else
		ret = copy_iovec_from_user(iov, uvec, nr_segs);
	if (ret) {
		/* Only free what was allocated here, never the caller's array. */
		if (iov != fast_iov)
			kfree(iov);
		return ERR_PTR(ret);
	}

	return iov;
}

/*
 * Single segment iovec supplied by the user, import it as ITER_UBUF.
 *
 * The one entry is copied into the array *@iovp points at, and *@iovp is set
 * to NULL before anything else so nothing is left for the caller to free.
 * Returns the iterator's byte count on success or a negative error code.
 */
static ssize_t __import_iovec_ubuf(int type, const struct iovec __user *uvec,
				   struct iovec **iovp, struct iov_iter *i,
				   bool compat)
{
	struct iovec *iov = *iovp;
	ssize_t ret;

	*iovp = NULL;

	if (compat)
		ret = copy_compat_iovec_from_user(iov, uvec, 1);
	else
		ret = copy_iovec_from_user(iov, uvec, 1);
	if (unlikely(ret))
		return ret;

	ret = import_ubuf(type, iov->iov_base, iov->iov_len, i);
	if (unlikely(ret))
		return ret;
	return i->count;
}

/*
 * Import a user iovec array and initialise @i over it.
 *
 * A single segment is handed to __import_iovec_ubuf().  Otherwise the array
 * is copied with iovec_from_user(), every segment is checked with
 * access_ok(), and the running total is capped at MAX_RW_COUNT by shortening
 * the segment that would exceed it.
 *
 * On return *@iovp is NULL when the caller's array was used or an error
 * occurred, and points at the allocated array otherwise.
 *
 * Returns the total number of bytes imported or a negative error code.
 */
ssize_t __import_iovec(int type, const struct iovec __user *uvec,
		 unsigned nr_segs, unsigned fast_segs, struct iovec **iovp,
		 struct iov_iter *i, bool compat)
{
	ssize_t total_len = 0;
	unsigned long seg;
	struct iovec *iov;

	if (nr_segs == 1)
		return __import_iovec_ubuf(type, uvec, iovp, i, compat);

	iov = iovec_from_user(uvec, nr_segs, fast_segs, *iovp, compat);
	if (IS_ERR(iov)) {
		*iovp = NULL;
		return PTR_ERR(iov);
	}

	/*
	 * According to the Single Unix Specification we should return EINVAL if
	 * an element length is < 0 when cast to ssize_t or if the total length
	 * would overflow the ssize_t return value of the system call.
	 *
	 * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
	 * overflow case.
	 */
	for (seg = 0; seg < nr_segs; seg++) {
		ssize_t len = (ssize_t)iov[seg].iov_len;

		if (!access_ok(iov[seg].iov_base, len)) {
			if (iov != *iovp)
				kfree(iov);
			*iovp = NULL;
			return -EFAULT;
		}

		/* Truncate this segment so the total never exceeds the cap. */
		if (len > MAX_RW_COUNT - total_len) {
			len = MAX_RW_COUNT - total_len;
			iov[seg].iov_len = len;
		}
		total_len += len;
	}

	iov_iter_init(i, type, iov, nr_segs, total_len);
	/* Report back only an array allocated here, so the caller may kfree it. */
	if (iov == *iovp)
		*iovp = NULL;
	else
		*iovp = iov;
	return total_len;
}

/**
 * import_iovec() - Copy an array of &struct iovec from userspace
 *     into the kernel, check that it is valid, and initialize a new
 *     &struct iov_iter iterator to access it.
 *
 * @type: One of %READ or %WRITE.
 * @uvec: Pointer to the userspace array.
 * @nr_segs: Number of elements in userspace array.
 * @fast_segs: Number of elements in *@iovp.
 * @iovp: (input and output parameter) Pointer to pointer to (usually small
 *     on-stack) kernel array.
 * @i: Pointer to iterator that will be initialized on success.
 *
 * If the array pointed to by *@iovp is large enough to hold all @nr_segs,
 * then this function places %NULL in *@iovp on return. Otherwise, a new
 * array will be allocated and the result placed in *@iovp. This means that
 * the caller may call kfree() on *@iovp regardless of whether the small
 * on-stack array was used or not (and regardless of whether this function
 * returns an error or not).
* * Return: Negative error code on error, bytes imported on success */ ssize_t import_iovec(int type, const struct iovec __user *uvec, unsigned nr_segs, unsigned fast_segs, struct iovec **iovp, struct iov_iter *i) { return __import_iovec(type, uvec, nr_segs, fast_segs, iovp, i, in_compat_syscall()); } EXPORT_SYMBOL(import_iovec); int import_ubuf(int rw, void __user *buf, size_t len, struct iov_iter *i) { if (len > MAX_RW_COUNT) len = MAX_RW_COUNT; if (unlikely(!access_ok(buf, len))) return -EFAULT; iov_iter_ubuf(i, rw, buf, len); return 0; } EXPORT_SYMBOL_GPL(import_ubuf); /** * iov_iter_restore() - Restore a &struct iov_iter to the same state as when * iov_iter_save_state() was called. * * @i: &struct iov_iter to restore * @state: state to restore from * * Used after iov_iter_save_state() to bring restore @i, if operations may * have advanced it. * * Note: only works on ITER_IOVEC, ITER_BVEC, and ITER_KVEC */ void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state) { if (WARN_ON_ONCE(!iov_iter_is_bvec(i) && !iter_is_iovec(i) && !iter_is_ubuf(i)) && !iov_iter_is_kvec(i)) return; i->iov_offset = state->iov_offset; i->count = state->count; if (iter_is_ubuf(i)) return; /* * For the *vec iters, nr_segs + iov is constant - if we increment * the vec, then we also decrement the nr_segs count. Hence we don't * need to track both of these, just one is enough and we can deduct * the other from that. ITER_KVEC and ITER_IOVEC are the same struct * size, so we can just increment the iov pointer as they are unionzed. * ITER_BVEC _may_ be the same size on some archs, but on others it is * not. Be safe and handle it separately. */ BUILD_BUG_ON(sizeof(struct iovec) != sizeof(struct kvec)); if (iov_iter_is_bvec(i)) i->bvec -= state->nr_segs - i->nr_segs; else i->__iov -= state->nr_segs - i->nr_segs; i->nr_segs = state->nr_segs; } /* * Extract a list of contiguous pages from an ITER_FOLIOQ iterator. 
This does * not get references on the pages, nor does it get a pin on them. */ static ssize_t iov_iter_extract_folioq_pages(struct iov_iter *i, struct page ***pages, size_t maxsize, unsigned int maxpages, iov_iter_extraction_t extraction_flags, size_t *offset0) { const struct folio_queue *folioq = i->folioq; struct page **p; unsigned int nr = 0; size_t extracted = 0, offset, slot = i->folioq_slot; if (slot >= folioq_nr_slots(folioq)) { folioq = folioq->next; slot = 0; if (WARN_ON(i->iov_offset != 0)) return -EIO; } offset = i->iov_offset & ~PAGE_MASK; *offset0 = offset; maxpages = want_pages_array(pages, maxsize, offset, maxpages); if (!maxpages) return -ENOMEM; p = *pages; for (;;) { struct folio *folio = folioq_folio(folioq, slot); size_t offset = i->iov_offset, fsize = folioq_folio_size(folioq, slot); size_t part = PAGE_SIZE - offset % PAGE_SIZE; if (offset < fsize) { part = umin(part, umin(maxsize - extracted, fsize - offset)); i->count -= part; i->iov_offset += part; extracted += part; p[nr++] = folio_page(folio, offset / PAGE_SIZE); } if (nr >= maxpages || extracted >= maxsize) break; if (i->iov_offset >= fsize) { i->iov_offset = 0; slot++; if (slot == folioq_nr_slots(folioq) && folioq->next) { folioq = folioq->next; slot = 0; } } } i->folioq = folioq; i->folioq_slot = slot; return extracted; } /* * Extract a list of contiguous pages from an ITER_XARRAY iterator. This does not * get references on the pages, nor does it get a pin on them. 
*/ static ssize_t iov_iter_extract_xarray_pages(struct iov_iter *i, struct page ***pages, size_t maxsize, unsigned int maxpages, iov_iter_extraction_t extraction_flags, size_t *offset0) { struct page *page, **p; unsigned int nr = 0, offset; loff_t pos = i->xarray_start + i->iov_offset; pgoff_t index = pos >> PAGE_SHIFT; XA_STATE(xas, i->xarray, index); offset = pos & ~PAGE_MASK; *offset0 = offset; maxpages = want_pages_array(pages, maxsize, offset, maxpages); if (!maxpages) return -ENOMEM; p = *pages; rcu_read_lock(); for (page = xas_load(&xas); page; page = xas_next(&xas)) { if (xas_retry(&xas, page)) continue; /* Has the page moved or been split? */ if (unlikely(page != xas_reload(&xas))) { xas_reset(&xas); continue; } p[nr++] = find_subpage(page, xas.xa_index); if (nr == maxpages) break; } rcu_read_unlock(); maxsize = min_t(size_t, nr * PAGE_SIZE - offset, maxsize); iov_iter_advance(i, maxsize); return maxsize; } /* * Extract a list of virtually contiguous pages from an ITER_BVEC iterator. * This does not get references on the pages, nor does it get a pin on them. */ static ssize_t iov_iter_extract_bvec_pages(struct iov_iter *i, struct page ***pages, size_t maxsize, unsigned int maxpages, iov_iter_extraction_t extraction_flags, size_t *offset0) { size_t skip = i->iov_offset, size = 0; struct bvec_iter bi; int k = 0; if (i->nr_segs == 0) return 0; if (i->iov_offset == i->bvec->bv_len) { i->iov_offset = 0; i->nr_segs--; i->bvec++; skip = 0; } bi.bi_idx = 0; bi.bi_size = maxsize; bi.bi_bvec_done = skip; maxpages = want_pages_array(pages, maxsize, skip, maxpages); while (bi.bi_size && bi.bi_idx < i->nr_segs) { struct bio_vec bv = bvec_iter_bvec(i->bvec, bi); /* * The iov_iter_extract_pages interface only allows an offset * into the first page. Break out of the loop if we see an * offset into subsequent pages, the caller will have to call * iov_iter_extract_pages again for the reminder. 
*/ if (k) { if (bv.bv_offset) break; } else { *offset0 = bv.bv_offset; } (*pages)[k++] = bv.bv_page; size += bv.bv_len; if (k >= maxpages) break; /* * We are done when the end of the bvec doesn't align to a page * boundary as that would create a hole in the returned space. * The caller will handle this with another call to * iov_iter_extract_pages. */ if (bv.bv_offset + bv.bv_len != PAGE_SIZE) break; bvec_iter_advance_single(i->bvec, &bi, bv.bv_len); } iov_iter_advance(i, size); return size; } /* * Extract a list of virtually contiguous pages from an ITER_KVEC iterator. * This does not get references on the pages, nor does it get a pin on them. */ static ssize_t iov_iter_extract_kvec_pages(struct iov_iter *i, struct page ***pages, size_t maxsize, unsigned int maxpages, iov_iter_extraction_t extraction_flags, size_t *offset0) { struct page **p, *page; const void *kaddr; size_t skip = i->iov_offset, offset, len, size; int k; for (;;) { if (i->nr_segs == 0) return 0; size = min(maxsize, i->kvec->iov_len - skip); if (size) break; i->iov_offset = 0; i->nr_segs--; i->kvec++; skip = 0; } kaddr = i->kvec->iov_base + skip; offset = (unsigned long)kaddr & ~PAGE_MASK; *offset0 = offset; maxpages = want_pages_array(pages, size, offset, maxpages); if (!maxpages) return -ENOMEM; p = *pages; kaddr -= offset; len = offset + size; for (k = 0; k < maxpages; k++) { size_t seg = min_t(size_t, len, PAGE_SIZE); if (is_vmalloc_or_module_addr(kaddr)) page = vmalloc_to_page(kaddr); else page = virt_to_page(kaddr); p[k] = page; len -= seg; kaddr += PAGE_SIZE; } size = min_t(size_t, size, maxpages * PAGE_SIZE - offset); iov_iter_advance(i, size); return size; } /* * Extract a list of contiguous pages from a user iterator and get a pin on * each of them. This should only be used if the iterator is user-backed * (IOBUF/UBUF). * * It does not get refs on the pages, but the pages must be unpinned by the * caller once the transfer is complete. 
*
 * This is safe to use where background IO/DMA *is* going to be modifying the
 * buffer; using a pin rather than a ref forces fork() to give the child a
 * copy of the page.
 */
static ssize_t iov_iter_extract_user_pages(struct iov_iter *i,
					   struct page ***pages,
					   size_t maxsize,
					   unsigned int maxpages,
					   iov_iter_extraction_t extraction_flags,
					   size_t *offset0)
{
	unsigned int flags = 0;
	unsigned long uaddr;
	size_t first_off;
	int pinned;

	/* Translate iterator state and extraction flags into GUP flags. */
	if (i->data_source == ITER_DEST)
		flags |= FOLL_WRITE;
	if (extraction_flags & ITER_ALLOW_P2PDMA)
		flags |= FOLL_PCI_P2PDMA;
	if (i->nofault)
		flags |= FOLL_NOFAULT;

	/* May shrink maxsize to what is left of the first segment. */
	uaddr = first_iovec_segment(i, &maxsize);
	first_off = uaddr % PAGE_SIZE;
	*offset0 = first_off;
	uaddr &= PAGE_MASK;

	maxpages = want_pages_array(pages, maxsize, first_off, maxpages);
	if (!maxpages)
		return -ENOMEM;

	pinned = pin_user_pages_fast(uaddr, maxpages, flags, *pages);
	if (unlikely(pinned <= 0))
		return pinned;

	/* Only account for the bytes covered by the pages actually pinned. */
	maxsize = min_t(size_t, maxsize, pinned * PAGE_SIZE - first_off);
	iov_iter_advance(i, maxsize);
	return maxsize;
}

/**
 * iov_iter_extract_pages - Extract a list of contiguous pages from an iterator
 * @i: The iterator to extract from
 * @pages: Where to return the list of pages
 * @maxsize: The maximum amount of iterator to extract
 * @maxpages: The maximum size of the list of pages
 * @extraction_flags: Flags to qualify request
 * @offset0: Where to return the starting offset into (*@pages)[0]
 *
 * Extract a list of contiguous pages from the current point of the iterator,
 * advancing the iterator.  The maximum number of pages and the maximum amount
 * of page contents can be set.
 *
 * If *@pages is NULL, a page list will be allocated to the required size and
 * *@pages will be set to its base.  If *@pages is not NULL, it will be assumed
 * that the caller allocated a page list at least @maxpages in size and this
 * will be filled in.
 *
 * @extraction_flags can have ITER_ALLOW_P2PDMA set to request peer-to-peer DMA
 * be allowed on the pages extracted.
* * The iov_iter_extract_will_pin() function can be used to query how cleanup * should be performed. * * Extra refs or pins on the pages may be obtained as follows: * * (*) If the iterator is user-backed (ITER_IOVEC/ITER_UBUF), pins will be * added to the pages, but refs will not be taken. * iov_iter_extract_will_pin() will return true. * * (*) If the iterator is ITER_KVEC, ITER_BVEC, ITER_FOLIOQ or ITER_XARRAY, the * pages are merely listed; no extra refs or pins are obtained. * iov_iter_extract_will_pin() will return 0. * * Note also: * * (*) Use with ITER_DISCARD is not supported as that has no content. * * On success, the function sets *@pages to the new pagelist, if allocated, and * sets *offset0 to the offset into the first page. * * It may also return -ENOMEM and -EFAULT. */ ssize_t iov_iter_extract_pages(struct iov_iter *i, struct page ***pages, size_t maxsize, unsigned int maxpages, iov_iter_extraction_t extraction_flags, size_t *offset0) { maxsize = min_t(size_t, min_t(size_t, maxsize, i->count), MAX_RW_COUNT); if (!maxsize) return 0; if (likely(user_backed_iter(i))) return iov_iter_extract_user_pages(i, pages, maxsize, maxpages, extraction_flags, offset0); if (iov_iter_is_kvec(i)) return iov_iter_extract_kvec_pages(i, pages, maxsize, maxpages, extraction_flags, offset0); if (iov_iter_is_bvec(i)) return iov_iter_extract_bvec_pages(i, pages, maxsize, maxpages, extraction_flags, offset0); if (iov_iter_is_folioq(i)) return iov_iter_extract_folioq_pages(i, pages, maxsize, maxpages, extraction_flags, offset0); if (iov_iter_is_xarray(i)) return iov_iter_extract_xarray_pages(i, pages, maxsize, maxpages, extraction_flags, offset0); return -EFAULT; } EXPORT_SYMBOL_GPL(iov_iter_extract_pages); |
18953 15 78 15 12 13 7988 12 2 1158 13 46 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 
517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 
1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_SEQLOCK_H #define __LINUX_SEQLOCK_H /* * seqcount_t / seqlock_t - a reader-writer consistency mechanism with * lockless readers (read-only retry loops), and no writer starvation. * * See Documentation/locking/seqlock.rst * * Copyrights: * - Based on x86_64 vsyscall gettimeofday: Keith Owens, Andrea Arcangeli * - Sequence counters with associated locks, (C) 2020 Linutronix GmbH */ #include <linux/compiler.h> #include <linux/kcsan-checks.h> #include <linux/lockdep.h> #include <linux/mutex.h> #include <linux/preempt.h> #include <linux/seqlock_types.h> #include <linux/spinlock.h> #include <asm/processor.h> /* * The seqlock seqcount_t interface does not prescribe a precise sequence of * read begin/retry/end. For readers, typically there is a call to * read_seqcount_begin() and read_seqcount_retry(), however, there are more * esoteric cases which do not follow this pattern. 
*
 * As a consequence, we take the following best-effort approach for raw usage
 * via seqcount_t under KCSAN: upon beginning a seq-reader critical section,
 * pessimistically mark the next KCSAN_SEQLOCK_REGION_MAX memory accesses as
 * atomics; if there is a matching read_seqcount_retry() call, no following
 * memory operations are considered atomic. Usage of the seqlock_t interface
 * is not affected.
 */
#define KCSAN_SEQLOCK_REGION_MAX 1000

/*
 * Common runtime initializer: registers the lockdep map of @s under @name
 * and @key, then resets the sequence count to zero.
 */
static inline void __seqcount_init(seqcount_t *s, const char *name,
					  struct lock_class_key *key)
{
	/*
	 * Make sure we are not reinitializing a held lock:
	 */
	lockdep_init_map(&s->dep_map, name, key, 0);
	s->sequence = 0;
}

#ifdef CONFIG_DEBUG_LOCK_ALLOC

# define SEQCOUNT_DEP_MAP_INIT(lockname) \
		.dep_map = { .name = #lockname }

/**
 * seqcount_init() - runtime initializer for seqcount_t
 * @s: Pointer to the seqcount_t instance
 *
 * Each expansion declares its own static lock_class_key and uses the
 * stringified argument as the lockdep name.
 */
# define seqcount_init(s) \
	do { \
		static struct lock_class_key __key; \
		__seqcount_init((s), #s, &__key); \
	} while (0)

/*
 * Tell lockdep about a read-side access: the dep_map is acquired for read
 * and released again straight away, with local interrupts disabled around
 * the pair.
 */
static inline void seqcount_lockdep_reader_access(const seqcount_t *s)
{
	/* The lockdep hooks take a non-const map, so drop the qualifier. */
	seqcount_t *l = (seqcount_t *)s;
	unsigned long flags;

	local_irq_save(flags);
	seqcount_acquire_read(&l->dep_map, 0, 0, _RET_IP_);
	seqcount_release(&l->dep_map, _RET_IP_);
	local_irq_restore(flags);
}

#else
/* Without CONFIG_DEBUG_LOCK_ALLOC the lockdep hooks expand to nothing. */
# define SEQCOUNT_DEP_MAP_INIT(lockname)
# define seqcount_init(s) __seqcount_init(s, NULL, NULL)
# define seqcount_lockdep_reader_access(x)
#endif

/**
 * SEQCNT_ZERO() - static initializer for seqcount_t
 * @name: Name of the seqcount_t instance
 */
#define SEQCNT_ZERO(name) { .sequence = 0, SEQCOUNT_DEP_MAP_INIT(name) }

/*
 * Sequence counters with associated locks (seqcount_LOCKNAME_t)
 *
 * A sequence counter which associates the lock used for writer
 * serialization at initialization time. This enables lockdep to validate
 * that the write side critical section is properly serialized.
 *
 * For associated locks which do not implicitly disable preemption,
 * preemption protection is enforced in the write side function.
*
 * Lockdep is never used in any of the raw write variants.
 *
 * See Documentation/locking/seqlock.rst
 */

/*
 * typedef seqcount_LOCKNAME_t - sequence counter with LOCKNAME associated
 * @seqcount:	The real sequence counter
 * @lock:	Pointer to the associated lock
 *
 * A plain sequence counter with external writer synchronization by
 * LOCKNAME @lock. The lock is associated to the sequence counter in the
 * static initializer or init function. This enables lockdep to validate
 * that the write side critical section is properly serialized.
 *
 * LOCKNAME:	raw_spinlock, spinlock, rwlock or mutex
 */

/*
 * seqcount_LOCKNAME_init() - runtime initializer for seqcount_LOCKNAME_t
 * @s:		Pointer to the seqcount_LOCKNAME_t instance
 * @lock:	Pointer to the associated lock
 *
 * The third macro argument selects the seqcount_LOCKNAME_t type via token
 * pasting; the per-lock wrappers below supply it.
 */

#define seqcount_LOCKNAME_init(s, _lock, lockname) \
	do { \
		seqcount_##lockname##_t *____s = (s); \
		seqcount_init(&____s->seqcount); \
		__SEQ_LOCK(____s->lock = (_lock)); \
	} while (0)

#define seqcount_raw_spinlock_init(s, lock) seqcount_LOCKNAME_init(s, lock, raw_spinlock)
#define seqcount_spinlock_init(s, lock) seqcount_LOCKNAME_init(s, lock, spinlock)
#define seqcount_rwlock_init(s, lock) seqcount_LOCKNAME_init(s, lock, rwlock)
#define seqcount_mutex_init(s, lock) seqcount_LOCKNAME_init(s, lock, mutex)

/*
 * SEQCOUNT_LOCKNAME()	- Instantiate seqcount_LOCKNAME_t and helpers
 * seqprop_LOCKNAME_*()	- Property accessors for seqcount_LOCKNAME_t
 *
 * @lockname:		"LOCKNAME" part of seqcount_LOCKNAME_t
 * @locktype:		LOCKNAME canonical C data type
 * @preemptible:	preemptibility of above locktype
 * @lockbase:		prefix for associated lock/unlock
 *
 * Generated accessors: _ptr and _const_ptr return the embedded seqcount_t,
 * _sequence loads the counter with acquire semantics, _preemptible reports
 * @preemptible unless CONFIG_PREEMPT_RT is enabled, and _assert checks via
 * lockdep that the associated lock is held.
 */
#define SEQCOUNT_LOCKNAME(lockname, locktype, preemptible, lockbase) \
static __always_inline seqcount_t * \
__seqprop_##lockname##_ptr(seqcount_##lockname##_t *s) \
{ \
	return &s->seqcount; \
} \
 \
static __always_inline const seqcount_t * \
__seqprop_##lockname##_const_ptr(const seqcount_##lockname##_t *s) \
{ \
	return &s->seqcount; \
} \
 \
static __always_inline unsigned \
__seqprop_##lockname##_sequence(const seqcount_##lockname##_t *s) \
{ \
	unsigned seq = smp_load_acquire(&s->seqcount.sequence); \
 \
	if (!IS_ENABLED(CONFIG_PREEMPT_RT)) \
		return seq; \
 \
	if (preemptible && unlikely(seq & 1)) { \
		__SEQ_LOCK(lockbase##_lock(s->lock)); \
		__SEQ_LOCK(lockbase##_unlock(s->lock)); \
 \
		/* \
		 * Re-read the sequence counter since the (possibly \
		 * preempted) writer made progress. \
		 */ \
		seq = smp_load_acquire(&s->seqcount.sequence); \
	} \
 \
	return seq; \
} \
 \
static __always_inline bool \
__seqprop_##lockname##_preemptible(const seqcount_##lockname##_t *s) \
{ \
	if (!IS_ENABLED(CONFIG_PREEMPT_RT)) \
		return preemptible; \
 \
	/* PREEMPT_RT relies on the above LOCK+UNLOCK */ \
	return false; \
} \
 \
static __always_inline void \
__seqprop_##lockname##_assert(const seqcount_##lockname##_t *s) \
{ \
	__SEQ_LOCK(lockdep_assert_held(s->lock)); \
}

/*
 * __seqprop() for seqcount_t
 *
 * A plain seqcount_t has no associated lock: the accessors return the
 * counter itself, never report preemptibility, and the assert helper checks
 * that preemption is disabled instead of checking a lock.
 */

static inline seqcount_t *__seqprop_ptr(seqcount_t *s)
{
	return s;
}

static inline const seqcount_t *__seqprop_const_ptr(const seqcount_t *s)
{
	return s;
}

static inline unsigned __seqprop_sequence(const seqcount_t *s)
{
	return smp_load_acquire(&s->sequence);
}

static inline bool __seqprop_preemptible(const seqcount_t *s)
{
	return false;
}

static inline void __seqprop_assert(const seqcount_t *s)
{
	lockdep_assert_preemption_disabled();
}

#define __SEQ_RT IS_ENABLED(CONFIG_PREEMPT_RT)

/* Arguments: lockname, locktype, preemptible, lockbase (see above). */
SEQCOUNT_LOCKNAME(raw_spinlock, raw_spinlock_t, false, raw_spin)
SEQCOUNT_LOCKNAME(spinlock, spinlock_t, __SEQ_RT, spin)
SEQCOUNT_LOCKNAME(rwlock, rwlock_t, __SEQ_RT, read)
SEQCOUNT_LOCKNAME(mutex, struct mutex, true, mutex)
#undef SEQCOUNT_LOCKNAME

/*
 * SEQCNT_LOCKNAME_ZERO - static initializer for seqcount_LOCKNAME_t
 * @name:	Name of the seqcount_LOCKNAME_t instance
 * @lock:	Pointer to the associated LOCKNAME
 */

#define SEQCOUNT_LOCKNAME_ZERO(seq_name, assoc_lock) { \
	.seqcount = SEQCNT_ZERO(seq_name.seqcount), \
	__SEQ_LOCK(.lock = (assoc_lock)) \
}

#define SEQCNT_RAW_SPINLOCK_ZERO(name, lock) SEQCOUNT_LOCKNAME_ZERO(name, lock)
#define SEQCNT_SPINLOCK_ZERO(name, lock) SEQCOUNT_LOCKNAME_ZERO(name, lock)
#define SEQCNT_RWLOCK_ZERO(name, lock) SEQCOUNT_LOCKNAME_ZERO(name, lock)
#define SEQCNT_MUTEX_ZERO(name, lock) SEQCOUNT_LOCKNAME_ZERO(name, lock)
#define SEQCNT_WW_MUTEX_ZERO(name, lock) SEQCOUNT_LOCKNAME_ZERO(name, lock)

/*
 * __seqprop() picks the accessor matching the static type of *(s) with
 * _Generic; the seqprop_*() wrappers below then call it on (s).
 */
#define __seqprop_case(s, lockname, prop) \
	seqcount_##lockname##_t: __seqprop_##lockname##_##prop

#define __seqprop(s, prop) _Generic(*(s), \
	seqcount_t: __seqprop_##prop, \
	__seqprop_case((s), raw_spinlock, prop), \
	__seqprop_case((s), spinlock, prop), \
	__seqprop_case((s), rwlock, prop), \
	__seqprop_case((s), mutex, prop))

#define seqprop_ptr(s) __seqprop(s, ptr)(s)
#define seqprop_const_ptr(s) __seqprop(s, const_ptr)(s)
#define seqprop_sequence(s) __seqprop(s, sequence)(s)
#define seqprop_preemptible(s) __seqprop(s, preemptible)(s)
#define seqprop_assert(s) __seqprop(s, assert)(s)

/**
 * __read_seqcount_begin() - begin a seqcount_t read section
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 *
 * Spins with cpu_relax() for as long as the sampled count is odd.
 *
 * Return: count to be passed to read_seqcount_retry()
 */
#define __read_seqcount_begin(s) \
({ \
	unsigned __seq; \
 \
	while (unlikely((__seq = seqprop_sequence(s)) & 1)) \
		cpu_relax(); \
 \
	kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX); \
	__seq; \
})

/**
 * raw_read_seqcount_begin() - begin a seqcount_t read section w/o lockdep
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 *
 * Return: count to be passed to read_seqcount_retry()
 */
#define raw_read_seqcount_begin(s) __read_seqcount_begin(s)

/**
 * read_seqcount_begin() - begin a seqcount_t read critical section
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 *
 * Return: count to be passed to read_seqcount_retry()
 */
#define read_seqcount_begin(s) \
({ \
	seqcount_lockdep_reader_access(seqprop_const_ptr(s)); \
	raw_read_seqcount_begin(s); \
})

/**
 * raw_read_seqcount() - read the raw seqcount_t counter value
 * @s: Pointer to seqcount_t
or any of the seqcount_LOCKNAME_t variants
 *
 * raw_read_seqcount opens a read critical section of the given
 * seqcount_t, without any lockdep checking, and without checking or
 * masking the sequence counter LSB. Calling code is responsible for
 * handling that.
 *
 * Return: count to be passed to read_seqcount_retry()
 */
#define raw_read_seqcount(s) \
({ \
	unsigned __seq = seqprop_sequence(s); \
 \
	kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX); \
	__seq; \
})

/**
 * raw_seqcount_try_begin() - begin a seqcount_t read critical section
 *                            w/o lockdep and w/o counter stabilization
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 * @start: count to be passed to read_seqcount_retry()
 *
 * Similar to raw_seqcount_begin(), except it enables eliding the critical
 * section entirely if odd, instead of doing the speculation knowing it will
 * fail.
 *
 * Useful when counter stabilization is more or less equivalent to taking
 * the lock and there is a slowpath that does that.
 *
 * If true, start will be set to the (even) sequence count read.
 *
 * Note that @start is assigned unconditionally, so it also holds the (odd)
 * count when false is returned.
 *
 * Return: true when a read critical section is started.
 */
#define raw_seqcount_try_begin(s, start) \
({ \
	start = raw_read_seqcount(s); \
	!(start & 1); \
})

/**
 * raw_seqcount_begin() - begin a seqcount_t read critical section w/o
 *                        lockdep and w/o counter stabilization
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 *
 * raw_seqcount_begin opens a read critical section of the given
 * seqcount_t. Unlike read_seqcount_begin(), this function will not wait
 * for the count to stabilize. If a writer is active when it begins, it
 * will fail the read_seqcount_retry() at the end of the read critical
 * section instead of stabilizing at the beginning of it.
 *
 * Use this only in special kernel hot paths where the read section is
 * small and has a high probability of success through other external
 * means. It will save a single branching instruction.
* * Return: count to be passed to read_seqcount_retry() */ #define raw_seqcount_begin(s) \ ({ \ /* \ * If the counter is odd, let read_seqcount_retry() fail \ * by decrementing the counter. \ */ \ raw_read_seqcount(s) & ~1; \ }) /** * __read_seqcount_retry() - end a seqcount_t read section w/o barrier * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * @start: count, from read_seqcount_begin() * * __read_seqcount_retry is like read_seqcount_retry, but has no smp_rmb() * barrier. Callers should ensure that smp_rmb() or equivalent ordering is * provided before actually loading any of the variables that are to be * protected in this critical section. * * Use carefully, only in critical code, and comment how the barrier is * provided. * * Return: true if a read section retry is required, else false */ #define __read_seqcount_retry(s, start) \ do___read_seqcount_retry(seqprop_const_ptr(s), start) static inline int do___read_seqcount_retry(const seqcount_t *s, unsigned start) { kcsan_atomic_next(0); return unlikely(READ_ONCE(s->sequence) != start); } /** * read_seqcount_retry() - end a seqcount_t read critical section * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * @start: count, from read_seqcount_begin() * * read_seqcount_retry closes the read critical section of given * seqcount_t. If the critical section was invalid, it must be ignored * (and typically retried). 
* * Return: true if a read section retry is required, else false */ #define read_seqcount_retry(s, start) \ do_read_seqcount_retry(seqprop_const_ptr(s), start) static inline int do_read_seqcount_retry(const seqcount_t *s, unsigned start) { smp_rmb(); return do___read_seqcount_retry(s, start); } /** * raw_write_seqcount_begin() - start a seqcount_t write section w/o lockdep * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * * Context: check write_seqcount_begin() */ #define raw_write_seqcount_begin(s) \ do { \ if (seqprop_preemptible(s)) \ preempt_disable(); \ \ do_raw_write_seqcount_begin(seqprop_ptr(s)); \ } while (0) static inline void do_raw_write_seqcount_begin(seqcount_t *s) { kcsan_nestable_atomic_begin(); s->sequence++; smp_wmb(); } /** * raw_write_seqcount_end() - end a seqcount_t write section w/o lockdep * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * * Context: check write_seqcount_end() */ #define raw_write_seqcount_end(s) \ do { \ do_raw_write_seqcount_end(seqprop_ptr(s)); \ \ if (seqprop_preemptible(s)) \ preempt_enable(); \ } while (0) static inline void do_raw_write_seqcount_end(seqcount_t *s) { smp_wmb(); s->sequence++; kcsan_nestable_atomic_end(); } /** * write_seqcount_begin_nested() - start a seqcount_t write section with * custom lockdep nesting level * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * @subclass: lockdep nesting level * * See Documentation/locking/lockdep-design.rst * Context: check write_seqcount_begin() */ #define write_seqcount_begin_nested(s, subclass) \ do { \ seqprop_assert(s); \ \ if (seqprop_preemptible(s)) \ preempt_disable(); \ \ do_write_seqcount_begin_nested(seqprop_ptr(s), subclass); \ } while (0) static inline void do_write_seqcount_begin_nested(seqcount_t *s, int subclass) { seqcount_acquire(&s->dep_map, subclass, 0, _RET_IP_); do_raw_write_seqcount_begin(s); } /** * write_seqcount_begin() - start a seqcount_t write side critical section * @s: 
Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * * Context: sequence counter write side sections must be serialized and * non-preemptible. Preemption will be automatically disabled if and * only if the seqcount write serialization lock is associated, and * preemptible. If readers can be invoked from hardirq or softirq * context, interrupts or bottom halves must be respectively disabled. */ #define write_seqcount_begin(s) \ do { \ seqprop_assert(s); \ \ if (seqprop_preemptible(s)) \ preempt_disable(); \ \ do_write_seqcount_begin(seqprop_ptr(s)); \ } while (0) static inline void do_write_seqcount_begin(seqcount_t *s) { do_write_seqcount_begin_nested(s, 0); } /** * write_seqcount_end() - end a seqcount_t write side critical section * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * * Context: Preemption will be automatically re-enabled if and only if * the seqcount write serialization lock is associated, and preemptible. */ #define write_seqcount_end(s) \ do { \ do_write_seqcount_end(seqprop_ptr(s)); \ \ if (seqprop_preemptible(s)) \ preempt_enable(); \ } while (0) static inline void do_write_seqcount_end(seqcount_t *s) { seqcount_release(&s->dep_map, _RET_IP_); do_raw_write_seqcount_end(s); } /** * raw_write_seqcount_barrier() - do a seqcount_t write barrier * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * * This can be used to provide an ordering guarantee instead of the usual * consistency guarantee. It is one wmb cheaper, because it can collapse * the two back-to-back wmb()s. * * Note that writes surrounding the barrier should be declared atomic (e.g. * via WRITE_ONCE): a) to ensure the writes become visible to other threads * atomically, avoiding compiler optimizations; b) to document which writes are * meant to propagate to the reader critical section.
This is necessary because * neither writes before nor after the barrier are enclosed in a seq-writer * critical section that would ensure readers are aware of ongoing writes:: * * seqcount_t seq; * bool X = true, Y = false; * * void read(void) * { * bool x, y; * * do { * int s = read_seqcount_begin(&seq); * * x = X; y = Y; * * } while (read_seqcount_retry(&seq, s)); * * BUG_ON(!x && !y); * } * * void write(void) * { * WRITE_ONCE(Y, true); * * raw_write_seqcount_barrier(&seq); * * WRITE_ONCE(X, false); * } */ #define raw_write_seqcount_barrier(s) \ do_raw_write_seqcount_barrier(seqprop_ptr(s)) static inline void do_raw_write_seqcount_barrier(seqcount_t *s) { kcsan_nestable_atomic_begin(); s->sequence++; smp_wmb(); s->sequence++; kcsan_nestable_atomic_end(); } /** * write_seqcount_invalidate() - invalidate in-progress seqcount_t read * side operations * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * * After write_seqcount_invalidate, no seqcount_t read side operations * will complete successfully and see data older than this. */ #define write_seqcount_invalidate(s) \ do_write_seqcount_invalidate(seqprop_ptr(s)) static inline void do_write_seqcount_invalidate(seqcount_t *s) { smp_wmb(); kcsan_nestable_atomic_begin(); s->sequence+=2; kcsan_nestable_atomic_end(); } /* * Latch sequence counters (seqcount_latch_t) * * A sequence counter variant where the counter even/odd value is used to * switch between two copies of protected data. This allows the read path, * typically NMIs, to safely interrupt the write side critical section. * * As the write sections are fully preemptible, no special handling for * PREEMPT_RT is needed.
*/ typedef struct { seqcount_t seqcount; } seqcount_latch_t; /** * SEQCNT_LATCH_ZERO() - static initializer for seqcount_latch_t * @seq_name: Name of the seqcount_latch_t instance */ #define SEQCNT_LATCH_ZERO(seq_name) { \ .seqcount = SEQCNT_ZERO(seq_name.seqcount), \ } /** * seqcount_latch_init() - runtime initializer for seqcount_latch_t * @s: Pointer to the seqcount_latch_t instance */ #define seqcount_latch_init(s) seqcount_init(&(s)->seqcount) /** * raw_read_seqcount_latch() - pick even/odd latch data copy * @s: Pointer to seqcount_latch_t * * See raw_write_seqcount_latch() for details and a full reader/writer * usage example. * * Return: sequence counter raw value. Use the lowest bit as an index for * picking which data copy to read. The full counter must then be checked * with raw_read_seqcount_latch_retry(). */ static __always_inline unsigned raw_read_seqcount_latch(const seqcount_latch_t *s) { /* * Pairs with the first smp_wmb() in raw_write_seqcount_latch(). * Due to the dependent load, a full smp_rmb() is not needed. */ return READ_ONCE(s->seqcount.sequence); } /** * read_seqcount_latch() - pick even/odd latch data copy * @s: Pointer to seqcount_latch_t * * See write_seqcount_latch() for details and a full reader/writer usage * example. * * Return: sequence counter raw value. Use the lowest bit as an index for * picking which data copy to read. The full counter must then be checked * with read_seqcount_latch_retry(). 
*/ static __always_inline unsigned read_seqcount_latch(const seqcount_latch_t *s) { kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX); return raw_read_seqcount_latch(s); } /** * raw_read_seqcount_latch_retry() - end a seqcount_latch_t read section * @s: Pointer to seqcount_latch_t * @start: count, from raw_read_seqcount_latch() * * Return: true if a read section retry is required, else false */ static __always_inline int raw_read_seqcount_latch_retry(const seqcount_latch_t *s, unsigned start) { smp_rmb(); return unlikely(READ_ONCE(s->seqcount.sequence) != start); } /** * read_seqcount_latch_retry() - end a seqcount_latch_t read section * @s: Pointer to seqcount_latch_t * @start: count, from read_seqcount_latch() * * Return: true if a read section retry is required, else false */ static __always_inline int read_seqcount_latch_retry(const seqcount_latch_t *s, unsigned start) { kcsan_atomic_next(0); return raw_read_seqcount_latch_retry(s, start); } /** * raw_write_seqcount_latch() - redirect latch readers to even/odd copy * @s: Pointer to seqcount_latch_t */ static __always_inline void raw_write_seqcount_latch(seqcount_latch_t *s) { smp_wmb(); /* prior stores before incrementing "sequence" */ s->seqcount.sequence++; smp_wmb(); /* increment "sequence" before following stores */ } /** * write_seqcount_latch_begin() - redirect latch readers to odd copy * @s: Pointer to seqcount_latch_t * * The latch technique is a multiversion concurrency control method that allows * queries during non-atomic modifications. If you can guarantee queries never * interrupt the modification -- e.g. the concurrency is strictly between CPUs * -- you most likely do not need this. * * Where the traditional RCU/lockless data structures rely on atomic * modifications to ensure queries observe either the old or the new state the * latch allows the same for non-atomic updates. The trade-off is doubling the * cost of storage; we have to maintain two copies of the entire data * structure. 
* * Very simply put: we first modify one copy and then the other. This ensures * there is always one copy in a stable state, ready to give us an answer. * * The basic form is a data structure like:: * * struct latch_struct { * seqcount_latch_t seq; * struct data_struct data[2]; * }; * * Where a modification, which is assumed to be externally serialized, does the * following:: * * void latch_modify(struct latch_struct *latch, ...) * { * write_seqcount_latch_begin(&latch->seq); * modify(latch->data[0], ...); * write_seqcount_latch(&latch->seq); * modify(latch->data[1], ...); * write_seqcount_latch_end(&latch->seq); * } * * The query will have a form like:: * * struct entry *latch_query(struct latch_struct *latch, ...) * { * struct entry *entry; * unsigned seq, idx; * * do { * seq = read_seqcount_latch(&latch->seq); * * idx = seq & 0x01; * entry = data_query(latch->data[idx], ...); * * // This includes needed smp_rmb() * } while (read_seqcount_latch_retry(&latch->seq, seq)); * * return entry; * } * * So during the modification, queries are first redirected to data[1]. Then we * modify data[0]. When that is complete, we redirect queries back to data[0] * and we can modify data[1]. * * NOTE: * * The non-requirement for atomic modifications does _NOT_ include * the publishing of new entries in the case where data is a dynamic * data structure. * * An iteration might start in data[0] and get suspended long enough * to miss an entire modification sequence, once it resumes it might * observe the new entry. * * NOTE2: * * When data is a dynamic data structure; one should use regular RCU * patterns to manage the lifetimes of the objects within. 
*/ static __always_inline void write_seqcount_latch_begin(seqcount_latch_t *s) { kcsan_nestable_atomic_begin(); raw_write_seqcount_latch(s); } /** * write_seqcount_latch() - redirect latch readers to even copy * @s: Pointer to seqcount_latch_t */ static __always_inline void write_seqcount_latch(seqcount_latch_t *s) { raw_write_seqcount_latch(s); } /** * write_seqcount_latch_end() - end a seqcount_latch_t write section * @s: Pointer to seqcount_latch_t * * Marks the end of a seqcount_latch_t writer section, after all copies of the * latch-protected data have been updated. */ static __always_inline void write_seqcount_latch_end(seqcount_latch_t *s) { kcsan_nestable_atomic_end(); } #define __SEQLOCK_UNLOCKED(lockname) \ { \ .seqcount = SEQCNT_SPINLOCK_ZERO(lockname, &(lockname).lock), \ .lock = __SPIN_LOCK_UNLOCKED(lockname) \ } /** * seqlock_init() - dynamic initializer for seqlock_t * @sl: Pointer to the seqlock_t instance */ #define seqlock_init(sl) \ do { \ spin_lock_init(&(sl)->lock); \ seqcount_spinlock_init(&(sl)->seqcount, &(sl)->lock); \ } while (0) /** * DEFINE_SEQLOCK(sl) - Define a statically allocated seqlock_t * @sl: Name of the seqlock_t instance */ #define DEFINE_SEQLOCK(sl) \ seqlock_t sl = __SEQLOCK_UNLOCKED(sl) /** * read_seqbegin() - start a seqlock_t read side critical section * @sl: Pointer to seqlock_t * * Return: count, to be passed to read_seqretry() */ static inline unsigned read_seqbegin(const seqlock_t *sl) { return read_seqcount_begin(&sl->seqcount); } /** * read_seqretry() - end a seqlock_t read side section * @sl: Pointer to seqlock_t * @start: count, from read_seqbegin() * * read_seqretry closes the read side critical section of given seqlock_t. * If the critical section was invalid, it must be ignored (and typically * retried). 
* * Return: true if a read section retry is required, else false */ static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start) { return read_seqcount_retry(&sl->seqcount, start); } /* * For all seqlock_t write side functions, use the internal * do_write_seqcount_begin() instead of generic write_seqcount_begin(). * This way, no redundant lockdep_assert_held() checks are added. */ /** * write_seqlock() - start a seqlock_t write side critical section * @sl: Pointer to seqlock_t * * write_seqlock opens a write side critical section for the given * seqlock_t. It also implicitly acquires the spinlock_t embedded inside * that sequential lock. All seqlock_t write side sections are thus * automatically serialized and non-preemptible. * * Context: if the seqlock_t read section, or other write side critical * sections, can be invoked from hardirq or softirq contexts, use the * _irqsave or _bh variants of this function instead. */ static inline void write_seqlock(seqlock_t *sl) { spin_lock(&sl->lock); do_write_seqcount_begin(&sl->seqcount.seqcount); } /** * write_sequnlock() - end a seqlock_t write side critical section * @sl: Pointer to seqlock_t * * write_sequnlock closes the (serialized and non-preemptible) write side * critical section of given seqlock_t. */ static inline void write_sequnlock(seqlock_t *sl) { do_write_seqcount_end(&sl->seqcount.seqcount); spin_unlock(&sl->lock); } /** * write_seqlock_bh() - start a softirqs-disabled seqlock_t write section * @sl: Pointer to seqlock_t * * _bh variant of write_seqlock(). Use only if the read side section, or * other write side sections, can be invoked from softirq contexts. 
*/ static inline void write_seqlock_bh(seqlock_t *sl) { spin_lock_bh(&sl->lock); do_write_seqcount_begin(&sl->seqcount.seqcount); } /** * write_sequnlock_bh() - end a softirqs-disabled seqlock_t write section * @sl: Pointer to seqlock_t * * write_sequnlock_bh closes the serialized, non-preemptible, and * softirqs-disabled, seqlock_t write side critical section opened with * write_seqlock_bh(). */ static inline void write_sequnlock_bh(seqlock_t *sl) { do_write_seqcount_end(&sl->seqcount.seqcount); spin_unlock_bh(&sl->lock); } /** * write_seqlock_irq() - start a non-interruptible seqlock_t write section * @sl: Pointer to seqlock_t * * _irq variant of write_seqlock(). Use only if the read side section, or * other write sections, can be invoked from hardirq contexts. */ static inline void write_seqlock_irq(seqlock_t *sl) { spin_lock_irq(&sl->lock); do_write_seqcount_begin(&sl->seqcount.seqcount); } /** * write_sequnlock_irq() - end a non-interruptible seqlock_t write section * @sl: Pointer to seqlock_t * * write_sequnlock_irq closes the serialized and non-interruptible * seqlock_t write side section opened with write_seqlock_irq(). */ static inline void write_sequnlock_irq(seqlock_t *sl) { do_write_seqcount_end(&sl->seqcount.seqcount); spin_unlock_irq(&sl->lock); } static inline unsigned long __write_seqlock_irqsave(seqlock_t *sl) { unsigned long flags; spin_lock_irqsave(&sl->lock, flags); do_write_seqcount_begin(&sl->seqcount.seqcount); return flags; } /** * write_seqlock_irqsave() - start a non-interruptible seqlock_t write * section * @lock: Pointer to seqlock_t * @flags: Stack-allocated storage for saving caller's local interrupt * state, to be passed to write_sequnlock_irqrestore(). * * _irqsave variant of write_seqlock(). Use it only if the read side * section, or other write sections, can be invoked from hardirq context. 
*/ #define write_seqlock_irqsave(lock, flags) \ do { flags = __write_seqlock_irqsave(lock); } while (0) /** * write_sequnlock_irqrestore() - end non-interruptible seqlock_t write * section * @sl: Pointer to seqlock_t * @flags: Caller's saved interrupt state, from write_seqlock_irqsave() * * write_sequnlock_irqrestore closes the serialized and non-interruptible * seqlock_t write section previously opened with write_seqlock_irqsave(). */ static inline void write_sequnlock_irqrestore(seqlock_t *sl, unsigned long flags) { do_write_seqcount_end(&sl->seqcount.seqcount); spin_unlock_irqrestore(&sl->lock, flags); } /** * read_seqlock_excl() - begin a seqlock_t locking reader section * @sl: Pointer to seqlock_t * * read_seqlock_excl opens a seqlock_t locking reader critical section. A * locking reader exclusively locks out *both* other writers *and* other * locking readers, but it does not update the embedded sequence number. * * Locking readers act like a normal spin_lock()/spin_unlock(). * * Context: if the seqlock_t write section, *or other read sections*, can * be invoked from hardirq or softirq contexts, use the _irqsave or _bh * variant of this function instead. * * The opened read section must be closed with read_sequnlock_excl(). */ static inline void read_seqlock_excl(seqlock_t *sl) { spin_lock(&sl->lock); } /** * read_sequnlock_excl() - end a seqlock_t locking reader critical section * @sl: Pointer to seqlock_t */ static inline void read_sequnlock_excl(seqlock_t *sl) { spin_unlock(&sl->lock); } /** * read_seqlock_excl_bh() - start a seqlock_t locking reader section with * softirqs disabled * @sl: Pointer to seqlock_t * * _bh variant of read_seqlock_excl(). Use this variant only if the * seqlock_t write side section, *or other read sections*, can be invoked * from softirq contexts. 
*/ static inline void read_seqlock_excl_bh(seqlock_t *sl) { spin_lock_bh(&sl->lock); } /** * read_sequnlock_excl_bh() - stop a seqlock_t softirq-disabled locking * reader section * @sl: Pointer to seqlock_t */ static inline void read_sequnlock_excl_bh(seqlock_t *sl) { spin_unlock_bh(&sl->lock); } /** * read_seqlock_excl_irq() - start a non-interruptible seqlock_t locking * reader section * @sl: Pointer to seqlock_t * * _irq variant of read_seqlock_excl(). Use this only if the seqlock_t * write side section, *or other read sections*, can be invoked from a * hardirq context. */ static inline void read_seqlock_excl_irq(seqlock_t *sl) { spin_lock_irq(&sl->lock); } /** * read_sequnlock_excl_irq() - end an interrupts-disabled seqlock_t * locking reader section * @sl: Pointer to seqlock_t */ static inline void read_sequnlock_excl_irq(seqlock_t *sl) { spin_unlock_irq(&sl->lock); } static inline unsigned long __read_seqlock_excl_irqsave(seqlock_t *sl) { unsigned long flags; spin_lock_irqsave(&sl->lock, flags); return flags; } /** * read_seqlock_excl_irqsave() - start a non-interruptible seqlock_t * locking reader section * @lock: Pointer to seqlock_t * @flags: Stack-allocated storage for saving caller's local interrupt * state, to be passed to read_sequnlock_excl_irqrestore(). * * _irqsave variant of read_seqlock_excl(). Use this only if the seqlock_t * write side section, *or other read sections*, can be invoked from a * hardirq context. 
*/ #define read_seqlock_excl_irqsave(lock, flags) \ do { flags = __read_seqlock_excl_irqsave(lock); } while (0) /** * read_sequnlock_excl_irqrestore() - end non-interruptible seqlock_t * locking reader section * @sl: Pointer to seqlock_t * @flags: Caller saved interrupt state, from read_seqlock_excl_irqsave() */ static inline void read_sequnlock_excl_irqrestore(seqlock_t *sl, unsigned long flags) { spin_unlock_irqrestore(&sl->lock, flags); } /** * read_seqbegin_or_lock() - begin a seqlock_t lockless or locking reader * @lock: Pointer to seqlock_t * @seq: Marker and return parameter. If the passed value is even, the * reader will become a *lockless* seqlock_t reader as in read_seqbegin(). * If the passed value is odd, the reader will become a *locking* reader * as in read_seqlock_excl(). In the first call to this function, the * caller *must* initialize and pass an even value to @seq; this way, a * lockless read can be optimistically tried first. * * read_seqbegin_or_lock is an API designed to optimistically try a normal * lockless seqlock_t read section first. If an odd counter is found, the * lockless read trial has failed, and the next read iteration transforms * itself into a full seqlock_t locking reader. * * This is typically used to avoid seqlock_t lockless readers starvation * (too many retry loops) in the case of a sharp spike in write side * activity. * * Context: if the seqlock_t write section, *or other read sections*, can * be invoked from hardirq or softirq contexts, use the _irqsave or _bh * variant of this function instead. * * Check Documentation/locking/seqlock.rst for template example code. * * Return: the encountered sequence counter value, through the @seq * parameter, which is overloaded as a return parameter. This returned * value must be checked with need_seqretry(). If the read section needs to * be retried, this returned value must also be passed as the @seq * parameter of the next read_seqbegin_or_lock() iteration. * * Note: the lockless branch stores the result of read_seqbegin(), which * waits for an even count, so this function never turns @seq odd by * itself; a caller that wants the locking retry has to set the low bit of * @seq before calling again.
*/ static inline void read_seqbegin_or_lock(seqlock_t *lock, int *seq) { if (!(*seq & 1)) /* Even */ *seq = read_seqbegin(lock); else /* Odd */ read_seqlock_excl(lock); } /** * need_seqretry() - validate seqlock_t "locking or lockless" read section * @lock: Pointer to seqlock_t * @seq: sequence count, from read_seqbegin_or_lock() * * Return: true if a read section retry is required, false otherwise */ static inline int need_seqretry(seqlock_t *lock, int seq) { return !(seq & 1) && read_seqretry(lock, seq); } /** * done_seqretry() - end seqlock_t "locking or lockless" reader section * @lock: Pointer to seqlock_t * @seq: count, from read_seqbegin_or_lock() * * done_seqretry finishes the seqlock_t read side critical section started * with read_seqbegin_or_lock() and validated by need_seqretry(). */ static inline void done_seqretry(seqlock_t *lock, int seq) { if (seq & 1) read_sequnlock_excl(lock); } /** * read_seqbegin_or_lock_irqsave() - begin a seqlock_t lockless reader, or * a non-interruptible locking reader * @lock: Pointer to seqlock_t * @seq: Marker and return parameter. Check read_seqbegin_or_lock(). * * This is the _irqsave variant of read_seqbegin_or_lock(). Use it only if * the seqlock_t write section, *or other read sections*, can be invoked * from hardirq context. * * Note: Interrupts will be disabled only for "locking reader" mode. * * Return: * * 1. The saved local interrupts state in case of a locking reader, to * be passed to done_seqretry_irqrestore(). * * 2. The encountered sequence counter value, returned through @seq * overloaded as a return parameter. Check read_seqbegin_or_lock().
*/ static inline unsigned long read_seqbegin_or_lock_irqsave(seqlock_t *lock, int *seq) { unsigned long flags = 0; if (!(*seq & 1)) /* Even */ *seq = read_seqbegin(lock); else /* Odd */ read_seqlock_excl_irqsave(lock, flags); return flags; } /** * done_seqretry_irqrestore() - end a seqlock_t lockless reader, or a * non-interruptible locking reader section * @lock: Pointer to seqlock_t * @seq: Count, from read_seqbegin_or_lock_irqsave() * @flags: Caller's saved local interrupt state in case of a locking * reader, also from read_seqbegin_or_lock_irqsave() * * This is the _irqrestore variant of done_seqretry(). The read section * must've been opened with read_seqbegin_or_lock_irqsave(), and validated * by need_seqretry(). */ static inline void done_seqretry_irqrestore(seqlock_t *lock, int seq, unsigned long flags) { if (seq & 1) read_sequnlock_excl_irqrestore(lock, flags); } #endif /* __LINUX_SEQLOCK_H */ |
211 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 | /* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner */ #ifndef _NET_BATMAN_ADV_MESH_INTERFACE_H_ #define _NET_BATMAN_ADV_MESH_INTERFACE_H_ #include "main.h" #include <linux/kref.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/types.h> #include <net/rtnetlink.h> int batadv_skb_head_push(struct sk_buff *skb, unsigned int len); void batadv_interface_rx(struct net_device *mesh_iface, struct sk_buff *skb, int hdr_size, struct batadv_orig_node *orig_node); bool batadv_meshif_is_valid(const struct net_device *net_dev); extern struct rtnl_link_ops batadv_link_ops; int batadv_meshif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid); void batadv_meshif_vlan_release(struct kref *ref); struct batadv_meshif_vlan *batadv_meshif_vlan_get(struct batadv_priv *bat_priv, unsigned short vid); /** * batadv_meshif_vlan_put() - decrease the vlan object refcounter and * possibly release it * @vlan: the vlan object to release */ static inline void batadv_meshif_vlan_put(struct batadv_meshif_vlan *vlan) { if (!vlan) return; kref_put(&vlan->refcount, batadv_meshif_vlan_release); } #endif /* _NET_BATMAN_ADV_MESH_INTERFACE_H_ */ |
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef LINUX_MLD_H
#define LINUX_MLD_H

#include <linux/in6.h>
#include <linux/icmpv6.h>

/* MLDv1 Query/Report/Done */
struct mld_msg {
	struct icmp6hdr		mld_hdr;
	struct in6_addr		mld_mca;
};

#define mld_type		mld_hdr.icmp6_type
#define mld_code		mld_hdr.icmp6_code
#define mld_cksum		mld_hdr.icmp6_cksum
#define mld_maxdelay		mld_hdr.icmp6_maxdelay
#define mld_reserved		mld_hdr.icmp6_dataun.un_data16[1]

/* Multicast Listener Discovery version 2 headers */

/* MLDv2 Report */
struct mld2_grec {
	__u8		grec_type;
	__u8		grec_auxwords;
	__be16		grec_nsrcs;
	struct in6_addr	grec_mca;
	struct in6_addr	grec_src[];
};

struct mld2_report {
	struct icmp6hdr		mld2r_hdr;
	struct mld2_grec	mld2r_grec[];
};

#define mld2r_type		mld2r_hdr.icmp6_type
#define mld2r_resv1		mld2r_hdr.icmp6_code
#define mld2r_cksum		mld2r_hdr.icmp6_cksum
#define mld2r_resv2		mld2r_hdr.icmp6_dataun.un_data16[0]
#define mld2r_ngrec		mld2r_hdr.icmp6_dataun.un_data16[1]

/* MLDv2 Query */
struct mld2_query {
	struct icmp6hdr		mld2q_hdr;
	struct in6_addr		mld2q_mca;
#if defined(__LITTLE_ENDIAN_BITFIELD)
	__u8			mld2q_qrv:3,
				mld2q_suppress:1,
				mld2q_resv2:4;
#elif defined(__BIG_ENDIAN_BITFIELD)
	__u8			mld2q_resv2:4,
				mld2q_suppress:1,
				mld2q_qrv:3;
#else
#error "Please fix <asm/byteorder.h>"
#endif
	__u8			mld2q_qqic;
	__be16			mld2q_nsrcs;
	struct in6_addr		mld2q_srcs[];
};

#define mld2q_type		mld2q_hdr.icmp6_type
#define mld2q_code		mld2q_hdr.icmp6_code
#define mld2q_cksum		mld2q_hdr.icmp6_cksum
#define mld2q_mrc		mld2q_hdr.icmp6_maxdelay
#define mld2q_resv1		mld2q_hdr.icmp6_dataun.un_data16[1]

/* RFC3810, 5.1.3. Maximum Response Code:
 *
 * If Maximum Response Code >= 32768, Maximum Response Code represents a
 * floating-point value as follows:
 *
 *  0 1 2 3 4 5 6 7 8 9 A B C D E F
 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 * |1| exp |          mant         |
 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 */
#define MLDV2_MRC_EXP(value)	(((value) >> 12) & 0x0007)
#define MLDV2_MRC_MAN(value)	((value) & 0x0fff)

/* RFC3810, 5.1.9. QQIC (Querier's Query Interval Code):
 *
 * If QQIC >= 128, QQIC represents a floating-point value as follows:
 *
 *  0 1 2 3 4 5 6 7
 * +-+-+-+-+-+-+-+-+
 * |1| exp | mant  |
 * +-+-+-+-+-+-+-+-+
 */
#define MLDV2_QQIC_EXP(value)	(((value) >> 4) & 0x07)
#define MLDV2_QQIC_MAN(value)	((value) & 0x0f)

#define MLD_EXP_MIN_LIMIT	32768UL
#define MLDV1_MRD_MAX_COMPAT	(MLD_EXP_MIN_LIMIT - 1)

#define MLD_MAX_QUEUE		8
#define MLD_MAX_SKBS		32

/*
 * Decode the Maximum Response Code of an MLDv2 query (RFC3810, 5.1.3).
 *
 * The 16-bit field is converted from network byte order. Values below
 * MLD_EXP_MIN_LIMIT are returned unchanged; larger values are treated as the
 * exp/mant encoding shown above and expanded to
 * (mant | 0x1000) << (exp + 3).
 */
static inline unsigned long mldv2_mrc(const struct mld2_query *mlh2)
{
	unsigned long code = ntohs(mlh2->mld2q_mrc);
	unsigned long exponent, mantissa;

	/* Plain integer range: the code is the value itself */
	if (code < MLD_EXP_MIN_LIMIT)
		return code;

	exponent = MLDV2_MRC_EXP(code);
	mantissa = MLDV2_MRC_MAN(code);

	return (mantissa | 0x1000) << (exponent + 3);
}

#endif
25 15 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 | /* * net/tipc/eth_media.c: Ethernet bearer support for TIPC * * Copyright (c) 2001-2007, 2013-2014, Ericsson AB * Copyright (c) 2005-2008, 2011-2013, Wind River Systems * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include "core.h" #include "bearer.h" /* Convert Ethernet address (media address format) to string */ static int tipc_eth_addr2str(struct tipc_media_addr *addr, char *strbuf, int bufsz) { if (bufsz < 18) /* 18 = strlen("aa:bb:cc:dd:ee:ff\0") */ return 1; sprintf(strbuf, "%pM", addr->value); return 0; } /* Convert from media address format to discovery message addr format */ static int tipc_eth_addr2msg(char *msg, struct tipc_media_addr *addr) { memset(msg, 0, TIPC_MEDIA_INFO_SIZE); msg[TIPC_MEDIA_TYPE_OFFSET] = TIPC_MEDIA_TYPE_ETH; memcpy(msg + TIPC_MEDIA_ADDR_OFFSET, addr->value, ETH_ALEN); return 0; } /* Convert raw mac address format to media addr format */ static int tipc_eth_raw2addr(struct tipc_bearer *b, struct tipc_media_addr *addr, const char *msg) { memset(addr, 0, sizeof(*addr)); ether_addr_copy(addr->value, msg); addr->media_id = TIPC_MEDIA_TYPE_ETH; addr->broadcast = is_broadcast_ether_addr(addr->value); return 0; } /* Convert discovery msg addr format to Ethernet media addr format */ static int tipc_eth_msg2addr(struct tipc_bearer *b, struct tipc_media_addr *addr, char *msg) { /* Skip past preamble: */ msg += TIPC_MEDIA_ADDR_OFFSET; return tipc_eth_raw2addr(b, addr, msg); } /* Ethernet media registration info */ struct tipc_media eth_media_info = { .send_msg = tipc_l2_send_msg, .enable_media = tipc_enable_l2_media, .disable_media = tipc_disable_l2_media, .addr2str = tipc_eth_addr2str, .addr2msg = tipc_eth_addr2msg, .msg2addr = 
tipc_eth_msg2addr, .raw2addr = tipc_eth_raw2addr, .priority = TIPC_DEF_LINK_PRI, .tolerance = TIPC_DEF_LINK_TOL, .min_win = TIPC_DEF_LINK_WIN, .max_win = TIPC_MAX_LINK_WIN, .type_id = TIPC_MEDIA_TYPE_ETH, .hwaddr_len = ETH_ALEN, .name = "eth" }; |
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * net/dst.h	Protocol independent destination cache definitions.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 */

#ifndef _NET_DST_H
#define _NET_DST_H

#include <net/dst_ops.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/rcupdate.h>
#include <linux/bug.h>
#include <linux/jiffies.h>
#include <linux/refcount.h>
#include <linux/rcuref.h>
#include <net/neighbour.h>
#include <asm/processor.h>
#include <linux/indirect_call_wrapper.h>

struct sk_buff;

/*
 * Destination cache entry.
 *
 * Member order matters: dst_hold() has a BUILD_BUG_ON() that requires
 * __rcuref to sit at a multiple of 64 bytes, and the comments below explain
 * which members are deliberately kept away from the __rcuref cache line.
 */
struct dst_entry {
	struct net_device       *dev;
	struct  dst_ops	        *ops;
	/* Pointer to the metrics array with DST_METRICS_FLAGS in the low bits */
	unsigned long		_metrics;
	unsigned long           expires;
#ifdef CONFIG_XFRM
	struct xfrm_state	*xfrm;
#else
	void			*__pad1;
#endif
	int			(*input)(struct sk_buff *);
	int			(*output)(struct net *net, struct sock *sk, struct sk_buff *skb);

	unsigned short		flags;
#define DST_NOXFRM		0x0002
#define DST_NOPOLICY		0x0004
#define DST_NOCOUNT		0x0008
#define DST_FAKE_RTABLE		0x0010
#define DST_XFRM_TUNNEL		0x0020
#define DST_XFRM_QUEUE		0x0040
#define DST_METADATA		0x0080

	/* A non-zero value of dst->obsolete forces by-hand validation
	 * of the route entry.  Positive values are set by the generic
	 * dst layer to indicate that the entry has been forcefully
	 * destroyed.
	 *
	 * Negative values are used by the implementation layer code to
	 * force invocation of the dst_ops->check() method.
	 */
	short			obsolete;
#define DST_OBSOLETE_NONE	0
#define DST_OBSOLETE_DEAD	2
#define DST_OBSOLETE_FORCE_CHK	-1
#define DST_OBSOLETE_KILL	-2
	unsigned short		header_len;	/* more space at head required */
	unsigned short		trailer_len;	/* space to reserve at tail */

	/*
	 * __rcuref wants to be on a different cache line from
	 * input/output/ops or performance tanks badly
	 */
#ifdef CONFIG_64BIT
	rcuref_t		__rcuref;	/* 64-bit offset 64 */
#endif
	int			__use;
	unsigned long		lastuse;
	struct rcu_head		rcu_head;
	short			error;
	short			__pad;
	__u32			tclassid;
#ifndef CONFIG_64BIT
	struct lwtunnel_state   *lwtstate;
	rcuref_t		__rcuref;	/* 32-bit offset 64 */
#endif
	netdevice_tracker	dev_tracker;

	/*
	 * Used by rtable and rt6_info. Moves lwtstate into the next cache
	 * line on 64bit so that lwtstate does not cause false sharing with
	 * __rcuref under contention of __rcuref. This also puts the
	 * frequently accessed members of rtable and rt6_info out of the
	 * __rcuref cache line.
	 */
	struct list_head	rt_uncached;
	struct uncached_list	*rt_uncached_list;
#ifdef CONFIG_64BIT
	struct lwtunnel_state	*lwtstate;
#endif
};

/* Metrics array plus its reference count */
struct dst_metrics {
	u32		metrics[RTAX_MAX];
	refcount_t	refcnt;
} __aligned(4);		/* Low pointer bits contain DST_METRICS_FLAGS */
extern const struct dst_metrics dst_default_metrics;

u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old);

/* Flag bits stored in the low bits of dst_entry::_metrics */
#define DST_METRICS_READ_ONLY		0x1UL
#define DST_METRICS_REFCOUNTED		0x2UL
#define DST_METRICS_FLAGS		0x3UL
/* Strip the flag bits from a metrics word to get the u32 array pointer */
#define __DST_METRICS_PTR(Y)	\
	((u32 *)((Y) & ~DST_METRICS_FLAGS))
#define DST_METRICS_PTR(X)	__DST_METRICS_PTR((X)->_metrics)

/* True when the DST_METRICS_READ_ONLY bit is set in dst->_metrics */
static inline bool dst_metrics_read_only(const struct dst_entry *dst)
{
	return dst->_metrics & DST_METRICS_READ_ONLY;
}

void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old);

/*
 * Pass the metrics word to __dst_destroy_metrics_generic() unless it is
 * flagged read-only, in which case nothing is done.
 */
static inline void dst_destroy_metrics_generic(struct dst_entry *dst)
{
	unsigned long val = dst->_metrics;
	if (!(val & DST_METRICS_READ_ONLY))
		__dst_destroy_metrics_generic(dst, val);
}

/*
 * Return a metrics array for writing. Read-only metrics are routed through
 * dst->ops->cow_metrics() and its result is returned; otherwise the array
 * pointer is extracted directly. A zero _metrics word triggers BUG_ON().
 */
static inline u32 *dst_metrics_write_ptr(struct dst_entry *dst)
{
	unsigned long p = dst->_metrics;

	BUG_ON(!p);

	if (p & DST_METRICS_READ_ONLY)
		return dst->ops->cow_metrics(dst, p);
	return __DST_METRICS_PTR(p);
}

/* This may only be invoked before the entry has reached global
 * visibility.
 */
static inline void dst_init_metrics(struct dst_entry *dst,
				    const u32 *src_metrics,
				    bool read_only)
{
	dst->_metrics = ((unsigned long) src_metrics) |
		(read_only ? DST_METRICS_READ_ONLY : 0);
}

/*
 * Copy all RTAX_MAX metrics from @src into @dest. Nothing is copied when no
 * writable array could be obtained for @dest.
 */
static inline void dst_copy_metrics(struct dst_entry *dest, const struct dst_entry *src)
{
	u32 *dst_metrics = dst_metrics_write_ptr(dest);

	if (dst_metrics) {
		u32 *src_metrics = DST_METRICS_PTR(src);

		memcpy(dst_metrics, src_metrics, RTAX_MAX * sizeof(u32));
	}
}

/* Metrics array pointer with the flag bits masked off */
static inline u32 *dst_metrics_ptr(struct dst_entry *dst)
{
	return DST_METRICS_PTR(dst);
}

/* Read one metric; @metric is 1-based, so it is stored at index metric-1 */
static inline u32
dst_metric_raw(const struct dst_entry *dst, const int metric)
{
	u32 *p = DST_METRICS_PTR(dst);

	return p[metric-1];
}

/*
 * Like dst_metric_raw(), but warns once when asked for RTAX_HOPLIMIT,
 * RTAX_ADVMSS or RTAX_MTU.
 */
static inline u32
dst_metric(const struct dst_entry *dst, const int metric)
{
	WARN_ON_ONCE(metric == RTAX_HOPLIMIT ||
		     metric == RTAX_ADVMSS ||
		     metric == RTAX_MTU);
	return dst_metric_raw(dst, metric);
}

/* RTAX_ADVMSS metric, or dst->ops->default_advmss() when the metric is 0 */
static inline u32
dst_metric_advmss(const struct dst_entry *dst)
{
	u32 advmss = dst_metric_raw(dst, RTAX_ADVMSS);

	if (!advmss)
		advmss = dst->ops->default_advmss(dst);

	return advmss;
}

/* Store @val for the 1-based @metric if a writable array is available */
static inline void dst_metric_set(struct dst_entry *dst, int metric, u32 val)
{
	u32 *p = dst_metrics_write_ptr(dst);

	if (p)
		p[metric-1] = val;
}

/* Kernel-internal feature bits that are unallocated in user space. */
#define DST_FEATURE_ECN_CA	(1U << 31)

#define DST_FEATURE_MASK	(DST_FEATURE_ECN_CA)
#define DST_FEATURE_ECN_MASK	(DST_FEATURE_ECN_CA | RTAX_FEATURE_ECN)

/* RTAX_FEATURES metric masked with @feature */
static inline u32
dst_feature(const struct dst_entry *dst, u32 feature)
{
	return dst_metric(dst, RTAX_FEATURES) & feature;
}

INDIRECT_CALLABLE_DECLARE(unsigned int ip6_mtu(const struct dst_entry *));
INDIRECT_CALLABLE_DECLARE(unsigned int ipv4_mtu(const struct dst_entry *));
/* MTU as reported by dst->ops->mtu() */
static inline u32 dst_mtu(const struct dst_entry *dst)
{
	return INDIRECT_CALL_INET(dst->ops->mtu, ip6_mtu, ipv4_mtu, dst);
}

/* RTT metrics are stored in milliseconds for user ABI, but used as jiffies */
static inline unsigned long dst_metric_rtt(const struct dst_entry *dst, int metric)
{
	return msecs_to_jiffies(dst_metric(dst, metric));
}

/* Non-zero when bit @metric is set in the RTAX_LOCK metric */
static inline int
dst_metric_locked(const struct dst_entry *dst, int metric)
{
	return dst_metric(dst, RTAX_LOCK) & (1 << metric);
}

/* Take a reference on @dst; warns if rcuref_get() fails */
static inline void dst_hold(struct dst_entry *dst)
{
	/*
	 * If your kernel compilation stops here, please check
	 * the placement of __rcuref in struct dst_entry
	 */
	BUILD_BUG_ON(offsetof(struct dst_entry, __rcuref) & 63);
	WARN_ON(!rcuref_get(&dst->__rcuref));
}

/*
 * Record a use of @dst at @time without touching the reference count.
 * The counters are only written when @time differs from the stored lastuse.
 */
static inline void dst_use_noref(struct dst_entry *dst, unsigned long time)
{
	if (unlikely(time != dst->lastuse)) {
		dst->__use++;
		dst->lastuse = time;
	}
}

/* NULL-tolerant dst_hold(); returns @dst unchanged */
static inline struct dst_entry *dst_clone(struct dst_entry *dst)
{
	if (dst)
		dst_hold(dst);
	return dst;
}

void dst_release(struct dst_entry *dst);

void dst_release_immediate(struct dst_entry *dst);

/*
 * Release the dst encoded in @refdst unless the SKB_DST_NOREF bit marks it
 * as not reference counted.
 */
static inline void refdst_drop(unsigned long refdst)
{
	if (!(refdst & SKB_DST_NOREF))
		dst_release((struct dst_entry *)(refdst & SKB_DST_PTRMASK));
}

/**
 * skb_dst_drop - drops skb dst
 * @skb: buffer
 *
 * Drops dst reference count if a reference was taken.
 */
static inline void skb_dst_drop(struct sk_buff *skb)
{
	if (skb->_skb_refdst) {
		refdst_drop(skb->_skb_refdst);
		skb->_skb_refdst = 0UL;
	}
}

/*
 * Install @refdst on @nskb, taking a reference unless @refdst carries the
 * SKB_DST_NOREF bit. slow_gro is set whenever @refdst is non-zero.
 */
static inline void __skb_dst_copy(struct sk_buff *nskb, unsigned long refdst)
{
	nskb->slow_gro |= !!refdst;
	nskb->_skb_refdst = refdst;
	if (!(nskb->_skb_refdst & SKB_DST_NOREF))
		dst_clone(skb_dst(nskb));
}

/* Copy the dst of @oskb onto @nskb, see __skb_dst_copy() */
static inline void skb_dst_copy(struct sk_buff *nskb, const struct sk_buff *oskb)
{
	__skb_dst_copy(nskb, oskb->_skb_refdst);
}

/**
 * dst_hold_safe - Take a reference on a dst if possible
 * @dst: pointer to dst entry
 *
 * This helper returns false if it could not safely
 * take a reference on a dst.
 */
static inline bool dst_hold_safe(struct dst_entry *dst)
{
	return rcuref_get(&dst->__rcuref);
}

/**
 * skb_dst_force - makes sure skb dst is refcounted
 * @skb: buffer
 *
 * If dst is not yet refcounted and not destroyed, grab a ref on it.
 * Returns: true if dst is refcounted.
 */
static inline bool skb_dst_force(struct sk_buff *skb)
{
	if (skb_dst_is_noref(skb)) {
		struct dst_entry *dst = skb_dst(skb);

		WARN_ON(!rcu_read_lock_held());
		/* If no reference could be taken the skb ends up without a dst */
		if (!dst_hold_safe(dst))
			dst = NULL;

		skb->_skb_refdst = (unsigned long)dst;
		skb->slow_gro |= !!dst;
	}

	return skb->_skb_refdst != 0UL;
}


/**
 *	__skb_tunnel_rx - prepare skb for rx reinsert
 *	@skb: buffer
 *	@dev: tunnel device
 *	@net: netns for packet i/o
 *
 *	After decapsulation, packet is going to re-enter (netif_rx()) our stack,
 *	so make some cleanups. (no accounting done)
 */
static inline void __skb_tunnel_rx(struct sk_buff *skb, struct net_device *dev,
				   struct net *net)
{
	skb->dev = dev;

	/*
	 * Clear hash so that we can recalculate the hash for the
	 * encapsulated packet, unless we have already determined the hash
	 * over the L4 4-tuple.
	 */
	skb_clear_hash_if_not_l4(skb);
	skb_set_queue_mapping(skb, 0);
	/* The second argument is true when @net differs from dev's netns */
	skb_scrub_packet(skb, !net_eq(net, dev_net(dev)));
}

/**
 *	skb_tunnel_rx - prepare skb for rx reinsert
 *	@skb: buffer
 *	@dev: tunnel device
 *	@net: netns for packet i/o
 *
 *	After decapsulation, packet is going to re-enter (netif_rx()) our stack,
 *	so make some cleanups, and perform accounting.
 *	Note: this accounting is not SMP safe.
 *
 *	NOTE(review): the counters are updated through DEV_STATS_INC() and
 *	DEV_STATS_ADD(); confirm whether the "not SMP safe" remark still holds.
 */
static inline void skb_tunnel_rx(struct sk_buff *skb, struct net_device *dev,
				 struct net *net)
{
	DEV_STATS_INC(dev, rx_packets);
	DEV_STATS_ADD(dev, rx_bytes, skb->len);
	__skb_tunnel_rx(skb, dev, net);
}

/*
 * tclassid of the dst attached to @skb. Returns 0 when there is no dst or
 * when CONFIG_IP_ROUTE_CLASSID is not enabled.
 */
static inline u32 dst_tclassid(const struct sk_buff *skb)
{
#ifdef CONFIG_IP_ROUTE_CLASSID
	const struct dst_entry *dst;

	dst = skb_dst(skb);
	if (dst)
		return dst->tclassid;
#endif
	return 0;
}

int dst_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
/* dst_discard_out() with init_net and the skb's own socket */
static inline int dst_discard(struct sk_buff *skb)
{
	return dst_discard_out(&init_net, skb->sk, skb);
}
void *dst_alloc(struct dst_ops *ops, struct net_device *dev,
		int initial_obsolete, unsigned short flags);
void dst_init(struct dst_entry *dst, struct dst_ops *ops,
	      struct net_device *dev, int initial_obsolete,
	      unsigned short flags);
void dst_dev_put(struct dst_entry *dst);

/* Intentionally empty */
static inline void dst_confirm(struct dst_entry *dst)
{
}

/*
 * Neighbour lookup by address through dst->ops->neigh_lookup().
 * An ERR_PTR result is converted to NULL.
 */
static inline struct neighbour *dst_neigh_lookup(const struct dst_entry *dst, const void *daddr)
{
	struct neighbour *n = dst->ops->neigh_lookup(dst, NULL, daddr);
	return IS_ERR(n) ? NULL : n;
}

/*
 * Neighbour lookup by skb. Warns once and returns NULL when the ops have no
 * neigh_lookup method; an ERR_PTR result is converted to NULL.
 */
static inline struct neighbour *dst_neigh_lookup_skb(const struct dst_entry *dst,
						     struct sk_buff *skb)
{
	struct neighbour *n;

	if (WARN_ON_ONCE(!dst->ops->neigh_lookup))
		return NULL;

	n = dst->ops->neigh_lookup(dst, skb, NULL);

	return IS_ERR(n) ? NULL : n;
}

/* Invoke the optional dst->ops->confirm_neigh() method */
static inline void dst_confirm_neigh(const struct dst_entry *dst,
				     const void *daddr)
{
	if (dst->ops->confirm_neigh)
		dst->ops->confirm_neigh(dst, daddr);
}

/* Invoke the optional link_failure method of the dst attached to @skb */
static inline void dst_link_failure(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	if (dst && dst->ops && dst->ops->link_failure)
		dst->ops->link_failure(skb);
}

/*
 * Set dst->expires to jiffies + @timeout, but only when no expiry is stored
 * yet (expires == 0) or the new deadline is earlier than the stored one.
 * A computed deadline of exactly 0 is bumped to 1 so it cannot be confused
 * with the "nothing stored" value.
 */
static inline void dst_set_expires(struct dst_entry *dst, int timeout)
{
	unsigned long expires = jiffies + timeout;

	if (expires == 0)
		expires = 1;

	if (dst->expires == 0 || time_before(expires, dst->expires))
		dst->expires = expires;
}

/*
 * Link-layer overhead: LL_RESERVED_SPACE() of the dst device when @dst is
 * set, otherwise the MAC header length recorded in @skb.
 */
static inline unsigned int dst_dev_overhead(struct dst_entry *dst,
					    struct sk_buff *skb)
{
	if (likely(dst))
		return LL_RESERVED_SPACE(dst->dev);

	return skb->mac_len;
}

INDIRECT_CALLABLE_DECLARE(int ip6_output(struct net *, struct sock *,
					 struct sk_buff *));
INDIRECT_CALLABLE_DECLARE(int ip_output(struct net *, struct sock *,
					 struct sk_buff *));
/* Output packet to network from transport.  */
static inline int dst_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	return INDIRECT_CALL_INET(skb_dst(skb)->output,
				  ip6_output, ip_output,
				  net, sk, skb);
}

INDIRECT_CALLABLE_DECLARE(int ip6_input(struct sk_buff *));
INDIRECT_CALLABLE_DECLARE(int ip_local_deliver(struct sk_buff *));
/* Input packet from network to transport.  */
static inline int dst_input(struct sk_buff *skb)
{
	return INDIRECT_CALL_INET(skb_dst(skb)->input,
				  ip6_input, ip_local_deliver, skb);
}

INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
							  u32));
INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
							   u32));
/*
 * Validate @dst: when dst->obsolete is non-zero the result of
 * dst->ops->check() is returned, otherwise @dst itself.
 */
static inline struct dst_entry *dst_check(struct dst_entry *dst, u32 cookie)
{
	if (dst->obsolete)
		dst = INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check,
					 ipv4_dst_check, dst, cookie);
	return dst;
}

/* Flags for xfrm_lookup flags argument. */
enum {
	XFRM_LOOKUP_ICMP = 1 << 0,
	XFRM_LOOKUP_QUEUE = 1 << 1,
	XFRM_LOOKUP_KEEP_DST_REF = 1 << 2,
};

struct flowi;
#ifndef CONFIG_XFRM
/* Without CONFIG_XFRM the lookup helpers simply hand back @dst_orig */
static inline struct dst_entry *xfrm_lookup(struct net *net,
					    struct dst_entry *dst_orig,
					    const struct flowi *fl,
					    const struct sock *sk,
					    int flags)
{
	return dst_orig;
}

static inline struct dst_entry *
xfrm_lookup_with_ifid(struct net *net, struct dst_entry *dst_orig,
		      const struct flowi *fl, const struct sock *sk,
		      int flags, u32 if_id)
{
	return dst_orig;
}

static inline struct dst_entry *xfrm_lookup_route(struct net *net,
						  struct dst_entry *dst_orig,
						  const struct flowi *fl,
						  const struct sock *sk,
						  int flags)
{
	return dst_orig;
}

/* Without CONFIG_XFRM there is never an xfrm state */
static inline struct xfrm_state *dst_xfrm(const struct dst_entry *dst)
{
	return NULL;
}

#else
struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig,
			      const struct flowi *fl, const struct sock *sk,
			      int flags);

struct dst_entry *xfrm_lookup_with_ifid(struct net *net,
					struct dst_entry *dst_orig,
					const struct flowi *fl,
					const struct sock *sk,
					int flags, u32 if_id);

struct dst_entry *xfrm_lookup_route(struct net *net, struct dst_entry *dst_orig,
				    const struct flowi *fl, const struct sock *sk,
				    int flags);

/* skb attached with this dst needs transformation if dst->xfrm is valid */
static inline struct xfrm_state *dst_xfrm(const struct dst_entry *dst)
{
	return dst->xfrm;
}
#endif

/*
 * Call the optional update_pmtu method of the dst attached to @skb with the
 * final (confirm) argument set to true.
 */
static inline void skb_dst_update_pmtu(struct sk_buff *skb, u32 mtu)
{
	struct dst_entry *dst = skb_dst(skb);

	if (dst && dst->ops->update_pmtu)
		dst->ops->update_pmtu(dst, NULL, skb, mtu, true);
}

/* update dst pmtu but not do neighbor confirm */
static inline void skb_dst_update_pmtu_no_confirm(struct sk_buff *skb, u32 mtu)
{
	struct dst_entry *dst = skb_dst(skb);

	if (dst && dst->ops->update_pmtu)
		dst->ops->update_pmtu(dst, NULL, skb, mtu, false);
}

struct dst_entry *dst_blackhole_check(struct dst_entry *dst, u32 cookie);
void dst_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
			       struct sk_buff *skb, u32 mtu, bool confirm_neigh);
void dst_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
			    struct sk_buff *skb);
u32 *dst_blackhole_cow_metrics(struct dst_entry *dst, unsigned long old);
struct neighbour *dst_blackhole_neigh_lookup(const struct dst_entry *dst,
					     struct sk_buff *skb,
					     const void *daddr);
unsigned int dst_blackhole_mtu(const struct dst_entry *dst);

#endif /* _NET_DST_H */
58 19 31 9 48 9 18 44 4 6 2 3 5 22 32 2 1 50 4 46 22 28 6 5 1 2 1 2 1 2 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> * Copyright (c) 2012-2014 Pablo Neira Ayuso <pablo@netfilter.org> * * Development of this code funded by Astaro AG (http://www.astaro.com/) */ #include <linux/audit.h> #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <net/ipv6.h> #include <net/ip.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_log.h> #include <linux/netdevice.h> static const char *nft_log_null_prefix = ""; struct nft_log { struct nf_loginfo loginfo; char *prefix; }; static bool audit_ip4(struct audit_buffer *ab, struct 
sk_buff *skb) { struct iphdr _iph; const struct iphdr *ih; ih = skb_header_pointer(skb, skb_network_offset(skb), sizeof(_iph), &_iph); if (!ih) return false; audit_log_format(ab, " saddr=%pI4 daddr=%pI4 proto=%hhu", &ih->saddr, &ih->daddr, ih->protocol); return true; } static bool audit_ip6(struct audit_buffer *ab, struct sk_buff *skb) { struct ipv6hdr _ip6h; const struct ipv6hdr *ih; u8 nexthdr; __be16 frag_off; ih = skb_header_pointer(skb, skb_network_offset(skb), sizeof(_ip6h), &_ip6h); if (!ih) return false; nexthdr = ih->nexthdr; ipv6_skip_exthdr(skb, skb_network_offset(skb) + sizeof(_ip6h), &nexthdr, &frag_off); audit_log_format(ab, " saddr=%pI6c daddr=%pI6c proto=%hhu", &ih->saddr, &ih->daddr, nexthdr); return true; } static void nft_log_eval_audit(const struct nft_pktinfo *pkt) { struct sk_buff *skb = pkt->skb; struct audit_buffer *ab; int fam = -1; if (!audit_enabled) return; ab = audit_log_start(NULL, GFP_ATOMIC, AUDIT_NETFILTER_PKT); if (!ab) return; audit_log_format(ab, "mark=%#x", skb->mark); switch (nft_pf(pkt)) { case NFPROTO_BRIDGE: switch (eth_hdr(skb)->h_proto) { case htons(ETH_P_IP): fam = audit_ip4(ab, skb) ? NFPROTO_IPV4 : -1; break; case htons(ETH_P_IPV6): fam = audit_ip6(ab, skb) ? NFPROTO_IPV6 : -1; break; } break; case NFPROTO_IPV4: fam = audit_ip4(ab, skb) ? NFPROTO_IPV4 : -1; break; case NFPROTO_IPV6: fam = audit_ip6(ab, skb) ? NFPROTO_IPV6 : -1; break; } if (fam == -1) audit_log_format(ab, " saddr=? daddr=? 
proto=-1"); audit_log_end(ab); } static void nft_log_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_log *priv = nft_expr_priv(expr); if (priv->loginfo.type == NF_LOG_TYPE_LOG && priv->loginfo.u.log.level == NFT_LOGLEVEL_AUDIT) { nft_log_eval_audit(pkt); return; } nf_log_packet(nft_net(pkt), nft_pf(pkt), nft_hook(pkt), pkt->skb, nft_in(pkt), nft_out(pkt), &priv->loginfo, "%s", priv->prefix); } static const struct nla_policy nft_log_policy[NFTA_LOG_MAX + 1] = { [NFTA_LOG_GROUP] = { .type = NLA_U16 }, [NFTA_LOG_PREFIX] = { .type = NLA_STRING, .len = NF_LOG_PREFIXLEN - 1 }, [NFTA_LOG_SNAPLEN] = { .type = NLA_U32 }, [NFTA_LOG_QTHRESHOLD] = { .type = NLA_U16 }, [NFTA_LOG_LEVEL] = { .type = NLA_U32 }, [NFTA_LOG_FLAGS] = { .type = NLA_U32 }, }; static int nft_log_modprobe(struct net *net, enum nf_log_type t) { switch (t) { case NF_LOG_TYPE_LOG: return nft_request_module(net, "%s", "nf_log_syslog"); case NF_LOG_TYPE_ULOG: return nft_request_module(net, "%s", "nfnetlink_log"); case NF_LOG_TYPE_MAX: break; } return -ENOENT; } static int nft_log_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_log *priv = nft_expr_priv(expr); struct nf_loginfo *li = &priv->loginfo; const struct nlattr *nla; int err; li->type = NF_LOG_TYPE_LOG; if (tb[NFTA_LOG_LEVEL] != NULL && tb[NFTA_LOG_GROUP] != NULL) return -EINVAL; if (tb[NFTA_LOG_GROUP] != NULL) { li->type = NF_LOG_TYPE_ULOG; if (tb[NFTA_LOG_FLAGS] != NULL) return -EINVAL; } nla = tb[NFTA_LOG_PREFIX]; if (nla != NULL) { priv->prefix = kmalloc(nla_len(nla) + 1, GFP_KERNEL_ACCOUNT); if (priv->prefix == NULL) return -ENOMEM; nla_strscpy(priv->prefix, nla, nla_len(nla) + 1); } else { priv->prefix = (char *)nft_log_null_prefix; } switch (li->type) { case NF_LOG_TYPE_LOG: if (tb[NFTA_LOG_LEVEL] != NULL) { li->u.log.level = ntohl(nla_get_be32(tb[NFTA_LOG_LEVEL])); } else { li->u.log.level = NFT_LOGLEVEL_WARNING; } if 
(li->u.log.level > NFT_LOGLEVEL_AUDIT) { err = -EINVAL; goto err1; } if (tb[NFTA_LOG_FLAGS] != NULL) { li->u.log.logflags = ntohl(nla_get_be32(tb[NFTA_LOG_FLAGS])); if (li->u.log.logflags & ~NF_LOG_MASK) { err = -EINVAL; goto err1; } } break; case NF_LOG_TYPE_ULOG: li->u.ulog.group = ntohs(nla_get_be16(tb[NFTA_LOG_GROUP])); if (tb[NFTA_LOG_SNAPLEN] != NULL) { li->u.ulog.flags |= NF_LOG_F_COPY_LEN; li->u.ulog.copy_len = ntohl(nla_get_be32(tb[NFTA_LOG_SNAPLEN])); } if (tb[NFTA_LOG_QTHRESHOLD] != NULL) { li->u.ulog.qthreshold = ntohs(nla_get_be16(tb[NFTA_LOG_QTHRESHOLD])); } break; } if (li->u.log.level == NFT_LOGLEVEL_AUDIT) return 0; err = nf_logger_find_get(ctx->family, li->type); if (err < 0) { if (nft_log_modprobe(ctx->net, li->type) == -EAGAIN) err = -EAGAIN; goto err1; } return 0; err1: if (priv->prefix != nft_log_null_prefix) kfree(priv->prefix); return err; } static void nft_log_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { struct nft_log *priv = nft_expr_priv(expr); struct nf_loginfo *li = &priv->loginfo; if (priv->prefix != nft_log_null_prefix) kfree(priv->prefix); if (li->u.log.level == NFT_LOGLEVEL_AUDIT) return; nf_logger_put(ctx->family, li->type); } static int nft_log_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct nft_log *priv = nft_expr_priv(expr); const struct nf_loginfo *li = &priv->loginfo; if (priv->prefix != nft_log_null_prefix) if (nla_put_string(skb, NFTA_LOG_PREFIX, priv->prefix)) goto nla_put_failure; switch (li->type) { case NF_LOG_TYPE_LOG: if (nla_put_be32(skb, NFTA_LOG_LEVEL, htonl(li->u.log.level))) goto nla_put_failure; if (li->u.log.logflags) { if (nla_put_be32(skb, NFTA_LOG_FLAGS, htonl(li->u.log.logflags))) goto nla_put_failure; } break; case NF_LOG_TYPE_ULOG: if (nla_put_be16(skb, NFTA_LOG_GROUP, htons(li->u.ulog.group))) goto nla_put_failure; if (li->u.ulog.flags & NF_LOG_F_COPY_LEN) { if (nla_put_be32(skb, NFTA_LOG_SNAPLEN, htonl(li->u.ulog.copy_len))) goto nla_put_failure; } 
if (li->u.ulog.qthreshold) { if (nla_put_be16(skb, NFTA_LOG_QTHRESHOLD, htons(li->u.ulog.qthreshold))) goto nla_put_failure; } break; } return 0; nla_put_failure: return -1; } static struct nft_expr_type nft_log_type; static const struct nft_expr_ops nft_log_ops = { .type = &nft_log_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_log)), .eval = nft_log_eval, .init = nft_log_init, .destroy = nft_log_destroy, .dump = nft_log_dump, .reduce = NFT_REDUCE_READONLY, }; static struct nft_expr_type nft_log_type __read_mostly = { .name = "log", .ops = &nft_log_ops, .policy = nft_log_policy, .maxattr = NFTA_LOG_MAX, .owner = THIS_MODULE, }; static int __init nft_log_module_init(void) { return nft_register_expr(&nft_log_type); } static void __exit nft_log_module_exit(void) { nft_unregister_expr(&nft_log_type); } module_init(nft_log_module_init); module_exit(nft_log_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); MODULE_ALIAS_NFT_EXPR("log"); MODULE_DESCRIPTION("Netfilter nf_tables log module"); |
12 13 1 1 196 189 13 1 156 155 6 151 126 124 4 1 4 2 146 116 5 136 2 12 12 2 8 164 4 1 2 1 4 5 168 4 124 20 10 169 2 6 165 5 168 169 169 168 22 22 145 169 40 135 135 134 135 134 134 128 25 17 135 36 37 4 6 6 4 10 9 4 8 19 5 21 2 13 26 29 10 41 40 41 44 24 44 34 23 23 23 44 40 20 40 30 19 19 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 
454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 
954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 
1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 | // SPDX-License-Identifier: GPL-2.0-only /* * (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> * (C) 2011 Patrick McHardy <kaber@trash.net> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/types.h> #include <linux/timer.h> #include <linux/skbuff.h> #include <linux/gfp.h> #include <net/xfrm.h> #include <linux/siphash.h> #include <linux/rtnetlink.h> #include <net/netfilter/nf_conntrack_bpf.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_seqadj.h> #include <net/netfilter/nf_conntrack_zones.h> #include <net/netfilter/nf_nat.h> #include <net/netfilter/nf_nat_helper.h> #include <uapi/linux/netfilter/nf_nat.h> #include "nf_internals.h" #define NF_NAT_MAX_ATTEMPTS 128 #define NF_NAT_HARDER_THRESH (NF_NAT_MAX_ATTEMPTS / 4) static spinlock_t nf_nat_locks[CONNTRACK_LOCKS]; static DEFINE_MUTEX(nf_nat_proto_mutex); static unsigned int nat_net_id __read_mostly; static struct hlist_head *nf_nat_bysource __read_mostly; static unsigned int nf_nat_htable_size __read_mostly; static siphash_aligned_key_t nf_nat_hash_rnd; struct nf_nat_lookup_hook_priv { struct nf_hook_entries __rcu *entries; struct rcu_head rcu_head; }; struct nf_nat_hooks_net { struct nf_hook_ops *nat_hook_ops; unsigned int users; }; struct nat_net { struct nf_nat_hooks_net nat_proto_net[NFPROTO_NUMPROTO]; }; #ifdef CONFIG_XFRM static void nf_nat_ipv4_decode_session(struct sk_buff *skb, const struct nf_conn *ct, enum ip_conntrack_dir dir, unsigned long statusbit, struct flowi *fl) { const struct nf_conntrack_tuple *t = &ct->tuplehash[dir].tuple; struct flowi4 *fl4 = &fl->u.ip4; if (ct->status & statusbit) { fl4->daddr = t->dst.u3.ip; if (t->dst.protonum == IPPROTO_TCP || t->dst.protonum == IPPROTO_UDP || t->dst.protonum == 
IPPROTO_UDPLITE || t->dst.protonum == IPPROTO_DCCP || t->dst.protonum == IPPROTO_SCTP) fl4->fl4_dport = t->dst.u.all; } statusbit ^= IPS_NAT_MASK; if (ct->status & statusbit) { fl4->saddr = t->src.u3.ip; if (t->dst.protonum == IPPROTO_TCP || t->dst.protonum == IPPROTO_UDP || t->dst.protonum == IPPROTO_UDPLITE || t->dst.protonum == IPPROTO_DCCP || t->dst.protonum == IPPROTO_SCTP) fl4->fl4_sport = t->src.u.all; } } static void nf_nat_ipv6_decode_session(struct sk_buff *skb, const struct nf_conn *ct, enum ip_conntrack_dir dir, unsigned long statusbit, struct flowi *fl) { #if IS_ENABLED(CONFIG_IPV6) const struct nf_conntrack_tuple *t = &ct->tuplehash[dir].tuple; struct flowi6 *fl6 = &fl->u.ip6; if (ct->status & statusbit) { fl6->daddr = t->dst.u3.in6; if (t->dst.protonum == IPPROTO_TCP || t->dst.protonum == IPPROTO_UDP || t->dst.protonum == IPPROTO_UDPLITE || t->dst.protonum == IPPROTO_DCCP || t->dst.protonum == IPPROTO_SCTP) fl6->fl6_dport = t->dst.u.all; } statusbit ^= IPS_NAT_MASK; if (ct->status & statusbit) { fl6->saddr = t->src.u3.in6; if (t->dst.protonum == IPPROTO_TCP || t->dst.protonum == IPPROTO_UDP || t->dst.protonum == IPPROTO_UDPLITE || t->dst.protonum == IPPROTO_DCCP || t->dst.protonum == IPPROTO_SCTP) fl6->fl6_sport = t->src.u.all; } #endif } static void __nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl) { const struct nf_conn *ct; enum ip_conntrack_info ctinfo; enum ip_conntrack_dir dir; unsigned long statusbit; u8 family; ct = nf_ct_get(skb, &ctinfo); if (ct == NULL) return; family = nf_ct_l3num(ct); dir = CTINFO2DIR(ctinfo); if (dir == IP_CT_DIR_ORIGINAL) statusbit = IPS_DST_NAT; else statusbit = IPS_SRC_NAT; switch (family) { case NFPROTO_IPV4: nf_nat_ipv4_decode_session(skb, ct, dir, statusbit, fl); return; case NFPROTO_IPV6: nf_nat_ipv6_decode_session(skb, ct, dir, statusbit, fl); return; } } #endif /* CONFIG_XFRM */ /* We keep an extra hash for each conntrack, for fast searching. 
*/
/* Bucket index into nf_nat_bysource[] for @tuple's source part.
 *
 * The siphash input is the source manip, the netns mix, the L4 protocol
 * number and, for zones valid in both directions only, the zone id.  The
 * result is scaled to [0, nf_nat_htable_size).
 */
static unsigned int
hash_by_src(const struct net *net,
	    const struct nf_conntrack_zone *zone,
	    const struct nf_conntrack_tuple *tuple)
{
	struct {
		struct nf_conntrack_man src;
		u32 net_mix;
		u32 protonum;
		u32 zone;
	} __aligned(SIPHASH_ALIGNMENT) key;

	get_random_once(&nf_nat_hash_rnd, sizeof(nf_nat_hash_rnd));

	/* Start from all-zero so every byte fed to siphash() is defined. */
	memset(&key, 0, sizeof(key));

	/* Zone ID can be used provided it is valid for both directions;
	 * otherwise the field keeps the zero written above.
	 */
	if (zone->dir == NF_CT_DEFAULT_ZONE_DIR)
		key.zone = zone->id;

	key.protonum = tuple->dst.protonum;
	key.net_mix = net_hash_mix(net);

	/* Original src, to ensure we map it consistently if possible. */
	key.src = tuple->src;

	return reciprocal_scale(siphash(&key, sizeof(key), &nf_nat_hash_rnd),
				nf_nat_htable_size);
}

/**
 * nf_nat_used_tuple - check if proposed nat tuple clashes with existing entry
 * @tuple: proposed NAT binding
 * @ignored_conntrack: our (unconfirmed) conntrack entry
 *
 * A conntrack entry can be inserted to the connection tracking table
 * if there is no existing entry with an identical tuple in either direction.
 *
 * Example:
 * INITIATOR -> NAT/PAT -> RESPONDER
 *
 * INITIATOR passes through NAT/PAT ("us") and SNAT is done (saddr rewrite).
 * Then, later, NAT/PAT itself also connects to RESPONDER.
 *
 * This will not work if the SNAT done earlier has same IP:PORT source pair.
 *
 * Conntrack table has:
 * ORIGINAL: $IP_INITIATOR:$SPORT -> $IP_RESPONDER:$DPORT
 * REPLY:    $IP_RESPONDER:$DPORT -> $IP_NAT:$SPORT
 *
 * and new locally originating connection wants:
 * ORIGINAL: $IP_NAT:$SPORT -> $IP_RESPONDER:$DPORT
 * REPLY:    $IP_RESPONDER:$DPORT -> $IP_NAT:$SPORT
 *
 * ... which would mean incoming packets cannot be distinguished between
 * the existing and the newly added entry (identical IP_CT_DIR_REPLY tuple).
 *
 * @return: true if the proposed NAT mapping collides with an existing entry.
*/
static int
nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple,
		  const struct nf_conn *ignored_conntrack)
{
	/* Conntrack tracking doesn't keep track of outgoing tuples; only
	 * incoming ones.  NAT means they don't have a fixed mapping,
	 * so we invert the tuple and look for the incoming reply.
	 *
	 * We could keep a separate hash if this proves too slow.
	 */
	struct nf_conntrack_tuple reply;

	nf_ct_invert_tuple(&reply, tuple);
	return nf_conntrack_tuple_taken(&reply, ignored_conntrack);
}

/* Return the allow_clash setting of the L4 protocol handler of @ct. */
static bool nf_nat_allow_clash(const struct nf_conn *ct)
{
	return nf_ct_l4proto_find(nf_ct_protonum(ct))->allow_clash;
}

/**
 * nf_nat_used_tuple_new - check if to-be-inserted conntrack collides with existing entry
 * @tuple: proposed NAT binding
 * @ignored_ct: our (unconfirmed) conntrack entry
 *
 * Same as nf_nat_used_tuple, but also check for rare clash in reverse
 * direction. Should be called only when @tuple has not been altered, i.e.
 * @ignored_ct will not be subject to NAT.
 *
 * @return: true if the proposed NAT mapping collides with existing entry.
 */
static noinline bool
nf_nat_used_tuple_new(const struct nf_conntrack_tuple *tuple,
		      const struct nf_conn *ignored_ct)
{
	/* status bits that mark a conntrack as being altered by NAT */
	static const unsigned long uses_nat = IPS_NAT_MASK | IPS_SEQ_ADJUST_BIT;
	const struct nf_conntrack_tuple_hash *thash;
	const struct nf_conntrack_zone *zone;
	struct nf_conn *ct;
	bool taken = true;
	struct net *net;

	/* common case: no collision at all */
	if (!nf_nat_used_tuple(tuple, ignored_ct))
		return false;

	/* protocol does not permit clashes: report the collision as-is */
	if (!nf_nat_allow_clash(ignored_ct))
		return true;

	/* Initial choice clashes with existing conntrack.
	 * Check for (rare) reverse collision.
	 *
	 * This can happen when new packets are received in both directions
	 * at the exact same time on different CPUs.
	 *
	 * Without SMP, first packet creates new conntrack entry and second
	 * packet is resolved as established reply packet.
	 *
	 * With parallel processing, both packets could be picked up as
	 * new and both get their own ct entry allocated.
	 *
	 * If ignored_ct and colliding ct are not subject to NAT then
	 * pretend the tuple is available and let later clash resolution
	 * handle this at insertion time.
	 *
	 * Without it, the 'reply' packet has its source port rewritten
	 * by nat engine.
	 */
	if (READ_ONCE(ignored_ct->status) & uses_nat)
		return true;

	net = nf_ct_net(ignored_ct);
	zone = nf_ct_zone(ignored_ct);

	thash = nf_conntrack_find_get(net, zone, tuple);
	if (unlikely(!thash)) /* clashing entry went away */
		return false;

	/* from here on every exit goes through 'out', which puts ct */
	ct = nf_ct_tuplehash_to_ctrack(thash);

	/* NB: IP_CT_DIR_ORIGINAL should be impossible because
	 * nf_nat_used_tuple() handles origin collisions.
	 *
	 * Handle remote chance other CPU confirmed its ct right after.
	 */
	if (thash->tuple.dst.dir != IP_CT_DIR_REPLY)
		goto out;

	/* clashing connection subject to NAT? Retry with new tuple. */
	if (READ_ONCE(ct->status) & uses_nat)
		goto out;

	/* exact mirror image of our entry: treat the tuple as free */
	if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
			      &ignored_ct->tuplehash[IP_CT_DIR_REPLY].tuple) &&
	    nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple,
			      &ignored_ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple)) {
		taken = false;
		goto out;
	}
out:
	nf_ct_put(ct);
	return taken;
}

/* Decide whether the colliding entry @ct may be evicted.
 *
 * @flags is the caller's snapshot of ct->status.  Returns true only if the
 * TCP state is at or beyond TCP_CONNTRACK_TIME_WAIT, neither
 * IPS_FIXED_TIMEOUT nor IPS_DYING is set, and IPS_SRC_NAT is set.
 */
static bool nf_nat_may_kill(struct nf_conn *ct, unsigned long flags)
{
	static const unsigned long flags_refuse = IPS_FIXED_TIMEOUT |
						  IPS_DYING;
	static const unsigned long flags_needed = IPS_SRC_NAT;
	enum tcp_conntrack old_state;

	old_state = READ_ONCE(ct->proto.tcp.state);
	if (old_state < TCP_CONNTRACK_TIME_WAIT)
		return false;

	if (flags & flags_refuse)
		return false;

	return (flags & flags_needed) == flags_needed;
}

/* reverse direction will send packets to new source, so
 * make sure such packets are invalid.
*/
/* True if new's seen[0].td_end is ahead of old's; the difference is
 * evaluated as a signed 32-bit value, so wraparound is tolerated.
 */
static bool nf_seq_has_advanced(const struct nf_conn *old, const struct nf_conn *new)
{
	return (__s32)(new->proto.tcp.seen[0].td_end -
		       old->proto.tcp.seen[0].td_end) > 0;
}

/* Like nf_nat_used_tuple(), but for the last NF_NAT_HARDER_THRESH attempts
 * of a TCP connection in SYN_SENT state it may evict a colliding entry.
 *
 * Returns nonzero if @tuple must be considered in use, zero if it is free
 * (either no collision, or the colliding entry was killed and is not
 * offloaded).
 */
static int
nf_nat_used_tuple_harder(const struct nf_conntrack_tuple *tuple,
			 const struct nf_conn *ignored_conntrack,
			 unsigned int attempts_left)
{
	static const unsigned long flags_offload = IPS_OFFLOAD | IPS_HW_OFFLOAD;
	struct nf_conntrack_tuple_hash *thash;
	const struct nf_conntrack_zone *zone;
	struct nf_conntrack_tuple reply;
	unsigned long flags;
	struct nf_conn *ct;
	bool taken = true;
	struct net *net;

	nf_ct_invert_tuple(&reply, tuple);

	/* plenty of attempts left, or not a new TCP connection: plain check */
	if (attempts_left > NF_NAT_HARDER_THRESH ||
	    tuple->dst.protonum != IPPROTO_TCP ||
	    ignored_conntrack->proto.tcp.state != TCP_CONNTRACK_SYN_SENT)
		return nf_conntrack_tuple_taken(&reply, ignored_conntrack);

	/* Last few attempts to find a free tcp port. Destructive
	 * action: evict colliding if its in timewait state and the
	 * tcp sequence number has advanced past the one used by the
	 * old entry.
	 */
	net = nf_ct_net(ignored_conntrack);
	zone = nf_ct_zone(ignored_conntrack);

	thash = nf_conntrack_find_get(net, zone, &reply);
	if (!thash)
		return false;

	/* from here on every exit goes through 'out', which puts ct */
	ct = nf_ct_tuplehash_to_ctrack(thash);

	if (thash->tuple.dst.dir == IP_CT_DIR_ORIGINAL)
		goto out;

	if (WARN_ON_ONCE(ct == ignored_conntrack))
		goto out;

	/* single snapshot of the status word, reused for the offload test */
	flags = READ_ONCE(ct->status);
	if (!nf_nat_may_kill(ct, flags))
		goto out;

	if (!nf_seq_has_advanced(ct, ignored_conntrack))
		goto out;

	/* Even if we can evict do not reuse if entry is offloaded. */
	if (nf_ct_kill(ct))
		taken = flags & flags_offload;
out:
	nf_ct_put(ct);
	return taken;
}

/* Is the source address of @t within [range->min_addr, range->max_addr]?
 * IPv4 addresses are compared in host byte order; anything that is not
 * NFPROTO_IPV4 is compared with ipv6_addr_cmp().
 */
static bool nf_nat_inet_in_range(const struct nf_conntrack_tuple *t,
				 const struct nf_nat_range2 *range)
{
	if (t->src.l3num == NFPROTO_IPV4)
		return ntohl(t->src.u3.ip) >= ntohl(range->min_addr.ip) &&
		       ntohl(t->src.u3.ip) <= ntohl(range->max_addr.ip);

	return ipv6_addr_cmp(&t->src.u3.in6, &range->min_addr.in6) >= 0 &&
	       ipv6_addr_cmp(&t->src.u3.in6, &range->max_addr.in6) <= 0;
}

/* Is the manipable part of the tuple between min and max incl? */
static bool l4proto_in_range(const struct nf_conntrack_tuple *tuple,
			     enum nf_nat_manip_type maniptype,
			     const union nf_conntrack_man_proto *min,
			     const union nf_conntrack_man_proto *max)
{
	__be16 port;

	switch (tuple->dst.protonum) {
	case IPPROTO_ICMP:
	case IPPROTO_ICMPV6:
		/* the ICMP id is checked regardless of maniptype */
		return ntohs(tuple->src.u.icmp.id) >= ntohs(min->icmp.id) &&
		       ntohs(tuple->src.u.icmp.id) <= ntohs(max->icmp.id);
	case IPPROTO_GRE: /* all fall through */
	case IPPROTO_TCP:
	case IPPROTO_UDP:
	case IPPROTO_UDPLITE:
	case IPPROTO_DCCP:
	case IPPROTO_SCTP:
		if (maniptype == NF_NAT_MANIP_SRC)
			port = tuple->src.u.all;
		else
			port = tuple->dst.u.all;

		return ntohs(port) >= ntohs(min->all) &&
		       ntohs(port) <= ntohs(max->all);
	default:
		/* protocols without a manipable part always match */
		return true;
	}
}

/* If we source map this tuple so reply looks like reply_tuple, will
 * that meet the constraints of range.
 */
static int nf_in_range(const struct nf_conntrack_tuple *tuple,
		    const struct nf_nat_range2 *range)
{
	/* If we are supposed to map IPs, then we must be in the
	 * range specified, otherwise let this drag us onto a new src IP.
	 */
	if (range->flags & NF_NAT_RANGE_MAP_IPS &&
	    !nf_nat_inet_in_range(tuple, range))
		return 0;

	/* no proto range requested: the address check above is all there is */
	if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED))
		return 1;

	return l4proto_in_range(tuple, NF_NAT_MANIP_SRC,
				&range->min_proto, &range->max_proto);
}

/* Does the original-direction tuple of @ct have the same protocol number,
 * source address and source proto part as @tuple?
 */
static inline int
same_src(const struct nf_conn *ct,
	 const struct nf_conntrack_tuple *tuple)
{
	const struct nf_conntrack_tuple *t;

	t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
	return (t->dst.protonum == tuple->dst.protonum &&
		nf_inet_addr_cmp(&t->src.u3, &tuple->src.u3) &&
		t->src.u.all == tuple->src.u.all);
}

/* Only called for SRC manip */
static int
find_appropriate_src(struct net *net,
		     const struct nf_conntrack_zone *zone,
		     const struct nf_conntrack_tuple *tuple,
		     struct nf_conntrack_tuple *result,
		     const struct nf_nat_range2 *range)
{
	unsigned int h = hash_by_src(net, zone, tuple);
	const struct nf_conn *ct;

	/* Walk the bysource bucket for an entry with the same source, netns
	 * and zone.  Returns 1 with *result filled in on the first match
	 * whose mapping is inside @range, 0 if there is none.  Note that
	 * *result may have been written even when 0 is returned.
	 */
	hlist_for_each_entry_rcu(ct, &nf_nat_bysource[h], nat_bysource) {
		if (same_src(ct, tuple) &&
		    net_eq(net, nf_ct_net(ct)) &&
		    nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL)) {
			/* Copy source part from reply tuple. */
			nf_ct_invert_tuple(result,
				       &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
			result->dst = tuple->dst;

			if (nf_in_range(result, range))
				return 1;
		}
	}
	return 0;
}

/* For [FUTURE] fragmentation handling, we want the least-used
 * src-ip/dst-ip/proto triple.  Fairness doesn't come into it.  Thus
 * if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports
 * 1-65535, we don't do pro-rata allocation based on ports; we choose
 * the ip with the lowest src-ip/dst-ip/proto usage.
 */
static void
find_best_ips_proto(const struct nf_conntrack_zone *zone,
		    struct nf_conntrack_tuple *tuple,
		    const struct nf_nat_range2 *range,
		    const struct nf_conn *ct,
		    enum nf_nat_manip_type maniptype)
{
	union nf_inet_addr *var_ipp;
	unsigned int i, max;
	/* Host order */
	u32 minip, maxip, j, dist;
	bool full_range;

	/* No IP mapping?  Do nothing. */
	if (!(range->flags & NF_NAT_RANGE_MAP_IPS))
		return;

	/* var_ipp is the address this manip type rewrites */
	if (maniptype == NF_NAT_MANIP_SRC)
		var_ipp = &tuple->src.u3;
	else
		var_ipp = &tuple->dst.u3;

	/* Fast path: only one choice. */
	if (nf_inet_addr_cmp(&range->min_addr, &range->max_addr)) {
		*var_ipp = range->min_addr;
		return;
	}

	/* max is the index of the last 32-bit word of the address */
	if (nf_ct_l3num(ct) == NFPROTO_IPV4)
		max = sizeof(var_ipp->ip) / sizeof(u32) - 1;
	else
		max = sizeof(var_ipp->ip6) / sizeof(u32) - 1;

	/* Hashing source and destination IPs gives a fairly even
	 * spread in practice (if there are a small number of IPs
	 * involved, there usually aren't that many connections
	 * anyway).  The consistency means that servers see the same
	 * client coming from the same IP (some Internet Banking sites
	 * like this), even across reboots.
	 */
	j = jhash2((u32 *)&tuple->src.u3, sizeof(tuple->src.u3) / sizeof(u32),
		   range->flags & NF_NAT_RANGE_PERSISTENT ?
			0 : (__force u32)tuple->dst.u3.all[max] ^ zone->id);

	full_range = false;
	for (i = 0; i <= max; i++) {
		/* If first bytes of the address are at the maximum, use the
		 * distance. Otherwise use the full range.
		 */
		if (!full_range) {
			minip = ntohl((__force __be32)range->min_addr.all[i]);
			maxip = ntohl((__force __be32)range->max_addr.all[i]);
			dist  = maxip - minip + 1;
		} else {
			minip = 0;
			dist  = ~0;
		}

		var_ipp->all[i] = (__force __u32)
			htonl(minip + reciprocal_scale(j, dist));
		/* once a word is below the range maximum, later words are free */
		if (var_ipp->all[i] != range->max_addr.all[i])
			full_range = true;

		if (!(range->flags & NF_NAT_RANGE_PERSISTENT))
			j ^= (__force u32)tuple->dst.u3.all[i];
	}
}

/* Alter the per-proto part of the tuple (depending on maniptype), to
 * give a unique tuple in the given range if possible.
 *
 * Per-protocol part of tuple is initialized to the incoming packet.
*/ static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple, const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype, const struct nf_conn *ct) { unsigned int range_size, min, max, i, attempts; __be16 *keyptr; u16 off; switch (tuple->dst.protonum) { case IPPROTO_ICMP: case IPPROTO_ICMPV6: /* id is same for either direction... */ keyptr = &tuple->src.u.icmp.id; if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) { min = 0; range_size = 65536; } else { min = ntohs(range->min_proto.icmp.id); range_size = ntohs(range->max_proto.icmp.id) - ntohs(range->min_proto.icmp.id) + 1; } goto find_free_id; #if IS_ENABLED(CONFIG_NF_CT_PROTO_GRE) case IPPROTO_GRE: /* If there is no master conntrack we are not PPTP, do not change tuples */ if (!ct->master) return; if (maniptype == NF_NAT_MANIP_SRC) keyptr = &tuple->src.u.gre.key; else keyptr = &tuple->dst.u.gre.key; if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) { min = 1; range_size = 65535; } else { min = ntohs(range->min_proto.gre.key); range_size = ntohs(range->max_proto.gre.key) - min + 1; } goto find_free_id; #endif case IPPROTO_UDP: case IPPROTO_UDPLITE: case IPPROTO_TCP: case IPPROTO_SCTP: case IPPROTO_DCCP: if (maniptype == NF_NAT_MANIP_SRC) keyptr = &tuple->src.u.all; else keyptr = &tuple->dst.u.all; break; default: return; } /* If no range specified... 
*/ if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) { /* If it's dst rewrite, can't change port */ if (maniptype == NF_NAT_MANIP_DST) return; if (ntohs(*keyptr) < 1024) { /* Loose convention: >> 512 is credential passing */ if (ntohs(*keyptr) < 512) { min = 1; range_size = 511 - min + 1; } else { min = 600; range_size = 1023 - min + 1; } } else { min = 1024; range_size = 65535 - 1024 + 1; } } else { min = ntohs(range->min_proto.all); max = ntohs(range->max_proto.all); if (unlikely(max < min)) swap(max, min); range_size = max - min + 1; } find_free_id: if (range->flags & NF_NAT_RANGE_PROTO_OFFSET) off = (ntohs(*keyptr) - ntohs(range->base_proto.all)); else if ((range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL) || maniptype != NF_NAT_MANIP_DST) off = get_random_u16(); else off = 0; attempts = range_size; if (attempts > NF_NAT_MAX_ATTEMPTS) attempts = NF_NAT_MAX_ATTEMPTS; /* We are in softirq; doing a search of the entire range risks * soft lockup when all tuples are already used. * * If we can't find any free port from first offset, pick a new * one and try again, with ever smaller search window. */ another_round: for (i = 0; i < attempts; i++, off++) { *keyptr = htons(min + off % range_size); if (!nf_nat_used_tuple_harder(tuple, ct, attempts - i)) return; } if (attempts >= range_size || attempts < 16) return; attempts /= 2; off = get_random_u16(); goto another_round; } /* Manipulate the tuple into the range given. For NF_INET_POST_ROUTING, * we change the source to map into the range. For NF_INET_PRE_ROUTING * and NF_INET_LOCAL_OUT, we change the destination to map into the * range. It might not be possible to get a unique tuple, but we try. * At worst (or if we race), we will end up with a final duplicate in * __nf_conntrack_confirm and drop the packet. 
*/ static void get_unique_tuple(struct nf_conntrack_tuple *tuple, const struct nf_conntrack_tuple *orig_tuple, const struct nf_nat_range2 *range, struct nf_conn *ct, enum nf_nat_manip_type maniptype) { const struct nf_conntrack_zone *zone; struct net *net = nf_ct_net(ct); zone = nf_ct_zone(ct); /* 1) If this srcip/proto/src-proto-part is currently mapped, * and that same mapping gives a unique tuple within the given * range, use that. * * This is only required for source (ie. NAT/masq) mappings. * So far, we don't do local source mappings, so multiple * manips not an issue. */ if (maniptype == NF_NAT_MANIP_SRC && !(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) { /* try the original tuple first */ if (nf_in_range(orig_tuple, range)) { if (!nf_nat_used_tuple_new(orig_tuple, ct)) { *tuple = *orig_tuple; return; } } else if (find_appropriate_src(net, zone, orig_tuple, tuple, range)) { pr_debug("get_unique_tuple: Found current src map\n"); if (!nf_nat_used_tuple(tuple, ct)) return; } } /* 2) Select the least-used IP/proto combination in the given range */ *tuple = *orig_tuple; find_best_ips_proto(zone, tuple, range, ct, maniptype); /* 3) The per-protocol part of the manip is made to map into * the range to make a unique tuple. */ /* Only bother mapping if it's not already in range and unique */ if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) { if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) { if (!(range->flags & NF_NAT_RANGE_PROTO_OFFSET) && l4proto_in_range(tuple, maniptype, &range->min_proto, &range->max_proto) && (range->min_proto.all == range->max_proto.all || !nf_nat_used_tuple(tuple, ct))) return; } else if (!nf_nat_used_tuple(tuple, ct)) { return; } } /* Last chance: get protocol to try to obtain unique tuple. 
*/ nf_nat_l4proto_unique_tuple(tuple, range, maniptype, ct); } struct nf_conn_nat *nf_ct_nat_ext_add(struct nf_conn *ct) { struct nf_conn_nat *nat = nfct_nat(ct); if (nat) return nat; if (!nf_ct_is_confirmed(ct)) nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC); return nat; } EXPORT_SYMBOL_GPL(nf_ct_nat_ext_add); unsigned int nf_nat_setup_info(struct nf_conn *ct, const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype) { struct net *net = nf_ct_net(ct); struct nf_conntrack_tuple curr_tuple, new_tuple; /* Can't setup nat info for confirmed ct. */ if (nf_ct_is_confirmed(ct)) return NF_ACCEPT; WARN_ON(maniptype != NF_NAT_MANIP_SRC && maniptype != NF_NAT_MANIP_DST); if (WARN_ON(nf_nat_initialized(ct, maniptype))) return NF_DROP; /* What we've got will look like inverse of reply. Normally * this is what is in the conntrack, except for prior * manipulations (future optimization: if num_manips == 0, * orig_tp = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple) */ nf_ct_invert_tuple(&curr_tuple, &ct->tuplehash[IP_CT_DIR_REPLY].tuple); get_unique_tuple(&new_tuple, &curr_tuple, range, ct, maniptype); if (!nf_ct_tuple_equal(&new_tuple, &curr_tuple)) { struct nf_conntrack_tuple reply; /* Alter conntrack table so will recognize replies. */ nf_ct_invert_tuple(&reply, &new_tuple); nf_conntrack_alter_reply(ct, &reply); /* Non-atomic: we own this at the moment. */ if (maniptype == NF_NAT_MANIP_SRC) ct->status |= IPS_SRC_NAT; else ct->status |= IPS_DST_NAT; if (nfct_help(ct) && !nfct_seqadj(ct)) if (!nfct_seqadj_ext_add(ct)) return NF_DROP; } if (maniptype == NF_NAT_MANIP_SRC) { unsigned int srchash; spinlock_t *lock; srchash = hash_by_src(net, nf_ct_zone(ct), &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); lock = &nf_nat_locks[srchash % CONNTRACK_LOCKS]; spin_lock_bh(lock); hlist_add_head_rcu(&ct->nat_bysource, &nf_nat_bysource[srchash]); spin_unlock_bh(lock); } /* It's done. 
*/ if (maniptype == NF_NAT_MANIP_DST) ct->status |= IPS_DST_NAT_DONE; else ct->status |= IPS_SRC_NAT_DONE; return NF_ACCEPT; } EXPORT_SYMBOL(nf_nat_setup_info); static unsigned int __nf_nat_alloc_null_binding(struct nf_conn *ct, enum nf_nat_manip_type manip) { /* Force range to this IP; let proto decide mapping for * per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED). * Use reply in case it's already been mangled (eg local packet). */ union nf_inet_addr ip = (manip == NF_NAT_MANIP_SRC ? ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3 : ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3); struct nf_nat_range2 range = { .flags = NF_NAT_RANGE_MAP_IPS, .min_addr = ip, .max_addr = ip, }; return nf_nat_setup_info(ct, &range, manip); } unsigned int nf_nat_alloc_null_binding(struct nf_conn *ct, unsigned int hooknum) { return __nf_nat_alloc_null_binding(ct, HOOK2MANIP(hooknum)); } EXPORT_SYMBOL_GPL(nf_nat_alloc_null_binding); /* Do packet manipulations according to nf_nat_setup_info. */ unsigned int nf_nat_packet(struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int hooknum, struct sk_buff *skb) { enum nf_nat_manip_type mtype = HOOK2MANIP(hooknum); enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); unsigned int verdict = NF_ACCEPT; unsigned long statusbit; if (mtype == NF_NAT_MANIP_SRC) statusbit = IPS_SRC_NAT; else statusbit = IPS_DST_NAT; /* Invert if this is reply dir. */ if (dir == IP_CT_DIR_REPLY) statusbit ^= IPS_NAT_MASK; /* Non-atomic: these bits don't change. 
*/ if (ct->status & statusbit) verdict = nf_nat_manip_pkt(skb, ct, mtype, dir); return verdict; } EXPORT_SYMBOL_GPL(nf_nat_packet); static bool in_vrf_postrouting(const struct nf_hook_state *state) { #if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) if (state->hook == NF_INET_POST_ROUTING && netif_is_l3_master(state->out)) return true; #endif return false; } unsigned int nf_nat_inet_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct nf_conn *ct; enum ip_conntrack_info ctinfo; struct nf_conn_nat *nat; /* maniptype == SRC for postrouting. */ enum nf_nat_manip_type maniptype = HOOK2MANIP(state->hook); ct = nf_ct_get(skb, &ctinfo); /* Can't track? It's not due to stress, or conntrack would * have dropped it. Hence it's the user's responsibilty to * packet filter it out, or implement conntrack/NAT for that * protocol. 8) --RR */ if (!ct || in_vrf_postrouting(state)) return NF_ACCEPT; nat = nfct_nat(ct); switch (ctinfo) { case IP_CT_RELATED: case IP_CT_RELATED_REPLY: /* Only ICMPs can be IP_CT_IS_REPLY. Fallthrough */ case IP_CT_NEW: /* Seen it before? This can happen for loopback, retrans, * or local packets. */ if (!nf_nat_initialized(ct, maniptype)) { struct nf_nat_lookup_hook_priv *lpriv = priv; struct nf_hook_entries *e = rcu_dereference(lpriv->entries); unsigned int ret; int i; if (!e) goto null_bind; for (i = 0; i < e->num_hook_entries; i++) { ret = e->hooks[i].hook(e->hooks[i].priv, skb, state); if (ret != NF_ACCEPT) return ret; if (nf_nat_initialized(ct, maniptype)) goto do_nat; } null_bind: ret = nf_nat_alloc_null_binding(ct, state->hook); if (ret != NF_ACCEPT) return ret; } else { pr_debug("Already setup manip %s for ct %p (status bits 0x%lx)\n", maniptype == NF_NAT_MANIP_SRC ? 
"SRC" : "DST", ct, ct->status); if (nf_nat_oif_changed(state->hook, ctinfo, nat, state->out)) goto oif_changed; } break; default: /* ESTABLISHED */ WARN_ON(ctinfo != IP_CT_ESTABLISHED && ctinfo != IP_CT_ESTABLISHED_REPLY); if (nf_nat_oif_changed(state->hook, ctinfo, nat, state->out)) goto oif_changed; } do_nat: return nf_nat_packet(ct, ctinfo, state->hook, skb); oif_changed: nf_ct_kill_acct(ct, ctinfo, skb); return NF_DROP; } EXPORT_SYMBOL_GPL(nf_nat_inet_fn); struct nf_nat_proto_clean { u8 l3proto; u8 l4proto; }; /* kill conntracks with affected NAT section */ static int nf_nat_proto_remove(struct nf_conn *i, void *data) { const struct nf_nat_proto_clean *clean = data; if ((clean->l3proto && nf_ct_l3num(i) != clean->l3proto) || (clean->l4proto && nf_ct_protonum(i) != clean->l4proto)) return 0; return i->status & IPS_NAT_MASK ? 1 : 0; } static void nf_nat_cleanup_conntrack(struct nf_conn *ct) { unsigned int h; h = hash_by_src(nf_ct_net(ct), nf_ct_zone(ct), &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); spin_lock_bh(&nf_nat_locks[h % CONNTRACK_LOCKS]); hlist_del_rcu(&ct->nat_bysource); spin_unlock_bh(&nf_nat_locks[h % CONNTRACK_LOCKS]); } static int nf_nat_proto_clean(struct nf_conn *ct, void *data) { if (nf_nat_proto_remove(ct, data)) return 1; /* This module is being removed and conntrack has nat null binding. * Remove it from bysource hash, as the table will be freed soon. * * Else, when the conntrack is destoyed, nf_nat_cleanup_conntrack() * will delete entry from already-freed table. */ if (test_and_clear_bit(IPS_SRC_NAT_DONE_BIT, &ct->status)) nf_nat_cleanup_conntrack(ct); /* don't delete conntrack. Although that would make things a lot * simpler, we'd end up flushing all conntracks on nat rmmod. 
*/ return 0; } #if IS_ENABLED(CONFIG_NF_CT_NETLINK) #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_conntrack.h> static const struct nla_policy protonat_nla_policy[CTA_PROTONAT_MAX+1] = { [CTA_PROTONAT_PORT_MIN] = { .type = NLA_U16 }, [CTA_PROTONAT_PORT_MAX] = { .type = NLA_U16 }, }; static int nf_nat_l4proto_nlattr_to_range(struct nlattr *tb[], struct nf_nat_range2 *range) { if (tb[CTA_PROTONAT_PORT_MIN]) { range->min_proto.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MIN]); range->max_proto.all = range->min_proto.all; range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED; } if (tb[CTA_PROTONAT_PORT_MAX]) { range->max_proto.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MAX]); range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED; } return 0; } static int nfnetlink_parse_nat_proto(struct nlattr *attr, const struct nf_conn *ct, struct nf_nat_range2 *range) { struct nlattr *tb[CTA_PROTONAT_MAX+1]; int err; err = nla_parse_nested_deprecated(tb, CTA_PROTONAT_MAX, attr, protonat_nla_policy, NULL); if (err < 0) return err; return nf_nat_l4proto_nlattr_to_range(tb, range); } static const struct nla_policy nat_nla_policy[CTA_NAT_MAX+1] = { [CTA_NAT_V4_MINIP] = { .type = NLA_U32 }, [CTA_NAT_V4_MAXIP] = { .type = NLA_U32 }, [CTA_NAT_V6_MINIP] = { .len = sizeof(struct in6_addr) }, [CTA_NAT_V6_MAXIP] = { .len = sizeof(struct in6_addr) }, [CTA_NAT_PROTO] = { .type = NLA_NESTED }, }; static int nf_nat_ipv4_nlattr_to_range(struct nlattr *tb[], struct nf_nat_range2 *range) { if (tb[CTA_NAT_V4_MINIP]) { range->min_addr.ip = nla_get_be32(tb[CTA_NAT_V4_MINIP]); range->flags |= NF_NAT_RANGE_MAP_IPS; } range->max_addr.ip = nla_get_be32_default(tb[CTA_NAT_V4_MAXIP], range->min_addr.ip); return 0; } static int nf_nat_ipv6_nlattr_to_range(struct nlattr *tb[], struct nf_nat_range2 *range) { if (tb[CTA_NAT_V6_MINIP]) { nla_memcpy(&range->min_addr.ip6, tb[CTA_NAT_V6_MINIP], sizeof(struct in6_addr)); range->flags |= NF_NAT_RANGE_MAP_IPS; } if (tb[CTA_NAT_V6_MAXIP]) 
nla_memcpy(&range->max_addr.ip6, tb[CTA_NAT_V6_MAXIP], sizeof(struct in6_addr)); else range->max_addr = range->min_addr; return 0; } static int nfnetlink_parse_nat(const struct nlattr *nat, const struct nf_conn *ct, struct nf_nat_range2 *range) { struct nlattr *tb[CTA_NAT_MAX+1]; int err; memset(range, 0, sizeof(*range)); err = nla_parse_nested_deprecated(tb, CTA_NAT_MAX, nat, nat_nla_policy, NULL); if (err < 0) return err; switch (nf_ct_l3num(ct)) { case NFPROTO_IPV4: err = nf_nat_ipv4_nlattr_to_range(tb, range); break; case NFPROTO_IPV6: err = nf_nat_ipv6_nlattr_to_range(tb, range); break; default: err = -EPROTONOSUPPORT; break; } if (err) return err; if (!tb[CTA_NAT_PROTO]) return 0; return nfnetlink_parse_nat_proto(tb[CTA_NAT_PROTO], ct, range); } /* This function is called under rcu_read_lock() */ static int nfnetlink_parse_nat_setup(struct nf_conn *ct, enum nf_nat_manip_type manip, const struct nlattr *attr) { struct nf_nat_range2 range; int err; /* Should not happen, restricted to creating new conntracks * via ctnetlink. */ if (WARN_ON_ONCE(nf_nat_initialized(ct, manip))) return -EEXIST; /* No NAT information has been passed, allocate the null-binding */ if (attr == NULL) return __nf_nat_alloc_null_binding(ct, manip) == NF_DROP ? -ENOMEM : 0; err = nfnetlink_parse_nat(attr, ct, &range); if (err < 0) return err; return nf_nat_setup_info(ct, &range, manip) == NF_DROP ? 
-ENOMEM : 0;
}
#else
/*
 * Variant compiled in the #else branch of a conditional whose #if lies
 * above this chunk.  It ignores all of its arguments and always returns
 * -EOPNOTSUPP.
 */
static int
nfnetlink_parse_nat_setup(struct nf_conn *ct,
			  enum nf_nat_manip_type manip,
			  const struct nlattr *attr)
{
	return -EOPNOTSUPP;
}
#endif

/*
 * Expectation callback descriptor; registered in nf_nat_init() and
 * unregistered in nf_nat_cleanup() (and on the nf_nat_init() error path).
 */
static struct nf_ct_helper_expectfn follow_master_nat = {
	.name		= "nat-follow-master",
	.expectfn	= nf_nat_follow_master,
};

/*
 * nf_nat_register_fn - attach @ops to the per-netns NAT hook set of @pf
 * @net:          network namespace
 * @pf:           index into nat_net->nat_proto_net[]
 * @ops:          hook to insert; its ->hooknum selects the slot
 * @orig_nat_ops: template array of @ops_count hook ops
 * @ops_count:    number of entries in @orig_nat_ops
 *
 * The slot is the index of the first @orig_nat_ops entry whose hooknum
 * equals @ops->hooknum.  When no hook array exists yet for @pf, the
 * template is duplicated, one zeroed private area is allocated per entry
 * and the copies are registered with nf_register_net_hooks().  @ops is
 * then inserted into the selected slot's entry list and the user count is
 * bumped on success.
 *
 * Returns 0 on success, -EINVAL for a bad @pf or unknown hooknum, -ENOMEM
 * on allocation failure, -EOPNOTSUPP when the slot has no private area, or
 * the error returned by nf_register_net_hooks() /
 * nf_hook_entries_insert_raw().
 */
int nf_nat_register_fn(struct net *net, u8 pf, const struct nf_hook_ops *ops,
		       const struct nf_hook_ops *orig_nat_ops, unsigned int ops_count)
{
	struct nat_net *nat_net = net_generic(net, nat_net_id);
	struct nf_nat_hooks_net *nat_proto_net;
	struct nf_nat_lookup_hook_priv *priv;
	unsigned int hooknum = ops->hooknum;
	struct nf_hook_ops *nat_ops;
	int i, ret;

	if (WARN_ON_ONCE(pf >= ARRAY_SIZE(nat_net->nat_proto_net)))
		return -EINVAL;

	nat_proto_net = &nat_net->nat_proto_net[pf];

	/* Translate the netfilter hook number into an index of the template. */
	for (i = 0; i < ops_count; i++) {
		if (orig_nat_ops[i].hooknum == hooknum) {
			hooknum = i;
			break;
		}
	}

	if (WARN_ON_ONCE(i == ops_count))
		return -EINVAL;

	mutex_lock(&nf_nat_proto_mutex);
	if (!nat_proto_net->nat_hook_ops) {
		/* No hook array yet, so nobody may be counted as a user. */
		WARN_ON(nat_proto_net->users != 0);

		nat_ops = kmemdup_array(orig_nat_ops, ops_count, sizeof(*orig_nat_ops), GFP_KERNEL);
		if (!nat_ops) {
			mutex_unlock(&nf_nat_proto_mutex);
			return -ENOMEM;
		}

		for (i = 0; i < ops_count; i++) {
			priv = kzalloc(sizeof(*priv), GFP_KERNEL);
			if (priv) {
				nat_ops[i].priv = priv;
				continue;
			}
			/* Unwind: free only the private areas set up so far. */
			mutex_unlock(&nf_nat_proto_mutex);
			while (i)
				kfree(nat_ops[--i].priv);
			kfree(nat_ops);
			return -ENOMEM;
		}

		ret = nf_register_net_hooks(net, nat_ops, ops_count);
		if (ret < 0) {
			mutex_unlock(&nf_nat_proto_mutex);
			for (i = 0; i < ops_count; i++)
				kfree(nat_ops[i].priv);
			kfree(nat_ops);
			return ret;
		}

		nat_proto_net->nat_hook_ops = nat_ops;
	}

	nat_ops = nat_proto_net->nat_hook_ops;
	priv = nat_ops[hooknum].priv;
	if (WARN_ON_ONCE(!priv)) {
		mutex_unlock(&nf_nat_proto_mutex);
		return -EOPNOTSUPP;
	}

	/* Only a successful insert counts as a user of the hook array. */
	ret = nf_hook_entries_insert_raw(&priv->entries, ops);
	if (ret == 0)
		nat_proto_net->users++;

	mutex_unlock(&nf_nat_proto_mutex);
	return ret;
}

/*
 * nf_nat_unregister_fn - detach @ops from the per-netns NAT hook set of @pf
 * @net:       network namespace
 * @pf:        index into nat_net->nat_proto_net[]
 * @ops:       hook to remove; its ->hooknum selects the slot
 * @ops_count: number of entries in the registered hook array
 *
 * Drops one user, removes @ops from the slot's entry list and, when the
 * user count reaches zero, unregisters the hook array, frees every private
 * area with kfree_rcu() and frees the array itself.  Silently returns for
 * an out-of-range @pf; warns and returns when there is no user or the
 * hooknum is not found.
 */
void nf_nat_unregister_fn(struct net *net, u8 pf, const struct nf_hook_ops *ops,
			  unsigned int ops_count)
{
	struct nat_net *nat_net = net_generic(net, nat_net_id);
	struct nf_nat_hooks_net *nat_proto_net;
	struct nf_nat_lookup_hook_priv *priv;
	struct nf_hook_ops *nat_ops;
	int hooknum = ops->hooknum;
	int i;

	if (pf >= ARRAY_SIZE(nat_net->nat_proto_net))
		return;

	nat_proto_net = &nat_net->nat_proto_net[pf];

	mutex_lock(&nf_nat_proto_mutex);
	if (WARN_ON(nat_proto_net->users == 0))
		goto unlock;

	/* The count is dropped before the slot lookup below can bail out. */
	nat_proto_net->users--;

	nat_ops = nat_proto_net->nat_hook_ops;
	for (i = 0; i < ops_count; i++) {
		if (nat_ops[i].hooknum == hooknum) {
			hooknum = i;
			break;
		}
	}
	if (WARN_ON_ONCE(i == ops_count))
		goto unlock;
	priv = nat_ops[hooknum].priv;
	nf_hook_entries_delete_raw(&priv->entries, ops);

	if (nat_proto_net->users == 0) {
		/* Last user gone: tear down the whole hook array. */
		nf_unregister_net_hooks(net, nat_ops, ops_count);

		for (i = 0; i < ops_count; i++) {
			priv = nat_ops[i].priv;
			kfree_rcu(priv, rcu_head);
		}

		nat_proto_net->nat_hook_ops = NULL;
		kfree(nat_ops);
	}
unlock:
	mutex_unlock(&nf_nat_proto_mutex);
}

/* Per-netns registration: a struct nat_net is reserved under nat_net_id. */
static struct pernet_operations nat_net_ops = {
	.id = &nat_net_id,
	.size = sizeof(struct nat_net),
};

/* Callback table published through the nf_nat_hook pointer in nf_nat_init(). */
static const struct nf_nat_hook nat_hook = {
	.parse_nat_setup	= nfnetlink_parse_nat_setup,
#ifdef CONFIG_XFRM
	.decode_session		= __nf_nat_decode_session,
#endif
	.remove_nat_bysrc	= nf_nat_cleanup_conntrack,
};

/*
 * Module init: size and allocate the by-source hash table, initialise the
 * NAT spinlocks, register the pernet subsystem, register the expectation
 * callback, publish nat_hook and finally register the BPF part.  A failure
 * of the last step undoes everything done before it.
 *
 * Returns 0 on success or a negative errno.
 */
static int __init nf_nat_init(void)
{
	int ret, i;

	/* Leave them the same for the moment. */
	nf_nat_htable_size = nf_conntrack_htable_size;
	/* Never use fewer buckets than there are locks. */
	if (nf_nat_htable_size < CONNTRACK_LOCKS)
		nf_nat_htable_size = CONNTRACK_LOCKS;

	nf_nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, 0);
	if (!nf_nat_bysource)
		return -ENOMEM;

	for (i = 0; i < CONNTRACK_LOCKS; i++)
		spin_lock_init(&nf_nat_locks[i]);

	ret = register_pernet_subsys(&nat_net_ops);
	if (ret < 0) {
		kvfree(nf_nat_bysource);
		return ret;
	}

	nf_ct_helper_expectfn_register(&follow_master_nat);

	WARN_ON(nf_nat_hook != NULL);
	RCU_INIT_POINTER(nf_nat_hook, &nat_hook);

	ret = register_nf_nat_bpf();
	if (ret < 0) {
		/* Unpublish the hook and wait before freeing what it used. */
		RCU_INIT_POINTER(nf_nat_hook, NULL);
		nf_ct_helper_expectfn_unregister(&follow_master_nat);
		synchronize_net();
		unregister_pernet_subsys(&nat_net_ops);
		kvfree(nf_nat_bysource);
	}

	return ret;
}

/*
 * Module exit: run nf_nat_proto_clean over conntrack entries via
 * nf_ct_iterate_destroy(), unregister the expectation callback, unpublish
 * nat_hook, wait with synchronize_net(), then free the hash table and
 * unregister the pernet subsystem.
 */
static void __exit nf_nat_cleanup(void)
{
	struct nf_nat_proto_clean clean = {};

	nf_ct_iterate_destroy(nf_nat_proto_clean, &clean);

	nf_ct_helper_expectfn_unregister(&follow_master_nat);
	RCU_INIT_POINTER(nf_nat_hook, NULL);

	synchronize_net();
	kvfree(nf_nat_bysource);
	unregister_pernet_subsys(&nat_net_ops);
}

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Network address translation core");

module_init(nf_nat_init);
module_exit(nf_nat_cleanup);
/* NOTE(review): the '|' below is not C; it looks like a table-cell separator left over from text extraction - confirm and drop. */
|
1 6 4 1 2 1 4 4 4 1 3 1 1 1 6 2 1 3 9 1 1 7 1 2 1 4 4 33 1 32 32 32 26 26 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 
508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Point-to-Point Tunneling Protocol for Linux * * Authors: Dmitry Kozlov <xeb@mail.ru> */ #include <linux/string.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/errno.h> #include <linux/netdevice.h> #include <linux/net.h> #include <linux/skbuff.h> #include <linux/vmalloc.h> #include <linux/init.h> #include <linux/ppp_channel.h> #include <linux/ppp_defs.h> #include <linux/if_pppox.h> #include <linux/ppp-ioctl.h> #include <linux/notifier.h> #include <linux/file.h> #include <linux/in.h> #include <linux/ip.h> #include <linux/rcupdate.h> #include <linux/security.h> #include <linux/spinlock.h> #include <net/sock.h> #include <net/protocol.h> #include <net/ip.h> #include <net/icmp.h> #include <net/route.h> #include <net/gre.h> #include <net/pptp.h> #include <linux/uaccess.h> #define PPTP_DRIVER_VERSION "0.8.5" #define MAX_CALLID 65535 static DECLARE_BITMAP(callid_bitmap, MAX_CALLID + 1); static struct pppox_sock __rcu **callid_sock; static DEFINE_SPINLOCK(chan_lock); static struct proto pptp_sk_proto __read_mostly; static const struct ppp_channel_ops pptp_chan_ops; static const struct proto_ops pptp_ops; static struct 
pppox_sock *lookup_chan(u16 call_id, __be32 s_addr) { struct pppox_sock *sock; struct pptp_opt *opt; rcu_read_lock(); sock = rcu_dereference(callid_sock[call_id]); if (sock) { opt = &sock->proto.pptp; if (opt->dst_addr.sin_addr.s_addr != s_addr) sock = NULL; else sock_hold(sk_pppox(sock)); } rcu_read_unlock(); return sock; } static int lookup_chan_dst(u16 call_id, __be32 d_addr) { struct pppox_sock *sock; struct pptp_opt *opt; int i; rcu_read_lock(); i = 1; for_each_set_bit_from(i, callid_bitmap, MAX_CALLID) { sock = rcu_dereference(callid_sock[i]); if (!sock) continue; opt = &sock->proto.pptp; if (opt->dst_addr.call_id == call_id && opt->dst_addr.sin_addr.s_addr == d_addr) break; } rcu_read_unlock(); return i < MAX_CALLID; } static int add_chan(struct pppox_sock *sock, struct pptp_addr *sa) { static int call_id; spin_lock(&chan_lock); if (!sa->call_id) { call_id = find_next_zero_bit(callid_bitmap, MAX_CALLID, call_id + 1); if (call_id == MAX_CALLID) { call_id = find_next_zero_bit(callid_bitmap, MAX_CALLID, 1); if (call_id == MAX_CALLID) goto out_err; } sa->call_id = call_id; } else if (test_bit(sa->call_id, callid_bitmap)) { goto out_err; } sock->proto.pptp.src_addr = *sa; set_bit(sa->call_id, callid_bitmap); rcu_assign_pointer(callid_sock[sa->call_id], sock); spin_unlock(&chan_lock); return 0; out_err: spin_unlock(&chan_lock); return -1; } static void del_chan(struct pppox_sock *sock) { spin_lock(&chan_lock); clear_bit(sock->proto.pptp.src_addr.call_id, callid_bitmap); RCU_INIT_POINTER(callid_sock[sock->proto.pptp.src_addr.call_id], NULL); spin_unlock(&chan_lock); } static struct rtable *pptp_route_output(const struct pppox_sock *po, struct flowi4 *fl4) { const struct sock *sk = &po->sk; struct net *net; net = sock_net(sk); flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark, 0, RT_SCOPE_UNIVERSE, IPPROTO_GRE, 0, po->proto.pptp.dst_addr.sin_addr.s_addr, po->proto.pptp.src_addr.sin_addr.s_addr, 0, 0, sock_net_uid(net, sk)); security_sk_classify_flow(sk, 
flowi4_to_flowi_common(fl4)); return ip_route_output_flow(net, fl4, sk); } static int pptp_xmit(struct ppp_channel *chan, struct sk_buff *skb) { struct sock *sk = chan->private; struct pppox_sock *po = pppox_sk(sk); struct net *net = sock_net(sk); struct pptp_opt *opt = &po->proto.pptp; struct pptp_gre_header *hdr; unsigned int header_len = sizeof(*hdr); struct flowi4 fl4; int islcp; int len; unsigned char *data; __u32 seq_recv; struct rtable *rt; struct net_device *tdev; struct iphdr *iph; int max_headroom; if (sk_pppox(po)->sk_state & PPPOX_DEAD) goto tx_error; rt = pptp_route_output(po, &fl4); if (IS_ERR(rt)) goto tx_error; tdev = rt->dst.dev; max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(*iph) + sizeof(*hdr) + 2; if (skb_headroom(skb) < max_headroom || skb_cloned(skb) || skb_shared(skb)) { struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); if (!new_skb) { ip_rt_put(rt); goto tx_error; } if (skb->sk) skb_set_owner_w(new_skb, skb->sk); consume_skb(skb); skb = new_skb; } data = skb->data; islcp = ((data[0] << 8) + data[1]) == PPP_LCP && 1 <= data[2] && data[2] <= 7; /* compress protocol field */ if ((opt->ppp_flags & SC_COMP_PROT) && data[0] == 0 && !islcp) skb_pull(skb, 1); /* Put in the address/control bytes if necessary */ if ((opt->ppp_flags & SC_COMP_AC) == 0 || islcp) { data = skb_push(skb, 2); data[0] = PPP_ALLSTATIONS; data[1] = PPP_UI; } len = skb->len; seq_recv = opt->seq_recv; if (opt->ack_sent == seq_recv) header_len -= sizeof(hdr->ack); /* Push down and install GRE header */ skb_push(skb, header_len); hdr = (struct pptp_gre_header *)(skb->data); hdr->gre_hd.flags = GRE_KEY | GRE_VERSION_1 | GRE_SEQ; hdr->gre_hd.protocol = GRE_PROTO_PPP; hdr->call_id = htons(opt->dst_addr.call_id); hdr->seq = htonl(++opt->seq_sent); if (opt->ack_sent != seq_recv) { /* send ack with this message */ hdr->gre_hd.flags |= GRE_ACK; hdr->ack = htonl(seq_recv); opt->ack_sent = seq_recv; } hdr->payload_len = htons(len); /* Push down and install the IP header. 
*/ skb_reset_transport_header(skb); skb_push(skb, sizeof(*iph)); skb_reset_network_header(skb); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | IPSKB_REROUTED); iph = ip_hdr(skb); iph->version = 4; iph->ihl = sizeof(struct iphdr) >> 2; if (ip_dont_fragment(sk, &rt->dst)) iph->frag_off = htons(IP_DF); else iph->frag_off = 0; iph->protocol = IPPROTO_GRE; iph->tos = 0; iph->daddr = fl4.daddr; iph->saddr = fl4.saddr; iph->ttl = ip4_dst_hoplimit(&rt->dst); iph->tot_len = htons(skb->len); skb_dst_drop(skb); skb_dst_set(skb, &rt->dst); nf_reset_ct(skb); skb->ip_summed = CHECKSUM_NONE; ip_select_ident(net, skb, NULL); ip_send_check(iph); ip_local_out(net, skb->sk, skb); return 1; tx_error: kfree_skb(skb); return 1; } static int pptp_rcv_core(struct sock *sk, struct sk_buff *skb) { struct pppox_sock *po = pppox_sk(sk); struct pptp_opt *opt = &po->proto.pptp; int headersize, payload_len, seq; __u8 *payload; struct pptp_gre_header *header; if (!(sk->sk_state & PPPOX_CONNECTED)) { if (sock_queue_rcv_skb(sk, skb)) goto drop; return NET_RX_SUCCESS; } header = (struct pptp_gre_header *)(skb->data); headersize = sizeof(*header); /* test if acknowledgement present */ if (GRE_IS_ACK(header->gre_hd.flags)) { __u32 ack; if (!pskb_may_pull(skb, headersize)) goto drop; header = (struct pptp_gre_header *)(skb->data); /* ack in different place if S = 0 */ ack = GRE_IS_SEQ(header->gre_hd.flags) ? 
ntohl(header->ack) : ntohl(header->seq); if (ack > opt->ack_recv) opt->ack_recv = ack; /* also handle sequence number wrap-around */ if (WRAPPED(ack, opt->ack_recv)) opt->ack_recv = ack; } else { headersize -= sizeof(header->ack); } /* test if payload present */ if (!GRE_IS_SEQ(header->gre_hd.flags)) goto drop; payload_len = ntohs(header->payload_len); seq = ntohl(header->seq); /* check for incomplete packet (length smaller than expected) */ if (!pskb_may_pull(skb, headersize + payload_len)) goto drop; payload = skb->data + headersize; /* check for expected sequence number */ if (seq < opt->seq_recv + 1 || WRAPPED(opt->seq_recv, seq)) { if ((payload[0] == PPP_ALLSTATIONS) && (payload[1] == PPP_UI) && (PPP_PROTOCOL(payload) == PPP_LCP) && ((payload[4] == PPP_LCP_ECHOREQ) || (payload[4] == PPP_LCP_ECHOREP))) goto allow_packet; } else { opt->seq_recv = seq; allow_packet: skb_pull(skb, headersize); if (payload[0] == PPP_ALLSTATIONS && payload[1] == PPP_UI) { /* chop off address/control */ if (skb->len < 3) goto drop; skb_pull(skb, 2); } skb->ip_summed = CHECKSUM_NONE; skb_set_network_header(skb, skb->head-skb->data); ppp_input(&po->chan, skb); return NET_RX_SUCCESS; } drop: kfree_skb(skb); return NET_RX_DROP; } static int pptp_rcv(struct sk_buff *skb) { struct pppox_sock *po; struct pptp_gre_header *header; struct iphdr *iph; if (skb->pkt_type != PACKET_HOST) goto drop; if (!pskb_may_pull(skb, 12)) goto drop; iph = ip_hdr(skb); header = (struct pptp_gre_header *)skb->data; if (header->gre_hd.protocol != GRE_PROTO_PPP || /* PPTP-GRE protocol for PPTP */ GRE_IS_CSUM(header->gre_hd.flags) || /* flag CSUM should be clear */ GRE_IS_ROUTING(header->gre_hd.flags) || /* flag ROUTING should be clear */ !GRE_IS_KEY(header->gre_hd.flags) || /* flag KEY should be set */ (header->gre_hd.flags & GRE_FLAGS)) /* flag Recursion Ctrl should be clear */ /* if invalid, discard this packet */ goto drop; po = lookup_chan(ntohs(header->call_id), iph->saddr); if (po) { skb_dst_drop(skb); 
nf_reset_ct(skb); return sk_receive_skb(sk_pppox(po), skb, 0); } drop: kfree_skb(skb); return NET_RX_DROP; } static int pptp_bind(struct socket *sock, struct sockaddr *uservaddr, int sockaddr_len) { struct sock *sk = sock->sk; struct sockaddr_pppox *sp = (struct sockaddr_pppox *) uservaddr; struct pppox_sock *po = pppox_sk(sk); int error = 0; if (sockaddr_len < sizeof(struct sockaddr_pppox)) return -EINVAL; lock_sock(sk); if (sk->sk_state & PPPOX_DEAD) { error = -EALREADY; goto out; } if (sk->sk_state & PPPOX_BOUND) { error = -EBUSY; goto out; } if (add_chan(po, &sp->sa_addr.pptp)) error = -EBUSY; else sk->sk_state |= PPPOX_BOUND; out: release_sock(sk); return error; } static int pptp_connect(struct socket *sock, struct sockaddr *uservaddr, int sockaddr_len, int flags) { struct sock *sk = sock->sk; struct sockaddr_pppox *sp = (struct sockaddr_pppox *) uservaddr; struct pppox_sock *po = pppox_sk(sk); struct pptp_opt *opt = &po->proto.pptp; struct rtable *rt; struct flowi4 fl4; int error = 0; if (sockaddr_len < sizeof(struct sockaddr_pppox)) return -EINVAL; if (sp->sa_protocol != PX_PROTO_PPTP) return -EINVAL; if (lookup_chan_dst(sp->sa_addr.pptp.call_id, sp->sa_addr.pptp.sin_addr.s_addr)) return -EALREADY; lock_sock(sk); /* Check for already bound sockets */ if (sk->sk_state & PPPOX_CONNECTED) { error = -EBUSY; goto end; } /* Check for already disconnected sockets, on attempts to disconnect */ if (sk->sk_state & PPPOX_DEAD) { error = -EALREADY; goto end; } if (!opt->src_addr.sin_addr.s_addr || !sp->sa_addr.pptp.sin_addr.s_addr) { error = -EINVAL; goto end; } po->chan.private = sk; po->chan.ops = &pptp_chan_ops; rt = pptp_route_output(po, &fl4); if (IS_ERR(rt)) { error = -EHOSTUNREACH; goto end; } sk_setup_caps(sk, &rt->dst); po->chan.mtu = dst_mtu(&rt->dst); if (!po->chan.mtu) po->chan.mtu = PPP_MRU; po->chan.mtu -= PPTP_HEADER_OVERHEAD; po->chan.hdrlen = 2 + sizeof(struct pptp_gre_header); po->chan.direct_xmit = true; error = ppp_register_channel(&po->chan); if 
(error) { pr_err("PPTP: failed to register PPP channel (%d)\n", error); goto end; } opt->dst_addr = sp->sa_addr.pptp; sk->sk_state |= PPPOX_CONNECTED; end: release_sock(sk); return error; } static int pptp_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { int len = sizeof(struct sockaddr_pppox); struct sockaddr_pppox sp; memset(&sp.sa_addr, 0, sizeof(sp.sa_addr)); sp.sa_family = AF_PPPOX; sp.sa_protocol = PX_PROTO_PPTP; sp.sa_addr.pptp = pppox_sk(sock->sk)->proto.pptp.src_addr; memcpy(uaddr, &sp, len); return len; } static int pptp_release(struct socket *sock) { struct sock *sk = sock->sk; struct pppox_sock *po; int error = 0; if (!sk) return 0; lock_sock(sk); if (sock_flag(sk, SOCK_DEAD)) { release_sock(sk); return -EBADF; } po = pppox_sk(sk); del_chan(po); synchronize_rcu(); pppox_unbind_sock(sk); sk->sk_state = PPPOX_DEAD; sock_orphan(sk); sock->sk = NULL; release_sock(sk); sock_put(sk); return error; } static void pptp_sock_destruct(struct sock *sk) { if (!(sk->sk_state & PPPOX_DEAD)) { del_chan(pppox_sk(sk)); pppox_unbind_sock(sk); } skb_queue_purge(&sk->sk_receive_queue); dst_release(rcu_dereference_protected(sk->sk_dst_cache, 1)); } static int pptp_create(struct net *net, struct socket *sock, int kern) { int error = -ENOMEM; struct sock *sk; struct pppox_sock *po; struct pptp_opt *opt; sk = sk_alloc(net, PF_PPPOX, GFP_KERNEL, &pptp_sk_proto, kern); if (!sk) goto out; sock_init_data(sock, sk); sock->state = SS_UNCONNECTED; sock->ops = &pptp_ops; sk->sk_backlog_rcv = pptp_rcv_core; sk->sk_state = PPPOX_NONE; sk->sk_type = SOCK_STREAM; sk->sk_family = PF_PPPOX; sk->sk_protocol = PX_PROTO_PPTP; sk->sk_destruct = pptp_sock_destruct; po = pppox_sk(sk); opt = &po->proto.pptp; opt->seq_sent = 0; opt->seq_recv = 0xffffffff; opt->ack_recv = 0; opt->ack_sent = 0xffffffff; error = 0; out: return error; } static int pptp_ppp_ioctl(struct ppp_channel *chan, unsigned int cmd, unsigned long arg) { struct sock *sk = chan->private; struct pppox_sock *po = 
pppox_sk(sk); struct pptp_opt *opt = &po->proto.pptp; void __user *argp = (void __user *)arg; int __user *p = argp; int err, val; err = -EFAULT; switch (cmd) { case PPPIOCGFLAGS: val = opt->ppp_flags; if (put_user(val, p)) break; err = 0; break; case PPPIOCSFLAGS: if (get_user(val, p)) break; opt->ppp_flags = val & ~SC_RCV_BITS; err = 0; break; default: err = -ENOTTY; } return err; } static const struct ppp_channel_ops pptp_chan_ops = { .start_xmit = pptp_xmit, .ioctl = pptp_ppp_ioctl, }; static struct proto pptp_sk_proto __read_mostly = { .name = "PPTP", .owner = THIS_MODULE, .obj_size = sizeof(struct pppox_sock), }; static const struct proto_ops pptp_ops = { .family = AF_PPPOX, .owner = THIS_MODULE, .release = pptp_release, .bind = pptp_bind, .connect = pptp_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = pptp_getname, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .sendmsg = sock_no_sendmsg, .recvmsg = sock_no_recvmsg, .mmap = sock_no_mmap, .ioctl = pppox_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = pppox_compat_ioctl, #endif }; static const struct pppox_proto pppox_pptp_proto = { .create = pptp_create, .owner = THIS_MODULE, }; static const struct gre_protocol gre_pptp_protocol = { .handler = pptp_rcv, }; static int __init pptp_init_module(void) { int err = 0; pr_info("PPTP driver version " PPTP_DRIVER_VERSION "\n"); callid_sock = vzalloc(array_size(sizeof(void *), (MAX_CALLID + 1))); if (!callid_sock) return -ENOMEM; err = gre_add_protocol(&gre_pptp_protocol, GREPROTO_PPTP); if (err) { pr_err("PPTP: can't add gre protocol\n"); goto out_mem_free; } err = proto_register(&pptp_sk_proto, 0); if (err) { pr_err("PPTP: can't register sk_proto\n"); goto out_gre_del_protocol; } err = register_pppox_proto(PX_PROTO_PPTP, &pppox_pptp_proto); if (err) { pr_err("PPTP: can't register pppox_proto\n"); goto out_unregister_sk_proto; } return 0; out_unregister_sk_proto: proto_unregister(&pptp_sk_proto); out_gre_del_protocol: 
gre_del_protocol(&gre_pptp_protocol, GREPROTO_PPTP); out_mem_free: vfree(callid_sock); return err; } static void __exit pptp_exit_module(void) { unregister_pppox_proto(PX_PROTO_PPTP); proto_unregister(&pptp_sk_proto); gre_del_protocol(&gre_pptp_protocol, GREPROTO_PPTP); vfree(callid_sock); } module_init(pptp_init_module); module_exit(pptp_exit_module); MODULE_DESCRIPTION("Point-to-Point Tunneling Protocol"); MODULE_AUTHOR("D. Kozlov <xeb@mail.ru>"); MODULE_LICENSE("GPL"); MODULE_ALIAS_NET_PF_PROTO(PF_PPPOX, PX_PROTO_PPTP); |
112 113 331 331 29 111 7 84 84 244 245 113 113 2 111 2 331 331 4 329 111 100 86 84 83 245 245 83 32 77 83 5 54 116 115 73 73 73 73 6 6 6 6 46 73 7 276 129 39 108 133 116 26 98 29 5 423 22 22 127 421 26 421 251 244 17 7 186 188 132 56 130 75 70 45 191 249 52 253 252 46 237 233 7 54 229 73 232 2 173 230 3 230 190 73 230 3 48 174 233 234 52 134 96 234 230 227 7 156 4 10 135 4 16 116 58 150 1 1 1 152 45 114 35 157 157 152 122 57 1 21 21 10 2 21 13 13 13 13 12 2 12 11 11 10 2 9 5 8 1 8 7 5 8 7 7 5 2 5 9 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 
401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2004, OGAWA Hirofumi */ #include <linux/blkdev.h> #include <linux/sched/signal.h> #include <linux/backing-dev-defs.h> #include "fat.h" struct 
fatent_operations { void (*ent_blocknr)(struct super_block *, int, int *, sector_t *); void (*ent_set_ptr)(struct fat_entry *, int); int (*ent_bread)(struct super_block *, struct fat_entry *, int, sector_t); int (*ent_get)(struct fat_entry *); void (*ent_put)(struct fat_entry *, int); int (*ent_next)(struct fat_entry *); }; static DEFINE_SPINLOCK(fat12_entry_lock); static void fat12_ent_blocknr(struct super_block *sb, int entry, int *offset, sector_t *blocknr) { struct msdos_sb_info *sbi = MSDOS_SB(sb); int bytes = entry + (entry >> 1); WARN_ON(!fat_valid_entry(sbi, entry)); *offset = bytes & (sb->s_blocksize - 1); *blocknr = sbi->fat_start + (bytes >> sb->s_blocksize_bits); } static void fat_ent_blocknr(struct super_block *sb, int entry, int *offset, sector_t *blocknr) { struct msdos_sb_info *sbi = MSDOS_SB(sb); int bytes = (entry << sbi->fatent_shift); WARN_ON(!fat_valid_entry(sbi, entry)); *offset = bytes & (sb->s_blocksize - 1); *blocknr = sbi->fat_start + (bytes >> sb->s_blocksize_bits); } static void fat12_ent_set_ptr(struct fat_entry *fatent, int offset) { struct buffer_head **bhs = fatent->bhs; if (fatent->nr_bhs == 1) { WARN_ON(offset >= (bhs[0]->b_size - 1)); fatent->u.ent12_p[0] = bhs[0]->b_data + offset; fatent->u.ent12_p[1] = bhs[0]->b_data + (offset + 1); } else { WARN_ON(offset != (bhs[0]->b_size - 1)); fatent->u.ent12_p[0] = bhs[0]->b_data + offset; fatent->u.ent12_p[1] = bhs[1]->b_data; } } static void fat16_ent_set_ptr(struct fat_entry *fatent, int offset) { WARN_ON(offset & (2 - 1)); fatent->u.ent16_p = (__le16 *)(fatent->bhs[0]->b_data + offset); } static void fat32_ent_set_ptr(struct fat_entry *fatent, int offset) { WARN_ON(offset & (4 - 1)); fatent->u.ent32_p = (__le32 *)(fatent->bhs[0]->b_data + offset); } static int fat12_ent_bread(struct super_block *sb, struct fat_entry *fatent, int offset, sector_t blocknr) { struct buffer_head **bhs = fatent->bhs; WARN_ON(blocknr < MSDOS_SB(sb)->fat_start); fatent->fat_inode = MSDOS_SB(sb)->fat_inode; 
bhs[0] = sb_bread(sb, blocknr); if (!bhs[0]) goto err; if ((offset + 1) < sb->s_blocksize) fatent->nr_bhs = 1; else { /* This entry is block boundary, it needs the next block */ blocknr++; bhs[1] = sb_bread(sb, blocknr); if (!bhs[1]) goto err_brelse; fatent->nr_bhs = 2; } fat12_ent_set_ptr(fatent, offset); return 0; err_brelse: brelse(bhs[0]); err: fat_msg_ratelimit(sb, KERN_ERR, "FAT read failed (blocknr %llu)", (llu)blocknr); return -EIO; } static int fat_ent_bread(struct super_block *sb, struct fat_entry *fatent, int offset, sector_t blocknr) { const struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops; WARN_ON(blocknr < MSDOS_SB(sb)->fat_start); fatent->fat_inode = MSDOS_SB(sb)->fat_inode; fatent->bhs[0] = sb_bread(sb, blocknr); if (!fatent->bhs[0]) { fat_msg_ratelimit(sb, KERN_ERR, "FAT read failed (blocknr %llu)", (llu)blocknr); return -EIO; } fatent->nr_bhs = 1; ops->ent_set_ptr(fatent, offset); return 0; } static int fat12_ent_get(struct fat_entry *fatent) { u8 **ent12_p = fatent->u.ent12_p; int next; spin_lock(&fat12_entry_lock); if (fatent->entry & 1) next = (*ent12_p[0] >> 4) | (*ent12_p[1] << 4); else next = (*ent12_p[1] << 8) | *ent12_p[0]; spin_unlock(&fat12_entry_lock); next &= 0x0fff; if (next >= BAD_FAT12) next = FAT_ENT_EOF; return next; } static int fat16_ent_get(struct fat_entry *fatent) { int next = le16_to_cpu(*fatent->u.ent16_p); WARN_ON((unsigned long)fatent->u.ent16_p & (2 - 1)); if (next >= BAD_FAT16) next = FAT_ENT_EOF; return next; } static int fat32_ent_get(struct fat_entry *fatent) { int next = le32_to_cpu(*fatent->u.ent32_p) & 0x0fffffff; WARN_ON((unsigned long)fatent->u.ent32_p & (4 - 1)); if (next >= BAD_FAT32) next = FAT_ENT_EOF; return next; } static void fat12_ent_put(struct fat_entry *fatent, int new) { u8 **ent12_p = fatent->u.ent12_p; if (new == FAT_ENT_EOF) new = EOF_FAT12; spin_lock(&fat12_entry_lock); if (fatent->entry & 1) { *ent12_p[0] = (new << 4) | (*ent12_p[0] & 0x0f); *ent12_p[1] = new >> 4; } else { *ent12_p[0] = 
new & 0xff; *ent12_p[1] = (*ent12_p[1] & 0xf0) | (new >> 8); } spin_unlock(&fat12_entry_lock); mark_buffer_dirty_inode(fatent->bhs[0], fatent->fat_inode); if (fatent->nr_bhs == 2) mark_buffer_dirty_inode(fatent->bhs[1], fatent->fat_inode); } static void fat16_ent_put(struct fat_entry *fatent, int new) { if (new == FAT_ENT_EOF) new = EOF_FAT16; *fatent->u.ent16_p = cpu_to_le16(new); mark_buffer_dirty_inode(fatent->bhs[0], fatent->fat_inode); } static void fat32_ent_put(struct fat_entry *fatent, int new) { WARN_ON(new & 0xf0000000); new |= le32_to_cpu(*fatent->u.ent32_p) & ~0x0fffffff; *fatent->u.ent32_p = cpu_to_le32(new); mark_buffer_dirty_inode(fatent->bhs[0], fatent->fat_inode); } static int fat12_ent_next(struct fat_entry *fatent) { u8 **ent12_p = fatent->u.ent12_p; struct buffer_head **bhs = fatent->bhs; u8 *nextp = ent12_p[1] + 1 + (fatent->entry & 1); fatent->entry++; if (fatent->nr_bhs == 1) { WARN_ON(ent12_p[0] > (u8 *)(bhs[0]->b_data + (bhs[0]->b_size - 2))); WARN_ON(ent12_p[1] > (u8 *)(bhs[0]->b_data + (bhs[0]->b_size - 1))); if (nextp < (u8 *)(bhs[0]->b_data + (bhs[0]->b_size - 1))) { ent12_p[0] = nextp - 1; ent12_p[1] = nextp; return 1; } } else { WARN_ON(ent12_p[0] != (u8 *)(bhs[0]->b_data + (bhs[0]->b_size - 1))); WARN_ON(ent12_p[1] != (u8 *)bhs[1]->b_data); ent12_p[0] = nextp - 1; ent12_p[1] = nextp; brelse(bhs[0]); bhs[0] = bhs[1]; fatent->nr_bhs = 1; return 1; } ent12_p[0] = NULL; ent12_p[1] = NULL; return 0; } static int fat16_ent_next(struct fat_entry *fatent) { const struct buffer_head *bh = fatent->bhs[0]; fatent->entry++; if (fatent->u.ent16_p < (__le16 *)(bh->b_data + (bh->b_size - 2))) { fatent->u.ent16_p++; return 1; } fatent->u.ent16_p = NULL; return 0; } static int fat32_ent_next(struct fat_entry *fatent) { const struct buffer_head *bh = fatent->bhs[0]; fatent->entry++; if (fatent->u.ent32_p < (__le32 *)(bh->b_data + (bh->b_size - 4))) { fatent->u.ent32_p++; return 1; } fatent->u.ent32_p = NULL; return 0; } static const struct 
fatent_operations fat12_ops = {
	.ent_blocknr	= fat12_ent_blocknr,
	.ent_set_ptr	= fat12_ent_set_ptr,
	.ent_bread	= fat12_ent_bread,
	.ent_get	= fat12_ent_get,
	.ent_put	= fat12_ent_put,
	.ent_next	= fat12_ent_next,
};

/* FAT16 accessors: generic block lookup/read plus the fat16_* helpers. */
static const struct fatent_operations fat16_ops = {
	.ent_blocknr	= fat_ent_blocknr,
	.ent_set_ptr	= fat16_ent_set_ptr,
	.ent_bread	= fat_ent_bread,
	.ent_get	= fat16_ent_get,
	.ent_put	= fat16_ent_put,
	.ent_next	= fat16_ent_next,
};

/* FAT32 accessors: generic block lookup/read plus the fat32_* helpers. */
static const struct fatent_operations fat32_ops = {
	.ent_blocknr	= fat_ent_blocknr,
	.ent_set_ptr	= fat32_ent_set_ptr,
	.ent_bread	= fat_ent_bread,
	.ent_get	= fat32_ent_get,
	.ent_put	= fat32_ent_put,
	.ent_next	= fat32_ent_next,
};

/* Take the per-superblock FAT mutex. */
static inline void lock_fat(struct msdos_sb_info *sbi)
{
	mutex_lock(&sbi->fat_lock);
}

/* Release the per-superblock FAT mutex. */
static inline void unlock_fat(struct msdos_sb_info *sbi)
{
	mutex_unlock(&sbi->fat_lock);
}

/*
 * Initialise the FAT mutex and select the entry accessors for this
 * superblock.  fatent_shift is set to 2 (FAT32), 1 (FAT16) or -1 (FAT12);
 * any other variant is reported through fat_fs_error() and leaves the
 * accessors untouched.
 */
void fat_ent_access_init(struct super_block *sb)
{
	struct msdos_sb_info *sbi = MSDOS_SB(sb);

	mutex_init(&sbi->fat_lock);

	if (is_fat32(sbi)) {
		sbi->fatent_shift = 2;
		sbi->fatent_ops = &fat32_ops;
	} else if (is_fat16(sbi)) {
		sbi->fatent_shift = 1;
		sbi->fatent_ops = &fat16_ops;
	} else if (is_fat12(sbi)) {
		sbi->fatent_shift = -1;
		sbi->fatent_ops = &fat12_ops;
	} else {
		fat_fs_error(sb, "invalid FAT variant, %u bits", sbi->fat_bits);
	}
}

/*
 * Mark the fsinfo inode dirty.  Does nothing on a read-only superblock or
 * when the volume is not FAT32.
 */
static void mark_fsinfo_dirty(struct super_block *sb)
{
	struct msdos_sb_info *sbi = MSDOS_SB(sb);

	if (sb_rdonly(sb) || !is_fat32(sbi))
		return;

	__mark_inode_dirty(sbi->fsinfo_inode, I_DIRTY_SYNC);
}

/*
 * Try to reuse the buffers already attached to @fatent for the entry located
 * at @offset within block @blocknr.
 *
 * Returns 1 when the held buffers cover the entry (the entry pointer has then
 * been updated through ->ent_set_ptr), or 0 when the caller has to read the
 * block(s) again.
 */
static inline int fat_ent_update_ptr(struct super_block *sb,
				     struct fat_entry *fatent,
				     int offset, sector_t blocknr)
{
	struct msdos_sb_info *sbi = MSDOS_SB(sb);
	const struct fatent_operations *ops = sbi->fatent_ops;
	struct buffer_head **bhs = fatent->bhs;

	/* Is this fatent's blocks including this entry? */
	if (!fatent->nr_bhs || bhs[0]->b_blocknr != blocknr)
		return 0;
	if (is_fat12(sbi)) {
		if ((offset + 1) < sb->s_blocksize) {
			/* This entry is on bhs[0]. */
			if (fatent->nr_bhs == 2) {
				/* The second buffer is no longer needed. */
				brelse(bhs[1]);
				fatent->nr_bhs = 1;
			}
		} else {
			/* This entry needs the next block. */
			if (fatent->nr_bhs != 2)
				return 0;
			if (bhs[1]->b_blocknr != (blocknr + 1))
				return 0;
		}
	}
	ops->ent_set_ptr(fatent, offset);
	return 1;
}

/*
 * Read FAT entry @entry into @fatent and return its value via ->ent_get.
 *
 * An entry rejected by fat_valid_entry() releases @fatent, raises a
 * filesystem error and returns -EIO.  A failure of ->ent_bread is returned
 * as is.
 */
int fat_ent_read(struct inode *inode, struct fat_entry *fatent, int entry)
{
	struct super_block *sb = inode->i_sb;
	struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
	const struct fatent_operations *ops = sbi->fatent_ops;
	int err, offset;
	sector_t blocknr;

	if (!fat_valid_entry(sbi, entry)) {
		fatent_brelse(fatent);
		fat_fs_error(sb, "invalid access to FAT (entry 0x%08x)", entry);
		return -EIO;
	}

	fatent_set_entry(fatent, entry);
	ops->ent_blocknr(sb, entry, &offset, &blocknr);

	/* Only hit the block layer when the held buffers cannot be reused. */
	if (!fat_ent_update_ptr(sb, fatent, offset, blocknr)) {
		fatent_brelse(fatent);
		err = ops->ent_bread(sb, fatent, offset, blocknr);
		if (err)
			return err;
	}
	return ops->ent_get(fatent);
}

/*
 * Copy the @nr_bhs buffers in @bhs into every additional FAT copy
 * (copies 1 .. sbi->fats - 1) and mark the copies dirty.  On a
 * SB_SYNCHRONOUS superblock each copy is also written out synchronously.
 *
 * Returns 0 on success, -ENOMEM if sb_getblk() fails, or the error from
 * sync_dirty_buffer().
 */
/* FIXME: We can write the blocks as more big chunk. */
static int fat_mirror_bhs(struct super_block *sb, struct buffer_head **bhs,
			  int nr_bhs)
{
	struct msdos_sb_info *sbi = MSDOS_SB(sb);
	struct buffer_head *c_bh;
	int err, n, copy;

	err = 0;
	for (copy = 1; copy < sbi->fats; copy++) {
		/* Block offset of this FAT copy relative to the first FAT. */
		sector_t backup_fat = sbi->fat_length * copy;

		for (n = 0; n < nr_bhs; n++) {
			c_bh = sb_getblk(sb, backup_fat + bhs[n]->b_blocknr);
			if (!c_bh) {
				err = -ENOMEM;
				goto error;
			}
			/* Avoid race with userspace read via bdev */
			lock_buffer(c_bh);
			memcpy(c_bh->b_data, bhs[n]->b_data, sb->s_blocksize);
			set_buffer_uptodate(c_bh);
			unlock_buffer(c_bh);
			mark_buffer_dirty_inode(c_bh, sbi->fat_inode);
			if (sb->s_flags & SB_SYNCHRONOUS)
				err = sync_dirty_buffer(c_bh);
			brelse(c_bh);
			if (err)
				goto error;
		}
	}
error:
	return err;
}

/*
 * Store @new into the entry described by @fatent, optionally sync the
 * touched buffers (@wait), then mirror them to the other FAT copies.
 *
 * Returns 0 on success or the error from fat_sync_bhs()/fat_mirror_bhs().
 */
int fat_ent_write(struct inode *inode, struct fat_entry *fatent,
		  int new, int wait)
{
	struct super_block *sb = inode->i_sb;
	const struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops;
	int err;

	ops->ent_put(fatent, new);
	if (wait) {
		err = fat_sync_bhs(fatent->bhs, fatent->nr_bhs);
		if (err)
			return err;
	}
	return fat_mirror_bhs(sb, fatent->bhs, fatent->nr_bhs);
}

/*
 * Step @fatent via ->ent_next.  Returns 1 only when ->ent_next succeeded
 * and the new entry is still below sbi->max_cluster, otherwise 0.
 */
static inline int fat_ent_next(struct msdos_sb_info *sbi,
			       struct fat_entry *fatent)
{
	if (sbi->fatent_ops->ent_next(fatent)) {
		if (fatent->entry < sbi->max_cluster)
			return 1;
	}
	return 0;
}

/*
 * Drop the buffers held by @fatent and read the block(s) containing
 * fatent->entry.  Returns the result of ->ent_bread.
 */
static inline int fat_ent_read_block(struct super_block *sb,
				     struct fat_entry *fatent)
{
	const struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops;
	sector_t blocknr;
	int offset;

	fatent_brelse(fatent);
	ops->ent_blocknr(sb, fatent->entry, &offset, &blocknr);
	return ops->ent_bread(sb, fatent, offset, blocknr);
}

/*
 * Append each buffer of @fatent that is not already present in @bhs,
 * taking an extra reference (get_bh) on every buffer added and bumping
 * *@nr_bhs accordingly.
 */
static void fat_collect_bhs(struct buffer_head **bhs, int *nr_bhs,
			    struct fat_entry *fatent)
{
	int n, i;

	for (n = 0; n < fatent->nr_bhs; n++) {
		for (i = 0; i < *nr_bhs; i++) {
			if (fatent->bhs[n] == bhs[i])
				break;
		}
		/* i == *nr_bhs means the search above found no duplicate. */
		if (i == *nr_bhs) {
			get_bh(fatent->bhs[n]);
			bhs[i] = fatent->bhs[n];
			(*nr_bhs)++;
		}
	}
}

/*
 * Allocate @nr_cluster free clusters, link them into one chain terminated
 * by FAT_ENT_EOF, and store their numbers in @cluster[].
 *
 * The scan starts at sbi->prev_free + 1 and wraps to FAT_START_ENT.  The
 * FAT mutex is held for the whole scan.  @nr_cluster must not exceed
 * MAX_BUF_PER_PAGE / 2 (enforced with BUG_ON).
 *
 * Returns 0 on success, -ENOSPC when not enough free clusters exist, or an
 * I/O error.  When an error occurs after some clusters were already
 * taken, the partial chain starting at cluster[0] is freed again.
 */
int fat_alloc_clusters(struct inode *inode, int *cluster, int nr_cluster)
{
	struct super_block *sb = inode->i_sb;
	struct msdos_sb_info *sbi = MSDOS_SB(sb);
	const struct fatent_operations *ops = sbi->fatent_ops;
	struct fat_entry fatent, prev_ent;
	struct buffer_head *bhs[MAX_BUF_PER_PAGE];
	int i, count, err, nr_bhs, idx_clus;

	BUG_ON(nr_cluster > (MAX_BUF_PER_PAGE / 2));	/* fixed limit */

	lock_fat(sbi);
	/* Fail fast when the cached free count is trusted and too small. */
	if (sbi->free_clusters != -1 && sbi->free_clus_valid &&
	    sbi->free_clusters < nr_cluster) {
		unlock_fat(sbi);
		return -ENOSPC;
	}

	err = nr_bhs = idx_clus = 0;
	/* count tracks how many entries were examined, not the position. */
	count = FAT_START_ENT;
	fatent_init(&prev_ent);
	fatent_init(&fatent);
	fatent_set_entry(&fatent, sbi->prev_free + 1);
	while (count < sbi->max_cluster) {
		/* Wrap around to the first data entry. */
		if (fatent.entry >= sbi->max_cluster)
			fatent.entry = FAT_START_ENT;
		fatent_set_entry(&fatent, fatent.entry);
		err = fat_ent_read_block(sb, &fatent);
		if (err)
			goto out;

		/* Find the free entries in a block */
		do {
			if (ops->ent_get(&fatent) == FAT_ENT_FREE) {
				int entry = fatent.entry;

				/* make the cluster chain */
				ops->ent_put(&fatent, FAT_ENT_EOF);
				if (prev_ent.nr_bhs)
					ops->ent_put(&prev_ent, entry);

				fat_collect_bhs(bhs, &nr_bhs, &fatent);

				sbi->prev_free = entry;
				if (sbi->free_clusters != -1)
					sbi->free_clusters--;

				cluster[idx_clus] = entry;
				idx_clus++;
				if (idx_clus == nr_cluster)
					goto out;

				/*
				 * fat_collect_bhs() gets ref-count of bhs,
				 * so we can still use the prev_ent.
				 */
				prev_ent = fatent;
			}
			count++;
			if (count == sbi->max_cluster)
				break;
		} while (fat_ent_next(sbi, &fatent));
	}

	/* Couldn't allocate the free entries */
	sbi->free_clusters = 0;
	sbi->free_clus_valid = 1;
	err = -ENOSPC;

out:
	unlock_fat(sbi);
	mark_fsinfo_dirty(sb);
	fatent_brelse(&fatent);
	if (!err) {
		if (inode_needs_sync(inode))
			err = fat_sync_bhs(bhs, nr_bhs);
		if (!err)
			err = fat_mirror_bhs(sb, bhs, nr_bhs);
	}
	/* Drop the references taken by fat_collect_bhs(). */
	for (i = 0; i < nr_bhs; i++)
		brelse(bhs[i]);

	/* Undo a partially built chain. */
	if (err && idx_clus)
		fat_free_clusters(inode, cluster[0]);

	return err;
}

/*
 * Free the cluster chain starting at @cluster: every entry up to and
 * including the one holding FAT_ENT_EOF is set to FAT_ENT_FREE.
 *
 * With the discard mount option, contiguous runs of freed clusters are
 * discarded in one request each.  Dirty FAT buffers are collected and
 * flushed/mirrored whenever the local array would overflow and once more
 * at the end.
 *
 * Returns 0 on success, -EIO when a free entry is found inside the chain,
 * or the error from reading, syncing or mirroring the FAT.
 */
int fat_free_clusters(struct inode *inode, int cluster)
{
	struct super_block *sb = inode->i_sb;
	struct msdos_sb_info *sbi = MSDOS_SB(sb);
	const struct fatent_operations *ops = sbi->fatent_ops;
	struct fat_entry fatent;
	struct buffer_head *bhs[MAX_BUF_PER_PAGE];
	int i, err, nr_bhs;
	int first_cl = cluster, dirty_fsinfo = 0;

	nr_bhs = 0;
	fatent_init(&fatent);
	lock_fat(sbi);
	do {
		/* After this call, cluster is the NEXT link in the chain. */
		cluster = fat_ent_read(inode, &fatent, cluster);
		if (cluster < 0) {
			err = cluster;
			goto error;
		} else if (cluster == FAT_ENT_FREE) {
			fat_fs_error(sb, "%s: deleting FAT entry beyond EOF",
				     __func__);
			err = -EIO;
			goto error;
		}

		if (sbi->options.discard) {
			/*
			 * Issue discard for the sectors we no longer
			 * care about, batching contiguous clusters
			 * into one request
			 */
			if (cluster != fatent.entry + 1) {
				int nr_clus = fatent.entry - first_cl + 1;

				sb_issue_discard(sb,
					fat_clus_to_blknr(sbi, first_cl),
					nr_clus * sbi->sec_per_clus,
					GFP_NOFS, 0);

				first_cl = cluster;
			}
		}

		ops->ent_put(&fatent, FAT_ENT_FREE);
		if (sbi->free_clusters != -1) {
			sbi->free_clusters++;
			dirty_fsinfo = 1;
		}

		/* Flush the collected buffers before bhs[] would overflow. */
		if (nr_bhs + fatent.nr_bhs > MAX_BUF_PER_PAGE) {
			if (sb->s_flags & SB_SYNCHRONOUS) {
				err = fat_sync_bhs(bhs, nr_bhs);
				if (err)
					goto error;
			}
			err = fat_mirror_bhs(sb, bhs, nr_bhs);
			if (err)
				goto error;
			for (i = 0; i < nr_bhs; i++)
				brelse(bhs[i]);
			nr_bhs = 0;
		}
		fat_collect_bhs(bhs, &nr_bhs, &fatent);
	} while (cluster != FAT_ENT_EOF);

	if (sb->s_flags & SB_SYNCHRONOUS) {
		err = fat_sync_bhs(bhs, nr_bhs);
		if (err)
			goto error;
	}
	err = fat_mirror_bhs(sb, bhs, nr_bhs);
error:
	fatent_brelse(&fatent);
	for (i = 0; i < nr_bhs; i++)
		brelse(bhs[i]);
	unlock_fat(sbi);
	if (dirty_fsinfo)
		mark_fsinfo_dirty(sb);

	return err;
}
EXPORT_SYMBOL_GPL(fat_free_clusters);

/*
 * Readahead state for a sequential FAT scan.  All sector_t members are
 * block counts relative to the first block of the scanned range (->cur
 * starts at 0, see fat_ra_init()).
 */
struct fatent_ra {
	sector_t cur;		/* blocks consumed so far */
	sector_t limit;		/* total number of blocks in the range */

	unsigned int ra_blocks;	/* step by which the window advances */
	sector_t ra_advance;	/* ->cur value that triggers the next batch */
	sector_t ra_next;	/* next block to submit for readahead */
	sector_t ra_limit;	/* end of the current readahead window */
};

/*
 * Set up @ra for scanning the FAT from fatent->entry up to (but not
 * including) entry @ent_limit.  Leaves @ra untouched when the starting
 * entry is already at or past @ent_limit.
 */
static void fat_ra_init(struct super_block *sb, struct fatent_ra *ra,
			struct fat_entry *fatent, int ent_limit)
{
	struct msdos_sb_info *sbi = MSDOS_SB(sb);
	const struct fatent_operations *ops = sbi->fatent_ops;
	sector_t blocknr, block_end;
	int offset;
	/*
	 * This is the sequential read, so ra_pages * 2 (but try to
	 * align the optimal hardware IO size).
	 * [BTW, 128kb covers the whole sectors for FAT12 and FAT16]
	 */
	unsigned long ra_pages = sb->s_bdi->ra_pages;
	unsigned int reada_blocks;

	if (fatent->entry >= ent_limit)
		return;

	if (ra_pages > sb->s_bdi->io_pages)
		ra_pages = rounddown(ra_pages, sb->s_bdi->io_pages);
	/* The "+ 1" in the shift doubles the page count (ra_pages * 2). */
	reada_blocks = ra_pages << (PAGE_SHIFT - sb->s_blocksize_bits + 1);

	/* Initialize the range for sequential read */
	ops->ent_blocknr(sb, fatent->entry, &offset, &blocknr);
	ops->ent_blocknr(sb, ent_limit - 1, &offset, &block_end);
	ra->cur = 0;
	ra->limit = (block_end + 1) - blocknr;

	/* Advancing the window at half size */
	ra->ra_blocks = reada_blocks >> 1;
	ra->ra_advance = ra->cur;
	ra->ra_next = ra->cur;
	ra->ra_limit = ra->cur + min_t(sector_t, reada_blocks, ra->limit);
}

/* Assuming to be called before reading a new block (increments ->cur).
*/ static void fat_ent_reada(struct super_block *sb, struct fatent_ra *ra, struct fat_entry *fatent) { if (ra->ra_next >= ra->ra_limit) return; if (ra->cur >= ra->ra_advance) { struct msdos_sb_info *sbi = MSDOS_SB(sb); const struct fatent_operations *ops = sbi->fatent_ops; struct blk_plug plug; sector_t blocknr, diff; int offset; ops->ent_blocknr(sb, fatent->entry, &offset, &blocknr); diff = blocknr - ra->cur; blk_start_plug(&plug); /* * FIXME: we would want to directly use the bio with * pages to reduce the number of segments. */ for (; ra->ra_next < ra->ra_limit; ra->ra_next++) sb_breadahead(sb, ra->ra_next + diff); blk_finish_plug(&plug); /* Advance the readahead window */ ra->ra_advance += ra->ra_blocks; ra->ra_limit += min_t(sector_t, ra->ra_blocks, ra->limit - ra->ra_limit); } ra->cur++; } int fat_count_free_clusters(struct super_block *sb) { struct msdos_sb_info *sbi = MSDOS_SB(sb); const struct fatent_operations *ops = sbi->fatent_ops; struct fat_entry fatent; struct fatent_ra fatent_ra; int err = 0, free; lock_fat(sbi); if (sbi->free_clusters != -1 && sbi->free_clus_valid) goto out; free = 0; fatent_init(&fatent); fatent_set_entry(&fatent, FAT_START_ENT); fat_ra_init(sb, &fatent_ra, &fatent, sbi->max_cluster); while (fatent.entry < sbi->max_cluster) { /* readahead of fat blocks */ fat_ent_reada(sb, &fatent_ra, &fatent); err = fat_ent_read_block(sb, &fatent); if (err) goto out; do { if (ops->ent_get(&fatent) == FAT_ENT_FREE) free++; } while (fat_ent_next(sbi, &fatent)); cond_resched(); } sbi->free_clusters = free; sbi->free_clus_valid = 1; mark_fsinfo_dirty(sb); fatent_brelse(&fatent); out: unlock_fat(sbi); return err; } static int fat_trim_clusters(struct super_block *sb, u32 clus, u32 nr_clus) { struct msdos_sb_info *sbi = MSDOS_SB(sb); return sb_issue_discard(sb, fat_clus_to_blknr(sbi, clus), nr_clus * sbi->sec_per_clus, GFP_NOFS, 0); } int fat_trim_fs(struct inode *inode, struct fstrim_range *range) { struct super_block *sb = inode->i_sb; struct 
msdos_sb_info *sbi = MSDOS_SB(sb); const struct fatent_operations *ops = sbi->fatent_ops; struct fat_entry fatent; struct fatent_ra fatent_ra; u64 ent_start, ent_end, minlen, trimmed = 0; u32 free = 0; int err = 0; /* * FAT data is organized as clusters, trim at the granulary of cluster. * * fstrim_range is in byte, convert values to cluster index. * Treat sectors before data region as all used, not to trim them. */ ent_start = max_t(u64, range->start>>sbi->cluster_bits, FAT_START_ENT); ent_end = ent_start + (range->len >> sbi->cluster_bits) - 1; minlen = range->minlen >> sbi->cluster_bits; if (ent_start >= sbi->max_cluster || range->len < sbi->cluster_size) return -EINVAL; if (ent_end >= sbi->max_cluster) ent_end = sbi->max_cluster - 1; fatent_init(&fatent); lock_fat(sbi); fatent_set_entry(&fatent, ent_start); fat_ra_init(sb, &fatent_ra, &fatent, ent_end + 1); while (fatent.entry <= ent_end) { /* readahead of fat blocks */ fat_ent_reada(sb, &fatent_ra, &fatent); err = fat_ent_read_block(sb, &fatent); if (err) goto error; do { if (ops->ent_get(&fatent) == FAT_ENT_FREE) { free++; } else if (free) { if (free >= minlen) { u32 clus = fatent.entry - free; err = fat_trim_clusters(sb, clus, free); if (err && err != -EOPNOTSUPP) goto error; if (!err) trimmed += free; err = 0; } free = 0; } } while (fat_ent_next(sbi, &fatent) && fatent.entry <= ent_end); if (fatal_signal_pending(current)) { err = -ERESTARTSYS; goto error; } if (need_resched()) { fatent_brelse(&fatent); unlock_fat(sbi); cond_resched(); lock_fat(sbi); } } /* handle scenario when tail entries are all free */ if (free && free >= minlen) { u32 clus = fatent.entry - free; err = fat_trim_clusters(sb, clus, free); if (err && err != -EOPNOTSUPP) goto error; if (!err) trimmed += free; err = 0; } error: fatent_brelse(&fatent); unlock_fat(sbi); range->len = trimmed << sbi->cluster_bits; return err; } |
11494 13214 17 11 37 31 2471 2 3698 13707 9499 3720 6113 9499 10692 10691 2 2 140 140 7074 11475 4687 2937 4685 165 17 37 184 35 218 74 5149 2123 952 1 116 114 2 116 1 116 15076 14970 6809 12675 3911 2134 31 4721 2162 856 590 412 853 381 111 563 5 2 5 5 86 812 854 856 813 814 52 1948 1618 235 7524 22552 1954 1953 19760 19772 15083 13 13214 13218 13214 13208 1953 13210 4996 4993 2473 11 3601 216 13211 13 36 13218 13217 1952 1953 1955 3020 3020 3020 2434 2863 3 858 174 1 173 663 926 662 922 6006 5921 217 6005 105 92 105 5932 5936 5918 5935 5913 113 31 10647 10648 13221 13220 11050 4287 13230 4475 7896 11006 3700 24 3680 12011 13217 259 259 5 891 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 
364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 
864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 
1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2001 Momchil Velikov * Portions Copyright (C) 2001 Christoph Hellwig * Copyright (C) 2005 SGI, Christoph Lameter * Copyright (C) 2006 Nick Piggin * Copyright (C) 2012 Konstantin Khlebnikov * Copyright (C) 2016 Intel, Matthew Wilcox * Copyright (C) 2016 Intel, Ross Zwisler */ #include <linux/bitmap.h> #include <linux/bitops.h> #include 
<linux/bug.h> #include <linux/cpu.h> #include <linux/errno.h> #include <linux/export.h> #include <linux/idr.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/kmemleak.h> #include <linux/percpu.h> #include <linux/preempt.h> /* in_interrupt() */ #include <linux/radix-tree.h> #include <linux/rcupdate.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/xarray.h> #include "radix-tree.h" /* * Radix tree node cache. */ struct kmem_cache *radix_tree_node_cachep; /* * The radix tree is variable-height, so an insert operation not only has * to build the branch to its corresponding item, it also has to build the * branch to existing items if the size has to be increased (by * radix_tree_extend). * * The worst case is a zero height tree with just a single item at index 0, * and then inserting an item at index ULONG_MAX. This requires 2 new branches * of RADIX_TREE_MAX_PATH size to be created, with only the root node shared. * Hence: */ #define RADIX_TREE_PRELOAD_SIZE (RADIX_TREE_MAX_PATH * 2 - 1) /* * The IDR does not have to be as high as the radix tree since it uses * signed integers, not unsigned longs. */ #define IDR_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(int) - 1) #define IDR_MAX_PATH (DIV_ROUND_UP(IDR_INDEX_BITS, \ RADIX_TREE_MAP_SHIFT)) #define IDR_PRELOAD_SIZE (IDR_MAX_PATH * 2 - 1) /* * Per-cpu pool of preloaded nodes */ DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { .lock = INIT_LOCAL_LOCK(lock), }; EXPORT_PER_CPU_SYMBOL_GPL(radix_tree_preloads); static inline struct radix_tree_node *entry_to_node(void *ptr) { return (void *)((unsigned long)ptr & ~RADIX_TREE_INTERNAL_NODE); } static inline void *node_to_entry(void *ptr) { return (void *)((unsigned long)ptr | RADIX_TREE_INTERNAL_NODE); } #define RADIX_TREE_RETRY XA_RETRY_ENTRY static inline unsigned long get_slot_offset(const struct radix_tree_node *parent, void __rcu **slot) { return parent ? 
slot - parent->slots : 0; } static unsigned int radix_tree_descend(const struct radix_tree_node *parent, struct radix_tree_node **nodep, unsigned long index) { unsigned int offset = (index >> parent->shift) & RADIX_TREE_MAP_MASK; void __rcu **entry = rcu_dereference_raw(parent->slots[offset]); *nodep = (void *)entry; return offset; } static inline gfp_t root_gfp_mask(const struct radix_tree_root *root) { return root->xa_flags & (__GFP_BITS_MASK & ~GFP_ZONEMASK); } static inline void tag_set(struct radix_tree_node *node, unsigned int tag, int offset) { __set_bit(offset, node->tags[tag]); } static inline void tag_clear(struct radix_tree_node *node, unsigned int tag, int offset) { __clear_bit(offset, node->tags[tag]); } static inline int tag_get(const struct radix_tree_node *node, unsigned int tag, int offset) { return test_bit(offset, node->tags[tag]); } static inline void root_tag_set(struct radix_tree_root *root, unsigned tag) { root->xa_flags |= (__force gfp_t)(1 << (tag + ROOT_TAG_SHIFT)); } static inline void root_tag_clear(struct radix_tree_root *root, unsigned tag) { root->xa_flags &= (__force gfp_t)~(1 << (tag + ROOT_TAG_SHIFT)); } static inline void root_tag_clear_all(struct radix_tree_root *root) { root->xa_flags &= (__force gfp_t)((1 << ROOT_TAG_SHIFT) - 1); } static inline int root_tag_get(const struct radix_tree_root *root, unsigned tag) { return (__force int)root->xa_flags & (1 << (tag + ROOT_TAG_SHIFT)); } static inline unsigned root_tags_get(const struct radix_tree_root *root) { return (__force unsigned)root->xa_flags >> ROOT_TAG_SHIFT; } static inline bool is_idr(const struct radix_tree_root *root) { return !!(root->xa_flags & ROOT_IS_IDR); } /* * Returns 1 if any slot in the node has this tag set. * Otherwise returns 0. 
*/ static inline int any_tag_set(const struct radix_tree_node *node, unsigned int tag) { unsigned idx; for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) { if (node->tags[tag][idx]) return 1; } return 0; } static inline void all_tag_set(struct radix_tree_node *node, unsigned int tag) { bitmap_fill(node->tags[tag], RADIX_TREE_MAP_SIZE); } /** * radix_tree_find_next_bit - find the next set bit in a memory region * * @node: where to begin the search * @tag: the tag index * @offset: the bitnumber to start searching at * * Unrollable variant of find_next_bit() for constant size arrays. * Tail bits starting from size to roundup(size, BITS_PER_LONG) must be zero. * Returns next bit offset, or size if nothing found. */ static __always_inline unsigned long radix_tree_find_next_bit(struct radix_tree_node *node, unsigned int tag, unsigned long offset) { const unsigned long *addr = node->tags[tag]; if (offset < RADIX_TREE_MAP_SIZE) { unsigned long tmp; addr += offset / BITS_PER_LONG; tmp = *addr >> (offset % BITS_PER_LONG); if (tmp) return __ffs(tmp) + offset; offset = (offset + BITS_PER_LONG) & ~(BITS_PER_LONG - 1); while (offset < RADIX_TREE_MAP_SIZE) { tmp = *++addr; if (tmp) return __ffs(tmp) + offset; offset += BITS_PER_LONG; } } return RADIX_TREE_MAP_SIZE; } static unsigned int iter_offset(const struct radix_tree_iter *iter) { return iter->index & RADIX_TREE_MAP_MASK; } /* * The maximum index which can be stored in a radix tree */ static inline unsigned long shift_maxindex(unsigned int shift) { return (RADIX_TREE_MAP_SIZE << shift) - 1; } static inline unsigned long node_maxindex(const struct radix_tree_node *node) { return shift_maxindex(node->shift); } static unsigned long next_index(unsigned long index, const struct radix_tree_node *node, unsigned long offset) { return (index & ~node_maxindex(node)) + (offset << node->shift); } /* * This assumes that the caller has performed appropriate preallocation, and * that the caller has pinned this thread of control to the 
current CPU. */
/*
 * Allocate a node and initialise its shift, offset, count, nr_values,
 * parent and array fields from the arguments.  Returns NULL when no node
 * could be obtained.
 */
static struct radix_tree_node *
radix_tree_node_alloc(gfp_t gfp_mask, struct radix_tree_node *parent,
			struct radix_tree_root *root,
			unsigned int shift, unsigned int offset,
			unsigned int count, unsigned int nr_values)
{
	struct radix_tree_node *ret = NULL;

	/*
	 * Preload code isn't irq safe and it doesn't make sense to use
	 * preloading during an interrupt anyway as all the allocations have
	 * to be atomic. So just do normal allocation when in interrupt.
	 */
	if (!gfpflags_allow_blocking(gfp_mask) && !in_interrupt()) {
		struct radix_tree_preload *rtp;

		/*
		 * Even if the caller has preloaded, try to allocate from the
		 * cache first for the new node to get accounted to the memory
		 * cgroup.
		 */
		ret = kmem_cache_alloc(radix_tree_node_cachep,
				       gfp_mask | __GFP_NOWARN);
		if (ret)
			goto out;

		/*
		 * Provided the caller has preloaded here, we will always
		 * succeed in getting a node here (and never reach
		 * kmem_cache_alloc)
		 */
		rtp = this_cpu_ptr(&radix_tree_preloads);
		if (rtp->nr) {
			/* Pop the head of the list linked through ->parent. */
			ret = rtp->nodes;
			rtp->nodes = ret->parent;
			rtp->nr--;
		}
		/*
		 * Update the allocation stack trace as this is more useful
		 * for debugging.
		 */
		kmemleak_update_trace(ret);
		goto out;
	}
	ret = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask);
out:
	BUG_ON(radix_tree_is_internal_node(ret));
	if (ret) {
		ret->shift = shift;
		ret->offset = offset;
		ret->count = count;
		ret->nr_values = nr_values;
		ret->parent = parent;
		ret->array = root;
	}
	return ret;
}

/*
 * RCU callback: wipe the node's slots and tags, reinitialise its private
 * list and return it to the node cache.
 */
void radix_tree_node_rcu_free(struct rcu_head *head)
{
	struct radix_tree_node *node =
			container_of(head, struct radix_tree_node, rcu_head);

	/*
	 * Must only free zeroed nodes into the slab.  We can be left with
	 * non-NULL entries by radix_tree_free_nodes, so clear the entries
	 * and tags here.
	 */
	memset(node->slots, 0, sizeof(node->slots));
	memset(node->tags, 0, sizeof(node->tags));
	INIT_LIST_HEAD(&node->private_list);

	kmem_cache_free(radix_tree_node_cachep, node);
}

/* Queue @node for freeing through call_rcu(). */
static inline void
radix_tree_node_free(struct radix_tree_node *node)
{
	call_rcu(&node->rcu_head, radix_tree_node_rcu_free);
}

/*
 * Load up this CPU's radix_tree_node buffer with sufficient objects to
 * ensure that the addition of a single element in the tree cannot fail.  On
 * success, return zero, with preemption disabled.  On error, return -ENOMEM
 * with preemption not disabled.
 *
 * To make use of this facility, the radix tree must be initialised without
 * __GFP_DIRECT_RECLAIM being passed to INIT_RADIX_TREE().
 */
static __must_check int __radix_tree_preload(gfp_t gfp_mask, unsigned nr)
{
	struct radix_tree_preload *rtp;
	struct radix_tree_node *node;
	int ret = -ENOMEM;

	/*
	 * Nodes preloaded by one cgroup can be used by another cgroup, so
	 * they should never be accounted to any particular memory cgroup.
	 */
	gfp_mask &= ~__GFP_ACCOUNT;

	local_lock(&radix_tree_preloads.lock);
	rtp = this_cpu_ptr(&radix_tree_preloads);
	while (rtp->nr < nr) {
		/* The local lock is dropped around the allocation. */
		local_unlock(&radix_tree_preloads.lock);
		node = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask);
		if (node == NULL)
			goto out;
		local_lock(&radix_tree_preloads.lock);
		/* Re-read the per-cpu pointer after retaking the lock. */
		rtp = this_cpu_ptr(&radix_tree_preloads);
		if (rtp->nr < nr) {
			node->parent = rtp->nodes;
			rtp->nodes = node;
			rtp->nr++;
		} else {
			/* The pool is already full; give the node back. */
			kmem_cache_free(radix_tree_node_cachep, node);
		}
	}
	/* Success: the local lock is still held on return. */
	ret = 0;
out:
	return ret;
}

/*
 * Load up this CPU's radix_tree_node buffer with sufficient objects to
 * ensure that the addition of a single element in the tree cannot fail.  On
 * success, return zero, with preemption disabled.  On error, return -ENOMEM
 * with preemption not disabled.
 *
 * To make use of this facility, the radix tree must be initialised without
 * __GFP_DIRECT_RECLAIM being passed to INIT_RADIX_TREE().
 */
int radix_tree_preload(gfp_t gfp_mask)
{
	/* Warn on non-sensical use... */
	WARN_ON_ONCE(!gfpflags_allow_blocking(gfp_mask));
	return __radix_tree_preload(gfp_mask, RADIX_TREE_PRELOAD_SIZE);
}
EXPORT_SYMBOL(radix_tree_preload);

/*
 * The same as above function, except we don't guarantee preloading happens.
 * We do it, if we decide it helps. On success, return zero with preemption
 * disabled. On error, return -ENOMEM with preemption not disabled.
 */
int radix_tree_maybe_preload(gfp_t gfp_mask)
{
	if (gfpflags_allow_blocking(gfp_mask))
		return __radix_tree_preload(gfp_mask, RADIX_TREE_PRELOAD_SIZE);
	/* Preloading doesn't help anything with this gfp mask, skip it */
	local_lock(&radix_tree_preloads.lock);
	return 0;
}
EXPORT_SYMBOL(radix_tree_maybe_preload);

/*
 * Load the root entry into *@nodep.  When it is an internal node, set
 * *@maxindex to the largest index that node covers and return the node's
 * shift plus RADIX_TREE_MAP_SHIFT; otherwise set *@maxindex to 0 and
 * return 0.
 */
static unsigned radix_tree_load_root(const struct radix_tree_root *root,
		struct radix_tree_node **nodep, unsigned long *maxindex)
{
	struct radix_tree_node *node = rcu_dereference_raw(root->xa_head);

	*nodep = node;

	if (likely(radix_tree_is_internal_node(node))) {
		node = entry_to_node(node);
		*maxindex = node_maxindex(node);
		return node->shift + RADIX_TREE_MAP_SHIFT;
	}

	*maxindex = 0;
	return 0;
}

/*
 *	Extend a radix tree so it can store key @index.
 *
 *	Returns -ENOMEM when a node cannot be allocated, otherwise the
 *	shift of the new top level plus RADIX_TREE_MAP_SHIFT.
 */
static int radix_tree_extend(struct radix_tree_root *root, gfp_t gfp,
				unsigned long index, unsigned int shift)
{
	void *entry;
	unsigned int maxshift;
	int tag;

	/* Figure out what the shift should be.  */
	maxshift = shift;
	while (index > shift_maxindex(maxshift))
		maxshift += RADIX_TREE_MAP_SHIFT;

	entry = rcu_dereference_raw(root->xa_head);
	/* No nodes are added in this case; only the shift is reported. */
	if (!entry && (!is_idr(root) || root_tag_get(root, IDR_FREE)))
		goto out;

	do {
		struct radix_tree_node *node = radix_tree_node_alloc(gfp, NULL,
							root, shift, 0, 1, 0);
		if (!node)
			return -ENOMEM;

		if (is_idr(root)) {
			all_tag_set(node, IDR_FREE);
			if (!root_tag_get(root, IDR_FREE)) {
				tag_clear(node, IDR_FREE, 0);
				root_tag_set(root, IDR_FREE);
			}
		} else {
			/* Propagate the aggregated tag info to the new child */
			for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
				if (root_tag_get(root, tag))
					tag_set(node, tag, 0);
			}
		}

		BUG_ON(shift > BITS_PER_LONG);
		if (radix_tree_is_internal_node(entry)) {
			entry_to_node(entry)->parent = node;
		} else if (xa_is_value(entry)) {
			/* Moving a value entry root->xa_head to a node */
			node->nr_values = 1;
		}
		/*
		 * entry was already in the radix tree, so we do not need
		 * rcu_assign_pointer here
		 */
		node->slots[0] = (void __rcu *)entry;
		entry = node_to_entry(node);
		rcu_assign_pointer(root->xa_head, entry);
		shift += RADIX_TREE_MAP_SHIFT;
	} while (shift <= maxshift);
out:
	return maxshift + RADIX_TREE_MAP_SHIFT;
}

/**
 *	radix_tree_shrink    -    shrink radix tree to minimum height
 *	@root:		radix tree root
 *
 *	Returns true when at least one level was removed.
 */
static inline bool radix_tree_shrink(struct radix_tree_root *root)
{
	bool shrunk = false;

	for (;;) {
		struct radix_tree_node *node = rcu_dereference_raw(root->xa_head);
		struct radix_tree_node *child;

		if (!radix_tree_is_internal_node(node))
			break;
		node = entry_to_node(node);

		/*
		 * The candidate node has more than one child, or its child
		 * is not at the leftmost slot, we cannot shrink.
		 */
		if (node->count != 1)
			break;
		child = rcu_dereference_raw(node->slots[0]);
		if (!child)
			break;

		/*
		 * For an IDR, we must not shrink entry 0 into the root in
		 * case somebody calls idr_replace() with a pointer that
		 * appears to be an internal entry
		 */
		if (!node->shift && is_idr(root))
			break;

		if (radix_tree_is_internal_node(child))
			entry_to_node(child)->parent = NULL;

		/*
		 * We don't need rcu_assign_pointer(), since we are simply
		 * moving the node from one part of the tree to another: if it
		 * was safe to dereference the old pointer to it
		 * (node->slots[0]), it will be safe to dereference the new
		 * one (root->xa_head) as far as dependent read barriers go.
		 */
		root->xa_head = (void __rcu *)child;
		if (is_idr(root) && !tag_get(node, IDR_FREE, 0))
			root_tag_clear(root, IDR_FREE);

		/*
		 * We have a dilemma here. The node's slot[0] must not be
		 * NULLed in case there are concurrent lookups expecting to
		 * find the item. However if this was a bottom-level node,
		 * then it may be subject to the slot pointer being visible
		 * to callers dereferencing it. If item corresponding to
		 * slot[0] is subsequently deleted, these callers would expect
		 * their slot to become empty sooner or later.
		 *
		 * For example, lockless pagecache will look up a slot, deref
		 * the page pointer, and if the page has 0 refcount it means it
		 * was concurrently deleted from pagecache so try the deref
		 * again. Fortunately there is already a requirement for logic
		 * to retry the entire slot lookup -- the indirect pointer
		 * problem (replacing direct root node with an indirect pointer
		 * also results in a stale slot). So tag the slot as indirect
		 * to force callers to retry.
		 */
		node->count = 0;
		if (!radix_tree_is_internal_node(child)) {
			node->slots[0] = (void __rcu *)RADIX_TREE_RETRY;
		}

		WARN_ON_ONCE(!list_empty(&node->private_list));
		radix_tree_node_free(node);
		shrunk = true;
	}

	return shrunk;
}

/*
 * Starting at @node and walking towards the root, free every node whose
 * count has dropped to zero and unlink it from its parent (or from the
 * root).  The walk stops at the first node that still has entries; if that
 * node is the root node the tree is also shrunk.
 *
 * Returns true when any node was freed or the tree was shrunk.
 */
static bool delete_node(struct radix_tree_root *root,
			struct radix_tree_node *node)
{
	bool deleted = false;

	do {
		struct radix_tree_node *parent;

		if (node->count) {
			if (node_to_entry(node) ==
					rcu_dereference_raw(root->xa_head))
				deleted |= radix_tree_shrink(root);
			return deleted;
		}

		parent = node->parent;
		if (parent) {
			parent->slots[node->offset] = NULL;
			parent->count--;
		} else {
			/*
			 * Shouldn't the tags already have all been cleared
			 * by the caller?
			 */
			if (!is_idr(root))
				root_tag_clear_all(root);
			root->xa_head = NULL;
		}

		WARN_ON_ONCE(!list_empty(&node->private_list));
		radix_tree_node_free(node);
		deleted = true;

		node = parent;
	} while (node);

	return deleted;
}

/**
 *	__radix_tree_create	-	create a slot in a radix tree
 *	@root:		radix tree root
 *	@index:		index key
 *	@nodep:		returns node
 *	@slotp:		returns slot
 *
 *	Create, if necessary, and return the node and slot for an item
 *	at position @index in the radix tree @root.
 *
 *	Until there is more than one item in the tree, no nodes are
 *	allocated and @root->xa_head is used as a direct slot instead of
 *	pointing to a node, in which case *@nodep will be NULL.
 *
 *	Returns -ENOMEM, or 0 for success.
 */
static int __radix_tree_create(struct radix_tree_root *root,
		unsigned long index, struct radix_tree_node **nodep,
		void __rcu ***slotp)
{
	struct radix_tree_node *node = NULL, *child;
	void __rcu **slot = (void __rcu **)&root->xa_head;
	unsigned long maxindex;
	unsigned int shift, offset = 0;
	unsigned long max = index;
	gfp_t gfp = root_gfp_mask(root);

	shift = radix_tree_load_root(root, &child, &maxindex);

	/* Make sure the tree is high enough.  */
	if (max > maxindex) {
		int error = radix_tree_extend(root, gfp, max, shift);
		if (error < 0)
			return error;
		/* A non-negative result is the new top-level shift. */
		shift = error;
		child = rcu_dereference_raw(root->xa_head);
	}

	while (shift > 0) {
		shift -= RADIX_TREE_MAP_SHIFT;
		if (child == NULL) {
			/* Have to add a child node.  */
			child = radix_tree_node_alloc(gfp, node, root, shift,
							offset, 0, 0);
			if (!child)
				return -ENOMEM;
			rcu_assign_pointer(*slot, node_to_entry(child));
			if (node)
				node->count++;
		} else if (!radix_tree_is_internal_node(child))
			break;

		/* Go a level down */
		node = entry_to_node(child);
		offset = radix_tree_descend(node, &child, index);
		slot = &node->slots[offset];
	}

	if (nodep)
		*nodep = node;
	if (slotp)
		*slotp = slot;
	return 0;
}

/* * Free any nodes below this node. The tree is presumed to not need * shrinking, and any user data in the tree is presumed to not need a * destructor called on it. If we need to add a destructor, we can * add that functionality later. Note that we may not clear tags or * slots from the tree as an RCU walker may still have a pointer into * this subtree. We could replace the entries with RADIX_TREE_RETRY, * but we'll still have to clear those in rcu_free.
*/ static void radix_tree_free_nodes(struct radix_tree_node *node) { unsigned offset = 0; struct radix_tree_node *child = entry_to_node(node); for (;;) { void *entry = rcu_dereference_raw(child->slots[offset]); if (xa_is_node(entry) && child->shift) { child = entry_to_node(entry); offset = 0; continue; } offset++; while (offset == RADIX_TREE_MAP_SIZE) { struct radix_tree_node *old = child; offset = child->offset + 1; child = child->parent; WARN_ON_ONCE(!list_empty(&old->private_list)); radix_tree_node_free(old); if (old == entry_to_node(node)) return; } } } static inline int insert_entries(struct radix_tree_node *node, void __rcu **slot, void *item) { if (*slot) return -EEXIST; rcu_assign_pointer(*slot, item); if (node) { node->count++; if (xa_is_value(item)) node->nr_values++; } return 1; } /** * radix_tree_insert - insert into a radix tree * @root: radix tree root * @index: index key * @item: item to insert * * Insert an item into the radix tree at position @index. */ int radix_tree_insert(struct radix_tree_root *root, unsigned long index, void *item) { struct radix_tree_node *node; void __rcu **slot; int error; BUG_ON(radix_tree_is_internal_node(item)); error = __radix_tree_create(root, index, &node, &slot); if (error) return error; error = insert_entries(node, slot, item); if (error < 0) return error; if (node) { unsigned offset = get_slot_offset(node, slot); BUG_ON(tag_get(node, 0, offset)); BUG_ON(tag_get(node, 1, offset)); BUG_ON(tag_get(node, 2, offset)); } else { BUG_ON(root_tags_get(root)); } return 0; } EXPORT_SYMBOL(radix_tree_insert); /** * __radix_tree_lookup - lookup an item in a radix tree * @root: radix tree root * @index: index key * @nodep: returns node * @slotp: returns slot * * Lookup and return the item at position @index in the radix * tree @root. * * Until there is more than one item in the tree, no nodes are * allocated and @root->xa_head is used as a direct slot instead of * pointing to a node, in which case *@nodep will be NULL. 
*/ void *__radix_tree_lookup(const struct radix_tree_root *root, unsigned long index, struct radix_tree_node **nodep, void __rcu ***slotp) { struct radix_tree_node *node, *parent; unsigned long maxindex; void __rcu **slot; restart: parent = NULL; slot = (void __rcu **)&root->xa_head; radix_tree_load_root(root, &node, &maxindex); if (index > maxindex) return NULL; while (radix_tree_is_internal_node(node)) { unsigned offset; parent = entry_to_node(node); offset = radix_tree_descend(parent, &node, index); slot = parent->slots + offset; if (node == RADIX_TREE_RETRY) goto restart; if (parent->shift == 0) break; } if (nodep) *nodep = parent; if (slotp) *slotp = slot; return node; } /** * radix_tree_lookup_slot - lookup a slot in a radix tree * @root: radix tree root * @index: index key * * Returns: the slot corresponding to the position @index in the * radix tree @root. This is useful for update-if-exists operations. * * This function can be called under rcu_read_lock iff the slot is not * modified by radix_tree_replace_slot, otherwise it must be called * exclusive from other writers. Any dereference of the slot must be done * using radix_tree_deref_slot. */ void __rcu **radix_tree_lookup_slot(const struct radix_tree_root *root, unsigned long index) { void __rcu **slot; if (!__radix_tree_lookup(root, index, NULL, &slot)) return NULL; return slot; } EXPORT_SYMBOL(radix_tree_lookup_slot); /** * radix_tree_lookup - perform lookup operation on a radix tree * @root: radix tree root * @index: index key * * Lookup the item at the position @index in the radix tree @root. * * This function can be called under rcu_read_lock, however the caller * must manage lifetimes of leaf nodes (eg. RCU may also be used to free * them safely). No RCU barriers are required to access or modify the * returned item, however. 
*/ void *radix_tree_lookup(const struct radix_tree_root *root, unsigned long index) { return __radix_tree_lookup(root, index, NULL, NULL); } EXPORT_SYMBOL(radix_tree_lookup); static void replace_slot(void __rcu **slot, void *item, struct radix_tree_node *node, int count, int values) { if (node && (count || values)) { node->count += count; node->nr_values += values; } rcu_assign_pointer(*slot, item); } static bool node_tag_get(const struct radix_tree_root *root, const struct radix_tree_node *node, unsigned int tag, unsigned int offset) { if (node) return tag_get(node, tag, offset); return root_tag_get(root, tag); } /* * IDR users want to be able to store NULL in the tree, so if the slot isn't * free, don't adjust the count, even if it's transitioning between NULL and * non-NULL. For the IDA, we mark slots as being IDR_FREE while they still * have empty bits, but it only stores NULL in slots when they're being * deleted. */ static int calculate_count(struct radix_tree_root *root, struct radix_tree_node *node, void __rcu **slot, void *item, void *old) { if (is_idr(root)) { unsigned offset = get_slot_offset(node, slot); bool free = node_tag_get(root, node, IDR_FREE, offset); if (!free) return 0; if (!old) return 1; } return !!item - !!old; } /** * __radix_tree_replace - replace item in a slot * @root: radix tree root * @node: pointer to tree node * @slot: pointer to slot in @node * @item: new item to store in the slot. * * For use with __radix_tree_lookup(). Caller must hold tree write locked * across slot lookup and replacement. */ void __radix_tree_replace(struct radix_tree_root *root, struct radix_tree_node *node, void __rcu **slot, void *item) { void *old = rcu_dereference_raw(*slot); int values = !!xa_is_value(item) - !!xa_is_value(old); int count = calculate_count(root, node, slot, item, old); /* * This function supports replacing value entries and * deleting entries, but that needs accounting against the * node unless the slot is root->xa_head. 
*/ WARN_ON_ONCE(!node && (slot != (void __rcu **)&root->xa_head) && (count || values)); replace_slot(slot, item, node, count, values); if (!node) return; delete_node(root, node); } /** * radix_tree_replace_slot - replace item in a slot * @root: radix tree root * @slot: pointer to slot * @item: new item to store in the slot. * * For use with radix_tree_lookup_slot() and * radix_tree_gang_lookup_tag_slot(). Caller must hold tree write locked * across slot lookup and replacement. * * NOTE: This cannot be used to switch between non-entries (empty slots), * regular entries, and value entries, as that requires accounting * inside the radix tree node. When switching from one type of entry or * deleting, use __radix_tree_lookup() and __radix_tree_replace() or * radix_tree_iter_replace(). */ void radix_tree_replace_slot(struct radix_tree_root *root, void __rcu **slot, void *item) { __radix_tree_replace(root, NULL, slot, item); } EXPORT_SYMBOL(radix_tree_replace_slot); /** * radix_tree_iter_replace - replace item in a slot * @root: radix tree root * @iter: iterator state * @slot: pointer to slot * @item: new item to store in the slot. * * For use with radix_tree_for_each_slot(). * Caller must hold tree write locked. */ void radix_tree_iter_replace(struct radix_tree_root *root, const struct radix_tree_iter *iter, void __rcu **slot, void *item) { __radix_tree_replace(root, iter->node, slot, item); } static void node_tag_set(struct radix_tree_root *root, struct radix_tree_node *node, unsigned int tag, unsigned int offset) { while (node) { if (tag_get(node, tag, offset)) return; tag_set(node, tag, offset); offset = node->offset; node = node->parent; } if (!root_tag_get(root, tag)) root_tag_set(root, tag); } /** * radix_tree_tag_set - set a tag on a radix tree node * @root: radix tree root * @index: index key * @tag: tag index * * Set the search tag (which must be < RADIX_TREE_MAX_TAGS) * corresponding to @index in the radix tree. 
From * the root all the way down to the leaf node. * * Returns the address of the tagged item. Setting a tag on a not-present * item is a bug. */ void *radix_tree_tag_set(struct radix_tree_root *root, unsigned long index, unsigned int tag) { struct radix_tree_node *node, *parent; unsigned long maxindex; radix_tree_load_root(root, &node, &maxindex); BUG_ON(index > maxindex); while (radix_tree_is_internal_node(node)) { unsigned offset; parent = entry_to_node(node); offset = radix_tree_descend(parent, &node, index); BUG_ON(!node); if (!tag_get(parent, tag, offset)) tag_set(parent, tag, offset); } /* set the root's tag bit */ if (!root_tag_get(root, tag)) root_tag_set(root, tag); return node; } EXPORT_SYMBOL(radix_tree_tag_set); static void node_tag_clear(struct radix_tree_root *root, struct radix_tree_node *node, unsigned int tag, unsigned int offset) { while (node) { if (!tag_get(node, tag, offset)) return; tag_clear(node, tag, offset); if (any_tag_set(node, tag)) return; offset = node->offset; node = node->parent; } /* clear the root's tag bit */ if (root_tag_get(root, tag)) root_tag_clear(root, tag); } /** * radix_tree_tag_clear - clear a tag on a radix tree node * @root: radix tree root * @index: index key * @tag: tag index * * Clear the search tag (which must be < RADIX_TREE_MAX_TAGS) * corresponding to @index in the radix tree. If this causes * the leaf node to have no tags set then clear the tag in the * next-to-leaf node, etc. * * Returns the address of the tagged item on success, else NULL. ie: * has the same return value and semantics as radix_tree_lookup(). 
*/ void *radix_tree_tag_clear(struct radix_tree_root *root, unsigned long index, unsigned int tag) { struct radix_tree_node *node, *parent; unsigned long maxindex; int offset = 0; radix_tree_load_root(root, &node, &maxindex); if (index > maxindex) return NULL; parent = NULL; while (radix_tree_is_internal_node(node)) { parent = entry_to_node(node); offset = radix_tree_descend(parent, &node, index); } if (node) node_tag_clear(root, parent, tag, offset); return node; } EXPORT_SYMBOL(radix_tree_tag_clear); /** * radix_tree_iter_tag_clear - clear a tag on the current iterator entry * @root: radix tree root * @iter: iterator state * @tag: tag to clear */ void radix_tree_iter_tag_clear(struct radix_tree_root *root, const struct radix_tree_iter *iter, unsigned int tag) { node_tag_clear(root, iter->node, tag, iter_offset(iter)); } /** * radix_tree_tag_get - get a tag on a radix tree node * @root: radix tree root * @index: index key * @tag: tag index (< RADIX_TREE_MAX_TAGS) * * Return values: * * 0: tag not present or not set * 1: tag set * * Note that the return value of this function may not be relied on, even if * the RCU lock is held, unless tag modification and node deletion are excluded * from concurrency. 
*/
int radix_tree_tag_get(const struct radix_tree_root *root,
			unsigned long index, unsigned int tag)
{
	struct radix_tree_node *node, *parent;
	unsigned long maxindex;

	/* Nothing in the tree carries @tag if the root tag is clear. */
	if (!root_tag_get(root, tag))
		return 0;

	radix_tree_load_root(root, &node, &maxindex);
	if (index > maxindex)
		return 0;

	while (radix_tree_is_internal_node(node)) {
		unsigned offset;

		parent = entry_to_node(node);
		offset = radix_tree_descend(parent, &node, index);

		/* The tag must be set at every level on the way down. */
		if (!tag_get(parent, tag, offset))
			return 0;
		if (node == RADIX_TREE_RETRY)
			break;
	}

	return 1;
}
EXPORT_SYMBOL(radix_tree_tag_get);

/*
 * Construct iter->tags bit-mask from node->tags[tag] array
 *
 * Without a node only bit 0 of iter->tags is set.  Otherwise the mask
 * starts at the bit for @offset; when that bit is not in the last long of
 * the tag array, the low bits of the following long are merged in and
 * iter->next_index is clipped to BITS_PER_LONG slots past the current
 * iterator position.
 */
static void set_iter_tags(struct radix_tree_iter *iter,
				struct radix_tree_node *node, unsigned offset,
				unsigned tag)
{
	unsigned tag_long = offset / BITS_PER_LONG;
	unsigned tag_bit  = offset % BITS_PER_LONG;

	if (!node) {
		iter->tags = 1;
		return;
	}

	iter->tags = node->tags[tag][tag_long] >> tag_bit;

	/* This never happens if RADIX_TREE_TAG_LONGS == 1 */
	if (tag_long < RADIX_TREE_TAG_LONGS - 1) {
		/* Pick tags from next element */
		if (tag_bit)
			iter->tags |= node->tags[tag][tag_long + 1] <<
						(BITS_PER_LONG - tag_bit);
		/* Clip chunk size, here only BITS_PER_LONG tags */
		iter->next_index = __radix_tree_iter_add(iter, BITS_PER_LONG);
	}
}

/*
 * radix_tree_iter_resume - move the iterator one index forward and reset it
 * @slot: unused by this implementation
 * @iter: iterator state
 *
 * Sets iter->index and iter->next_index to the index following the current
 * one and clears iter->tags.  Always returns NULL.
 */
void __rcu **radix_tree_iter_resume(void __rcu **slot,
					struct radix_tree_iter *iter)
{
	iter->index = __radix_tree_iter_add(iter, 1);
	iter->next_index = iter->index;
	iter->tags = 0;
	return NULL;
}
EXPORT_SYMBOL(radix_tree_iter_resume);

/**
 * radix_tree_next_chunk - find next chunk of slots for iteration
 *
 * @root:	radix tree root
 * @iter:	iterator state
 * @flags:	RADIX_TREE_ITER_* flags and tag index
 * Returns:	pointer to chunk first slot, or NULL if iteration is over
 */
void __rcu **radix_tree_next_chunk(const struct radix_tree_root *root,
			     struct radix_tree_iter *iter, unsigned flags)
{
	unsigned tag = flags & RADIX_TREE_ITER_TAG_MASK;
	struct radix_tree_node *node, *child;
	unsigned long index, offset, maxindex;

	/* A tagged walk cannot find anything if the root tag is clear. */
	if ((flags & RADIX_TREE_ITER_TAGGED) && !root_tag_get(root, tag))
		return NULL;

	/*
	 * Catch next_index overflow after ~0UL. iter->index never overflows
	 * during iterating; it can be zero only at the beginning.
	 * And we cannot overflow iter->next_index in a single step,
	 * because RADIX_TREE_MAP_SHIFT < BITS_PER_LONG.
	 *
	 * This condition is also used by radix_tree_next_slot() to stop
	 * contiguous iterating, and forbid switching to the next chunk.
	 */
	index = iter->next_index;
	if (!index && iter->index)
		return NULL;

 restart:
	radix_tree_load_root(root, &child, &maxindex);
	if (index > maxindex)
		return NULL;
	if (!child)
		return NULL;

	if (!radix_tree_is_internal_node(child)) {
		/* Single-slot tree */
		iter->index = index;
		iter->next_index = maxindex + 1;
		iter->tags = 1;
		iter->node = NULL;
		return (void __rcu **)&root->xa_head;
	}

	do {
		node = entry_to_node(child);
		offset = radix_tree_descend(node, &child, index);

		if ((flags & RADIX_TREE_ITER_TAGGED) ?
				!tag_get(node, tag, offset) : !child) {
			/* Hole detected */
			if (flags & RADIX_TREE_ITER_CONTIG)
				return NULL;

			/* Scan this node for the next tagged / present slot. */
			if (flags & RADIX_TREE_ITER_TAGGED)
				offset = radix_tree_find_next_bit(node, tag,
						offset + 1);
			else
				while (++offset	< RADIX_TREE_MAP_SIZE) {
					void *slot = rcu_dereference_raw(
							node->slots[offset]);
					if (slot)
						break;
				}
			/* Recompute the index for the slot we moved to. */
			index &= ~node_maxindex(node);
			index += offset << node->shift;
			/* Overflow after ~0UL */
			if (!index)
				return NULL;
			/* Nothing left in this node: search again from the root. */
			if (offset == RADIX_TREE_MAP_SIZE)
				goto restart;
			child = rcu_dereference_raw(node->slots[offset]);
		}

		if (!child)
			goto restart;
		if (child == RADIX_TREE_RETRY)
			break;
	} while (node->shift && radix_tree_is_internal_node(child));

	/* Update the iterator state */
	iter->index = (index &~ node_maxindex(node)) | offset;
	iter->next_index = (index | node_maxindex(node)) + 1;
	iter->node = node;

	if (flags & RADIX_TREE_ITER_TAGGED)
		set_iter_tags(iter, node, offset, tag);

	return node->slots + offset;
}
EXPORT_SYMBOL(radix_tree_next_chunk);

/**
 *	radix_tree_gang_lookup - perform multiple lookup on a radix tree
 *	@root:		radix tree root
 *
@results: where the results of the lookup are placed * @first_index: start the lookup from this key * @max_items: place up to this many items at *results * * Performs an index-ascending scan of the tree for present items. Places * them at *@results and returns the number of items which were placed at * *@results. * * The implementation is naive. * * Like radix_tree_lookup, radix_tree_gang_lookup may be called under * rcu_read_lock. In this case, rather than the returned results being * an atomic snapshot of the tree at a single point in time, the * semantics of an RCU protected gang lookup are as though multiple * radix_tree_lookups have been issued in individual locks, and results * stored in 'results'. */ unsigned int radix_tree_gang_lookup(const struct radix_tree_root *root, void **results, unsigned long first_index, unsigned int max_items) { struct radix_tree_iter iter; void __rcu **slot; unsigned int ret = 0; if (unlikely(!max_items)) return 0; radix_tree_for_each_slot(slot, root, &iter, first_index) { results[ret] = rcu_dereference_raw(*slot); if (!results[ret]) continue; if (radix_tree_is_internal_node(results[ret])) { slot = radix_tree_iter_retry(&iter); continue; } if (++ret == max_items) break; } return ret; } EXPORT_SYMBOL(radix_tree_gang_lookup); /** * radix_tree_gang_lookup_tag - perform multiple lookup on a radix tree * based on a tag * @root: radix tree root * @results: where the results of the lookup are placed * @first_index: start the lookup from this key * @max_items: place up to this many items at *results * @tag: the tag index (< RADIX_TREE_MAX_TAGS) * * Performs an index-ascending scan of the tree for present items which * have the tag indexed by @tag set. Places the items at *@results and * returns the number of items which were placed at *@results. 
*/ unsigned int radix_tree_gang_lookup_tag(const struct radix_tree_root *root, void **results, unsigned long first_index, unsigned int max_items, unsigned int tag) { struct radix_tree_iter iter; void __rcu **slot; unsigned int ret = 0; if (unlikely(!max_items)) return 0; radix_tree_for_each_tagged(slot, root, &iter, first_index, tag) { results[ret] = rcu_dereference_raw(*slot); if (!results[ret]) continue; if (radix_tree_is_internal_node(results[ret])) { slot = radix_tree_iter_retry(&iter); continue; } if (++ret == max_items) break; } return ret; } EXPORT_SYMBOL(radix_tree_gang_lookup_tag); /** * radix_tree_gang_lookup_tag_slot - perform multiple slot lookup on a * radix tree based on a tag * @root: radix tree root * @results: where the results of the lookup are placed * @first_index: start the lookup from this key * @max_items: place up to this many items at *results * @tag: the tag index (< RADIX_TREE_MAX_TAGS) * * Performs an index-ascending scan of the tree for present items which * have the tag indexed by @tag set. Places the slots at *@results and * returns the number of slots which were placed at *@results. */ unsigned int radix_tree_gang_lookup_tag_slot(const struct radix_tree_root *root, void __rcu ***results, unsigned long first_index, unsigned int max_items, unsigned int tag) { struct radix_tree_iter iter; void __rcu **slot; unsigned int ret = 0; if (unlikely(!max_items)) return 0; radix_tree_for_each_tagged(slot, root, &iter, first_index, tag) { results[ret] = slot; if (++ret == max_items) break; } return ret; } EXPORT_SYMBOL(radix_tree_gang_lookup_tag_slot); static bool __radix_tree_delete(struct radix_tree_root *root, struct radix_tree_node *node, void __rcu **slot) { void *old = rcu_dereference_raw(*slot); int values = xa_is_value(old) ? 
-1 : 0; unsigned offset = get_slot_offset(node, slot); int tag; if (is_idr(root)) node_tag_set(root, node, IDR_FREE, offset); else for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) node_tag_clear(root, node, tag, offset); replace_slot(slot, NULL, node, -1, values); return node && delete_node(root, node); } /** * radix_tree_iter_delete - delete the entry at this iterator position * @root: radix tree root * @iter: iterator state * @slot: pointer to slot * * Delete the entry at the position currently pointed to by the iterator. * This may result in the current node being freed; if it is, the iterator * is advanced so that it will not reference the freed memory. This * function may be called without any locking if there are no other threads * which can access this tree. */ void radix_tree_iter_delete(struct radix_tree_root *root, struct radix_tree_iter *iter, void __rcu **slot) { if (__radix_tree_delete(root, iter->node, slot)) iter->index = iter->next_index; } EXPORT_SYMBOL(radix_tree_iter_delete); /** * radix_tree_delete_item - delete an item from a radix tree * @root: radix tree root * @index: index key * @item: expected item * * Remove @item at @index from the radix tree rooted at @root. * * Return: the deleted entry, or %NULL if it was not present * or the entry at the given @index was not @item. */ void *radix_tree_delete_item(struct radix_tree_root *root, unsigned long index, void *item) { struct radix_tree_node *node = NULL; void __rcu **slot = NULL; void *entry; entry = __radix_tree_lookup(root, index, &node, &slot); if (!slot) return NULL; if (!entry && (!is_idr(root) || node_tag_get(root, node, IDR_FREE, get_slot_offset(node, slot)))) return NULL; if (item && entry != item) return NULL; __radix_tree_delete(root, node, slot); return entry; } EXPORT_SYMBOL(radix_tree_delete_item); /** * radix_tree_delete - delete an entry from a radix tree * @root: radix tree root * @index: index key * * Remove the entry at @index from the radix tree rooted at @root. 
*
 * Return: The deleted entry, or %NULL if it was not present.
 */
void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
{
	/* A NULL expected item means "delete whatever is stored there". */
	return radix_tree_delete_item(root, index, NULL);
}
EXPORT_SYMBOL(radix_tree_delete);

/**
 *	radix_tree_tagged - test whether any items in the tree are tagged
 *	@root:		radix tree root
 *	@tag:		tag to test
 *
 *	Returns the root's tag state as reported by root_tag_get().
 */
int radix_tree_tagged(const struct radix_tree_root *root, unsigned int tag)
{
	return root_tag_get(root, tag);
}
EXPORT_SYMBOL(radix_tree_tagged);

/**
 * idr_preload - preload for idr_alloc()
 * @gfp_mask: allocation mask to use for preloading
 *
 * Preallocate memory to use for the next call to idr_alloc().  This function
 * returns with preemption disabled.  It will be enabled by idr_preload_end().
 */
void idr_preload(gfp_t gfp_mask)
{
	/*
	 * NOTE(review): the lock is taken here only when the preload reports
	 * non-zero; presumably the zero (success) path already returns with
	 * it held -- confirm against __radix_tree_preload().
	 */
	if (__radix_tree_preload(gfp_mask, IDR_PRELOAD_SIZE))
		local_lock(&radix_tree_preloads.lock);
}
EXPORT_SYMBOL(idr_preload);

/*
 * idr_get_free - find (creating nodes as needed) a slot tagged IDR_FREE
 * @root: radix tree root of the IDR
 * @iter: iterator; the search starts at iter->next_index and the iterator
 *        is updated to describe the slot that was found
 * @gfp:  allocation flags for any nodes that have to be added
 * @max:  highest index the caller will accept
 *
 * Returns the slot on success, or an ERR_PTR(): -ENOSPC when no free index
 * up to @max exists, -ENOMEM when a node allocation fails, or the error
 * reported by radix_tree_extend().
 */
void __rcu **idr_get_free(struct radix_tree_root *root,
			      struct radix_tree_iter *iter, gfp_t gfp,
			      unsigned long max)
{
	struct radix_tree_node *node = NULL, *child;
	void __rcu **slot = (void __rcu **)&root->xa_head;
	unsigned long maxindex, start = iter->next_index;
	unsigned int shift, offset = 0;

 grow:
	shift = radix_tree_load_root(root, &child, &maxindex);
	/* No free slot inside the current tree: look beyond its range. */
	if (!radix_tree_tagged(root, IDR_FREE))
		start = max(start, maxindex + 1);
	if (start > max)
		return ERR_PTR(-ENOSPC);

	if (start > maxindex) {
		int error = radix_tree_extend(root, gfp, start, shift);
		if (error < 0)
			return ERR_PTR(error);
		/* A non-negative result is used as the new root shift. */
		shift = error;
		child = rcu_dereference_raw(root->xa_head);
	}
	if (start == 0 && shift == 0)
		shift = RADIX_TREE_MAP_SHIFT;

	while (shift) {
		shift -= RADIX_TREE_MAP_SHIFT;
		if (child == NULL) {
			/* Have to add a child node.  */
			child = radix_tree_node_alloc(gfp, node, root, shift,
							offset, 0, 0);
			if (!child)
				return ERR_PTR(-ENOMEM);
			/* Every slot of a brand-new node is free. */
			all_tag_set(child, IDR_FREE);
			rcu_assign_pointer(*slot, node_to_entry(child));
			if (node)
				node->count++;
		} else if (!radix_tree_is_internal_node(child))
			break;

		node = entry_to_node(child);
		offset = radix_tree_descend(node, &child, start);
		if (!tag_get(node, IDR_FREE, offset)) {
			/* This slot is taken: move to the next free one. */
			offset = radix_tree_find_next_bit(node, IDR_FREE,
							offset + 1);
			start = next_index(start, node, offset);
			if (start > max || start == 0)
				return ERR_PTR(-ENOSPC);
			/* Node exhausted: climb until a level has room left. */
			while (offset == RADIX_TREE_MAP_SIZE) {
				offset = node->offset + 1;
				node = node->parent;
				if (!node)
					goto grow;
				shift = node->shift;
			}
			child = rcu_dereference_raw(node->slots[offset]);
		}
		slot = &node->slots[offset];
	}

	iter->index = start;
	if (node)
		iter->next_index = 1 + min(max, (start | node_maxindex(node)));
	else
		iter->next_index = 1;
	iter->node = node;
	set_iter_tags(iter, node, offset, IDR_FREE);

	return slot;
}

/**
 * idr_destroy - release all internal memory from an IDR
 * @idr: idr handle
 *
 * After this function is called, the IDR is empty, and may be reused or
 * the data structure containing it may be freed.
 *
 * A typical clean-up sequence for objects stored in an idr tree will use
 * idr_for_each() to free all objects, if necessary, then idr_destroy() to
 * free the memory used to keep track of those objects.
*/ void idr_destroy(struct idr *idr) { struct radix_tree_node *node = rcu_dereference_raw(idr->idr_rt.xa_head); if (radix_tree_is_internal_node(node)) radix_tree_free_nodes(node); idr->idr_rt.xa_head = NULL; root_tag_set(&idr->idr_rt, IDR_FREE); } EXPORT_SYMBOL(idr_destroy); static void radix_tree_node_ctor(void *arg) { struct radix_tree_node *node = arg; memset(node, 0, sizeof(*node)); INIT_LIST_HEAD(&node->private_list); } static int radix_tree_cpu_dead(unsigned int cpu) { struct radix_tree_preload *rtp; struct radix_tree_node *node; /* Free per-cpu pool of preloaded nodes */ rtp = &per_cpu(radix_tree_preloads, cpu); while (rtp->nr) { node = rtp->nodes; rtp->nodes = node->parent; kmem_cache_free(radix_tree_node_cachep, node); rtp->nr--; } return 0; } void __init radix_tree_init(void) { int ret; BUILD_BUG_ON(RADIX_TREE_MAX_TAGS + __GFP_BITS_SHIFT > 32); BUILD_BUG_ON(ROOT_IS_IDR & ~GFP_ZONEMASK); BUILD_BUG_ON(XA_CHUNK_SIZE > 255); radix_tree_node_cachep = kmem_cache_create("radix_tree_node", sizeof(struct radix_tree_node), 0, SLAB_PANIC | SLAB_RECLAIM_ACCOUNT, radix_tree_node_ctor); ret = cpuhp_setup_state_nocalls(CPUHP_RADIX_DEAD, "lib/radix:dead", NULL, radix_tree_cpu_dead); WARN_ON(ret < 0); } |
1164 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 |
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Because linux/module.h has tracepoints in the header, and ftrace.h
 * used to include this file, define_trace.h includes linux/module.h
 * But we do not want the module.h to override the TRACE_SYSTEM macro
 * variable that define_trace.h is processing, so we only set it
 * when module events are being processed, which would happen when
 * CREATE_TRACE_POINTS is defined.
 */
#ifdef CREATE_TRACE_POINTS
#undef TRACE_SYSTEM
#define TRACE_SYSTEM module
#endif

#if !defined(_TRACE_MODULE_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_MODULE_H

#include <linux/tracepoint.h>

#ifdef CONFIG_MODULES

struct module;

/* Render a module's taint bits as the letters P, O, F, C and E. */
#define show_module_flags(flags) __print_flags(flags, "",	\
	{ (1UL << TAINT_PROPRIETARY_MODULE),	"P" },		\
	{ (1UL << TAINT_OOT_MODULE),		"O" },		\
	{ (1UL << TAINT_FORCED_MODULE),		"F" },		\
	{ (1UL << TAINT_CRAP),			"C" },		\
	{ (1UL << TAINT_UNSIGNED_MODULE),	"E" })

/* module_load: records the module's name and taint flags. */
TRACE_EVENT(module_load,

	TP_PROTO(struct module *mod),

	TP_ARGS(mod),

	TP_STRUCT__entry(
		__field(	unsigned int,	taints		)
		__string(	name,		mod->name	)
	),

	TP_fast_assign(
		__entry->taints = mod->taints;
		__assign_str(name);
	),

	TP_printk("%s %s", __get_str(name), show_module_flags(__entry->taints))
);

/* module_free: records only the module's name. */
TRACE_EVENT(module_free,

	TP_PROTO(struct module *mod),

	TP_ARGS(mod),

	TP_STRUCT__entry(
		__string(	name,		mod->name	)
	),

	TP_fast_assign(
		__assign_str(name);
	),

	TP_printk("%s", __get_str(name))
);

#ifdef CONFIG_MODULE_UNLOAD
/* trace_module_get/put are only used if CONFIG_MODULE_UNLOAD is defined */

/*
 * Shared layout for the module_get and module_put events: the caller's
 * address, the module's refcnt as read when the event fires, and the
 * module name.
 */
DECLARE_EVENT_CLASS(module_refcnt,

	TP_PROTO(struct module *mod, unsigned long ip),

	TP_ARGS(mod, ip),

	TP_STRUCT__entry(
		__field(	unsigned long,	ip		)
		__field(	int,		refcnt		)
		__string(	name,		mod->name	)
	),

	TP_fast_assign(
		__entry->ip	= ip;
		__entry->refcnt	= atomic_read(&mod->refcnt);
		__assign_str(name);
	),

	TP_printk("%s call_site=%ps refcnt=%d",
		  __get_str(name), (void *)__entry->ip, __entry->refcnt)
);

DEFINE_EVENT(module_refcnt, module_get,

	TP_PROTO(struct module *mod, unsigned long ip),

	TP_ARGS(mod, ip)
);

DEFINE_EVENT(module_refcnt, module_put,

	TP_PROTO(struct module *mod, unsigned long ip),

	TP_ARGS(mod, ip)
);
#endif /* CONFIG_MODULE_UNLOAD */

/* module_request: records the requested name, the wait flag and the caller. */
TRACE_EVENT(module_request,

	TP_PROTO(char *name, bool wait, unsigned long ip),

	TP_ARGS(name, wait, ip),

	TP_STRUCT__entry(
		__field(	unsigned long,	ip		)
		__field(	bool,		wait		)
		__string(	name,		name		)
	),

	TP_fast_assign(
		__entry->ip	= ip;
		__entry->wait	= wait;
		__assign_str(name);
	),

	TP_printk("%s wait=%d call_site=%ps",
		  __get_str(name), (int)__entry->wait, (void *)__entry->ip)
);

#endif /* CONFIG_MODULES */

#endif /* _TRACE_MODULE_H */

/* This part must be outside protection */
#include <trace/define_trace.h>
|
3 3 6 2 1 3 2 2 1 1 1 1 2 1 1 1 3 2 2 36 36 1 62 61 129 129 67 69 77 77 16 16 106 106 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 
506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 | // SPDX-License-Identifier: GPL-2.0-or-later /* * NetLabel CALIPSO/IPv6 Support * * This file defines the CALIPSO/IPv6 functions for the NetLabel system. The * NetLabel system manages static and dynamic label mappings for network * protocols such as CIPSO and CALIPSO. 
*
 * Authors: Paul Moore <paul@paul-moore.com>
 *          Huw Davies <huw@codeweavers.com>
 */

/* (c) Copyright Hewlett-Packard Development Company, L.P., 2006
 * (c) Copyright Huw Davies <huw@codeweavers.com>, 2015
 */

#include <linux/types.h>
#include <linux/socket.h>
#include <linux/string.h>
#include <linux/skbuff.h>
#include <linux/audit.h>
#include <linux/slab.h>
#include <net/sock.h>
#include <net/netlink.h>
#include <net/genetlink.h>
#include <net/netlabel.h>
#include <net/calipso.h>
#include <linux/atomic.h>

#include "netlabel_user.h"
#include "netlabel_calipso.h"
#include "netlabel_mgmt.h"
#include "netlabel_domainhash.h"

/* Argument struct for calipso_doi_walk() */
struct netlbl_calipso_doiwalk_arg {
	struct netlink_callback *nl_cb;	/* netlink callback of the request */
	struct sk_buff *skb;		/* presumably the reply buffer -- confirm at the user */
	u32 seq;			/* presumably the netlink sequence number -- confirm */
};

/* Argument struct for netlbl_domhsh_walk() */
struct netlbl_domhsh_walk_arg {
	struct netlbl_audit *audit_info;	/* audit information handed to the walker */
	u32 doi;				/* DOI value of interest */
};

/* NetLabel Generic NETLINK CALIPSO family */
static struct genl_family netlbl_calipso_gnl_family;

/* NetLabel Netlink attribute policy: both attributes are 32-bit values */
static const struct nla_policy calipso_genl_policy[NLBL_CALIPSO_A_MAX + 1] = {
	[NLBL_CALIPSO_A_DOI] = { .type = NLA_U32 },
	[NLBL_CALIPSO_A_MTYPE] = { .type = NLA_U32 },
};

/* Currently registered CALIPSO operations; NULL until something registers. */
static const struct netlbl_calipso_ops *calipso_ops;

/**
 * netlbl_calipso_ops_register - Register the CALIPSO operations
 * @ops: ops to register
 *
 * Description:
 * Register the CALIPSO packet engine operations.
* */ const struct netlbl_calipso_ops * netlbl_calipso_ops_register(const struct netlbl_calipso_ops *ops) { return xchg(&calipso_ops, ops); } EXPORT_SYMBOL(netlbl_calipso_ops_register); static const struct netlbl_calipso_ops *netlbl_calipso_ops_get(void) { return READ_ONCE(calipso_ops); } /* NetLabel Command Handlers */ /** * netlbl_calipso_add_pass - Adds a CALIPSO pass DOI definition * @info: the Generic NETLINK info block * @audit_info: NetLabel audit information * * Description: * Create a new CALIPSO_MAP_PASS DOI definition based on the given ADD message * and add it to the CALIPSO engine. Return zero on success and non-zero on * error. * */ static int netlbl_calipso_add_pass(struct genl_info *info, struct netlbl_audit *audit_info) { int ret_val; struct calipso_doi *doi_def = NULL; doi_def = kmalloc(sizeof(*doi_def), GFP_KERNEL); if (!doi_def) return -ENOMEM; doi_def->type = CALIPSO_MAP_PASS; doi_def->doi = nla_get_u32(info->attrs[NLBL_CALIPSO_A_DOI]); ret_val = calipso_doi_add(doi_def, audit_info); if (ret_val != 0) calipso_doi_free(doi_def); return ret_val; } /** * netlbl_calipso_add - Handle an ADD message * @skb: the NETLINK buffer * @info: the Generic NETLINK info block * * Description: * Create a new DOI definition based on the given ADD message and add it to the * CALIPSO engine. Returns zero on success, negative values on failure. 
* */ static int netlbl_calipso_add(struct sk_buff *skb, struct genl_info *info) { int ret_val = -EINVAL; struct netlbl_audit audit_info; const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); if (!info->attrs[NLBL_CALIPSO_A_DOI] || !info->attrs[NLBL_CALIPSO_A_MTYPE]) return -EINVAL; if (!ops) return -EOPNOTSUPP; netlbl_netlink_auditinfo(&audit_info); switch (nla_get_u32(info->attrs[NLBL_CALIPSO_A_MTYPE])) { case CALIPSO_MAP_PASS: ret_val = netlbl_calipso_add_pass(info, &audit_info); break; } if (ret_val == 0) atomic_inc(&netlabel_mgmt_protocount); return ret_val; } /** * netlbl_calipso_list - Handle a LIST message * @skb: the NETLINK buffer * @info: the Generic NETLINK info block * * Description: * Process a user generated LIST message and respond accordingly. * Returns zero on success and negative values on error. * */ static int netlbl_calipso_list(struct sk_buff *skb, struct genl_info *info) { int ret_val; struct sk_buff *ans_skb = NULL; void *data; u32 doi; struct calipso_doi *doi_def; if (!info->attrs[NLBL_CALIPSO_A_DOI]) { ret_val = -EINVAL; goto list_failure; } doi = nla_get_u32(info->attrs[NLBL_CALIPSO_A_DOI]); doi_def = calipso_doi_getdef(doi); if (!doi_def) { ret_val = -EINVAL; goto list_failure; } ans_skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!ans_skb) { ret_val = -ENOMEM; goto list_failure_put; } data = genlmsg_put_reply(ans_skb, info, &netlbl_calipso_gnl_family, 0, NLBL_CALIPSO_C_LIST); if (!data) { ret_val = -ENOMEM; goto list_failure_put; } ret_val = nla_put_u32(ans_skb, NLBL_CALIPSO_A_MTYPE, doi_def->type); if (ret_val != 0) goto list_failure_put; calipso_doi_putdef(doi_def); genlmsg_end(ans_skb, data); return genlmsg_reply(ans_skb, info); list_failure_put: calipso_doi_putdef(doi_def); list_failure: kfree_skb(ans_skb); return ret_val; } /** * netlbl_calipso_listall_cb - calipso_doi_walk() callback for LISTALL * @doi_def: the CALIPSO DOI definition * @arg: the netlbl_calipso_doiwalk_arg structure * * Description: * This function 
is designed to be used as a callback to the * calipso_doi_walk() function for use in generating a response for a LISTALL * message. Returns the size of the message on success, negative values on * failure. * */ static int netlbl_calipso_listall_cb(struct calipso_doi *doi_def, void *arg) { int ret_val = -ENOMEM; struct netlbl_calipso_doiwalk_arg *cb_arg = arg; void *data; data = genlmsg_put(cb_arg->skb, NETLINK_CB(cb_arg->nl_cb->skb).portid, cb_arg->seq, &netlbl_calipso_gnl_family, NLM_F_MULTI, NLBL_CALIPSO_C_LISTALL); if (!data) goto listall_cb_failure; ret_val = nla_put_u32(cb_arg->skb, NLBL_CALIPSO_A_DOI, doi_def->doi); if (ret_val != 0) goto listall_cb_failure; ret_val = nla_put_u32(cb_arg->skb, NLBL_CALIPSO_A_MTYPE, doi_def->type); if (ret_val != 0) goto listall_cb_failure; genlmsg_end(cb_arg->skb, data); return 0; listall_cb_failure: genlmsg_cancel(cb_arg->skb, data); return ret_val; } /** * netlbl_calipso_listall - Handle a LISTALL message * @skb: the NETLINK buffer * @cb: the NETLINK callback * * Description: * Process a user generated LISTALL message and respond accordingly. Returns * zero on success and negative values on error. * */ static int netlbl_calipso_listall(struct sk_buff *skb, struct netlink_callback *cb) { struct netlbl_calipso_doiwalk_arg cb_arg; u32 doi_skip = cb->args[0]; cb_arg.nl_cb = cb; cb_arg.skb = skb; cb_arg.seq = cb->nlh->nlmsg_seq; calipso_doi_walk(&doi_skip, netlbl_calipso_listall_cb, &cb_arg); cb->args[0] = doi_skip; return skb->len; } /** * netlbl_calipso_remove_cb - netlbl_calipso_remove() callback for REMOVE * @entry: LSM domain mapping entry * @arg: the netlbl_domhsh_walk_arg structure * * Description: * This function is intended for use by netlbl_calipso_remove() as the callback * for the netlbl_domhsh_walk() function; it removes LSM domain map entries * which are associated with the CALIPSO DOI specified in @arg. Returns zero on * success, negative values on failure. 
* */ static int netlbl_calipso_remove_cb(struct netlbl_dom_map *entry, void *arg) { struct netlbl_domhsh_walk_arg *cb_arg = arg; if (entry->def.type == NETLBL_NLTYPE_CALIPSO && entry->def.calipso->doi == cb_arg->doi) return netlbl_domhsh_remove_entry(entry, cb_arg->audit_info); return 0; } /** * netlbl_calipso_remove - Handle a REMOVE message * @skb: the NETLINK buffer * @info: the Generic NETLINK info block * * Description: * Process a user generated REMOVE message and respond accordingly. Returns * zero on success, negative values on failure. * */ static int netlbl_calipso_remove(struct sk_buff *skb, struct genl_info *info) { int ret_val = -EINVAL; struct netlbl_domhsh_walk_arg cb_arg; struct netlbl_audit audit_info; u32 skip_bkt = 0; u32 skip_chain = 0; if (!info->attrs[NLBL_CALIPSO_A_DOI]) return -EINVAL; netlbl_netlink_auditinfo(&audit_info); cb_arg.doi = nla_get_u32(info->attrs[NLBL_CALIPSO_A_DOI]); cb_arg.audit_info = &audit_info; ret_val = netlbl_domhsh_walk(&skip_bkt, &skip_chain, netlbl_calipso_remove_cb, &cb_arg); if (ret_val == 0 || ret_val == -ENOENT) { ret_val = calipso_doi_remove(cb_arg.doi, &audit_info); if (ret_val == 0) atomic_dec(&netlabel_mgmt_protocount); } return ret_val; } /* NetLabel Generic NETLINK Command Definitions */ static const struct genl_small_ops netlbl_calipso_ops[] = { { .cmd = NLBL_CALIPSO_C_ADD, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = netlbl_calipso_add, .dumpit = NULL, }, { .cmd = NLBL_CALIPSO_C_REMOVE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = netlbl_calipso_remove, .dumpit = NULL, }, { .cmd = NLBL_CALIPSO_C_LIST, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = 0, .doit = netlbl_calipso_list, .dumpit = NULL, }, { .cmd = NLBL_CALIPSO_C_LISTALL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = 0, .doit = NULL, .dumpit = netlbl_calipso_listall, }, }; static struct 
genl_family netlbl_calipso_gnl_family __ro_after_init = { .hdrsize = 0, .name = NETLBL_NLTYPE_CALIPSO_NAME, .version = NETLBL_PROTO_VERSION, .maxattr = NLBL_CALIPSO_A_MAX, .policy = calipso_genl_policy, .module = THIS_MODULE, .small_ops = netlbl_calipso_ops, .n_small_ops = ARRAY_SIZE(netlbl_calipso_ops), .resv_start_op = NLBL_CALIPSO_C_LISTALL + 1, }; /* NetLabel Generic NETLINK Protocol Functions */ /** * netlbl_calipso_genl_init - Register the CALIPSO NetLabel component * * Description: * Register the CALIPSO packet NetLabel component with the Generic NETLINK * mechanism. Returns zero on success, negative values on failure. * */ int __init netlbl_calipso_genl_init(void) { return genl_register_family(&netlbl_calipso_gnl_family); } /** * calipso_doi_add - Add a new DOI to the CALIPSO protocol engine * @doi_def: the DOI structure * @audit_info: NetLabel audit information * * Description: * The caller defines a new DOI for use by the CALIPSO engine and calls this * function to add it to the list of acceptable domains. The caller must * ensure that the mapping table specified in @doi_def->map meets all of the * requirements of the mapping type (see calipso.h for details). Returns * zero on success and non-zero on failure. * */ int calipso_doi_add(struct calipso_doi *doi_def, struct netlbl_audit *audit_info) { int ret_val = -ENOMSG; const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); if (ops) ret_val = ops->doi_add(doi_def, audit_info); return ret_val; } /** * calipso_doi_free - Frees a DOI definition * @doi_def: the DOI definition * * Description: * This function frees all of the memory associated with a DOI definition. 
* */ void calipso_doi_free(struct calipso_doi *doi_def) { const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); if (ops) ops->doi_free(doi_def); } /** * calipso_doi_remove - Remove an existing DOI from the CALIPSO protocol engine * @doi: the DOI value * @audit_info: NetLabel audit information * * Description: * Removes a DOI definition from the CALIPSO engine. The NetLabel routines will * be called to release their own LSM domain mappings as well as our own * domain list. Returns zero on success and negative values on failure. * */ int calipso_doi_remove(u32 doi, struct netlbl_audit *audit_info) { int ret_val = -ENOMSG; const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); if (ops) ret_val = ops->doi_remove(doi, audit_info); return ret_val; } /** * calipso_doi_getdef - Returns a reference to a valid DOI definition * @doi: the DOI value * * Description: * Searches for a valid DOI definition and if one is found it is returned to * the caller. Otherwise NULL is returned. The caller must ensure that * calipso_doi_putdef() is called when the caller is done. * */ struct calipso_doi *calipso_doi_getdef(u32 doi) { struct calipso_doi *ret_val = NULL; const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); if (ops) ret_val = ops->doi_getdef(doi); return ret_val; } /** * calipso_doi_putdef - Releases a reference for the given DOI definition * @doi_def: the DOI definition * * Description: * Releases a DOI definition reference obtained from calipso_doi_getdef(). * */ void calipso_doi_putdef(struct calipso_doi *doi_def) { const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); if (ops) ops->doi_putdef(doi_def); } /** * calipso_doi_walk - Iterate through the DOI definitions * @skip_cnt: skip past this number of DOI definitions, updated * @callback: callback for each DOI definition * @cb_arg: argument for the callback function * * Description: * Iterate over the DOI definition list, skipping the first @skip_cnt entries. 
* For each entry call @callback, if @callback returns a negative value stop * 'walking' through the list and return. Updates the value in @skip_cnt upon * return. Returns zero on success, negative values on failure. * */ int calipso_doi_walk(u32 *skip_cnt, int (*callback)(struct calipso_doi *doi_def, void *arg), void *cb_arg) { int ret_val = -ENOMSG; const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); if (ops) ret_val = ops->doi_walk(skip_cnt, callback, cb_arg); return ret_val; } /** * calipso_sock_getattr - Get the security attributes from a sock * @sk: the sock * @secattr: the security attributes * * Description: * Query @sk to see if there is a CALIPSO option attached to the sock and if * there is return the CALIPSO security attributes in @secattr. This function * requires that @sk be locked, or privately held, but it does not do any * locking itself. Returns zero on success and negative values on failure. * */ int calipso_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr) { int ret_val = -ENOMSG; const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); if (ops) ret_val = ops->sock_getattr(sk, secattr); return ret_val; } /** * calipso_sock_setattr - Add a CALIPSO option to a socket * @sk: the socket * @doi_def: the CALIPSO DOI to use * @secattr: the specific security attributes of the socket * * Description: * Set the CALIPSO option on the given socket using the DOI definition and * security attributes passed to the function. This function requires * exclusive access to @sk, which means it either needs to be in the * process of being created or locked. Returns zero on success and negative * values on failure. 
* */ int calipso_sock_setattr(struct sock *sk, const struct calipso_doi *doi_def, const struct netlbl_lsm_secattr *secattr) { int ret_val = -ENOMSG; const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); if (ops) ret_val = ops->sock_setattr(sk, doi_def, secattr); return ret_val; } /** * calipso_sock_delattr - Delete the CALIPSO option from a socket * @sk: the socket * * Description: * Removes the CALIPSO option from a socket, if present. * */ void calipso_sock_delattr(struct sock *sk) { const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); if (ops) ops->sock_delattr(sk); } /** * calipso_req_setattr - Add a CALIPSO option to a connection request socket * @req: the connection request socket * @doi_def: the CALIPSO DOI to use * @secattr: the specific security attributes of the socket * * Description: * Set the CALIPSO option on the given socket using the DOI definition and * security attributes passed to the function. Returns zero on success and * negative values on failure. * */ int calipso_req_setattr(struct request_sock *req, const struct calipso_doi *doi_def, const struct netlbl_lsm_secattr *secattr) { int ret_val = -ENOMSG; const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); if (ops) ret_val = ops->req_setattr(req, doi_def, secattr); return ret_val; } /** * calipso_req_delattr - Delete the CALIPSO option from a request socket * @req: the request socket * * Description: * Removes the CALIPSO option from a request socket, if present. * */ void calipso_req_delattr(struct request_sock *req) { const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); if (ops) ops->req_delattr(req); } /** * calipso_optptr - Find the CALIPSO option in the packet * @skb: the packet * * Description: * Parse the packet's IP header looking for a CALIPSO option. Returns a pointer * to the start of the CALIPSO option on success, NULL if one if not found. 
* */ unsigned char *calipso_optptr(const struct sk_buff *skb) { unsigned char *ret_val = NULL; const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); if (ops) ret_val = ops->skbuff_optptr(skb); return ret_val; } /** * calipso_getattr - Get the security attributes from a memory block. * @calipso: the CALIPSO option * @secattr: the security attributes * * Description: * Inspect @calipso and return the security attributes in @secattr. * Returns zero on success and negative values on failure. * */ int calipso_getattr(const unsigned char *calipso, struct netlbl_lsm_secattr *secattr) { int ret_val = -ENOMSG; const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); if (ops) ret_val = ops->opt_getattr(calipso, secattr); return ret_val; } /** * calipso_skbuff_setattr - Set the CALIPSO option on a packet * @skb: the packet * @doi_def: the CALIPSO DOI to use * @secattr: the security attributes * * Description: * Set the CALIPSO option on the given packet based on the security attributes. * Returns a pointer to the IP header on success and NULL on failure. * */ int calipso_skbuff_setattr(struct sk_buff *skb, const struct calipso_doi *doi_def, const struct netlbl_lsm_secattr *secattr) { int ret_val = -ENOMSG; const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); if (ops) ret_val = ops->skbuff_setattr(skb, doi_def, secattr); return ret_val; } /** * calipso_skbuff_delattr - Delete any CALIPSO options from a packet * @skb: the packet * * Description: * Removes any and all CALIPSO options from the given packet. Returns zero on * success, negative values on failure. * */ int calipso_skbuff_delattr(struct sk_buff *skb) { int ret_val = -ENOMSG; const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); if (ops) ret_val = ops->skbuff_delattr(skb); return ret_val; } /** * calipso_cache_invalidate - Invalidates the current CALIPSO cache * * Description: * Invalidates and frees any entries in the CALIPSO cache. 
Returns zero on * success and negative values on failure. * */ void calipso_cache_invalidate(void) { const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); if (ops) ops->cache_invalidate(); } /** * calipso_cache_add - Add an entry to the CALIPSO cache * @calipso_ptr: the CALIPSO option * @secattr: the packet's security attributes * * Description: * Add a new entry into the CALIPSO label mapping cache. * Returns zero on success, negative values on failure. * */ int calipso_cache_add(const unsigned char *calipso_ptr, const struct netlbl_lsm_secattr *secattr) { int ret_val = -ENOMSG; const struct netlbl_calipso_ops *ops = netlbl_calipso_ops_get(); if (ops) ret_val = ops->cache_add(calipso_ptr, secattr); return ret_val; } |
16 16 16 16 1 9 16 9 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Scatterlist Cryptographic API. * * Procfs information. * * Copyright (c) 2002 James Morris <jmorris@intercode.com.au> * Copyright (c) 2005 Herbert Xu <herbert@gondor.apana.org.au> */ #include <linux/atomic.h> #include <linux/init.h> #include <linux/crypto.h> #include <linux/fips.h> #include <linux/module.h> /* for module_name() */ #include <linux/rwsem.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include "internal.h" static void *c_start(struct seq_file *m, loff_t *pos) { down_read(&crypto_alg_sem); return seq_list_start(&crypto_alg_list, *pos); } static void *c_next(struct seq_file *m, void *p, loff_t *pos) { return seq_list_next(p, &crypto_alg_list, pos); } static void c_stop(struct seq_file *m, void *p) { up_read(&crypto_alg_sem); } static int c_show(struct seq_file *m, void *p) { struct crypto_alg *alg = list_entry(p, struct crypto_alg, cra_list); seq_printf(m, "name : %s\n", alg->cra_name); seq_printf(m, "driver : %s\n", alg->cra_driver_name); seq_printf(m, "module : %s\n", module_name(alg->cra_module)); seq_printf(m, "priority : %d\n", alg->cra_priority); seq_printf(m, "refcnt : %u\n", refcount_read(&alg->cra_refcnt)); seq_printf(m, "selftest : %s\n", (alg->cra_flags & CRYPTO_ALG_TESTED) ? 
"passed" : "unknown"); seq_printf(m, "internal : %s\n", str_yes_no(alg->cra_flags & CRYPTO_ALG_INTERNAL)); if (fips_enabled) seq_printf(m, "fips : %s\n", str_no_yes(alg->cra_flags & CRYPTO_ALG_FIPS_INTERNAL)); if (alg->cra_flags & CRYPTO_ALG_LARVAL) { seq_printf(m, "type : larval\n"); seq_printf(m, "flags : 0x%x\n", alg->cra_flags); goto out; } if (alg->cra_type && alg->cra_type->show) { alg->cra_type->show(m, alg); goto out; } switch (alg->cra_flags & CRYPTO_ALG_TYPE_MASK) { case CRYPTO_ALG_TYPE_CIPHER: seq_printf(m, "type : cipher\n"); seq_printf(m, "blocksize : %u\n", alg->cra_blocksize); seq_printf(m, "min keysize : %u\n", alg->cra_cipher.cia_min_keysize); seq_printf(m, "max keysize : %u\n", alg->cra_cipher.cia_max_keysize); break; default: seq_printf(m, "type : unknown\n"); break; } out: seq_putc(m, '\n'); return 0; } static const struct seq_operations crypto_seq_ops = { .start = c_start, .next = c_next, .stop = c_stop, .show = c_show }; void __init crypto_init_proc(void) { proc_create_seq("crypto", 0, NULL, &crypto_seq_ops); } void __exit crypto_exit_proc(void) { remove_proc_entry("crypto", NULL); } |
2 10 4 1 1 2 8 1 2 12 20 2 4 14 14 1 1 1 1 1 9 1 1 7 3 6 1 5 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 
511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 | // SPDX-License-Identifier: GPL-2.0-or-later /* * (C) 2012 Pablo Neira Ayuso <pablo@netfilter.org> * * This software has been sponsored by Vyatta Inc. 
<http://www.vyatta.com> */ #include <linux/init.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/netlink.h> #include <linux/rculist.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/list.h> #include <linux/errno.h> #include <linux/capability.h> #include <net/netlink.h> #include <net/sock.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_expect.h> #include <net/netfilter/nf_conntrack_ecache.h> #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_conntrack.h> #include <linux/netfilter/nfnetlink_cthelper.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); MODULE_DESCRIPTION("nfnl_cthelper: User-space connection tracking helpers"); struct nfnl_cthelper { struct list_head list; struct nf_conntrack_helper helper; }; static LIST_HEAD(nfnl_cthelper_list); static int nfnl_userspace_cthelper(struct sk_buff *skb, unsigned int protoff, struct nf_conn *ct, enum ip_conntrack_info ctinfo) { const struct nf_conn_help *help; struct nf_conntrack_helper *helper; help = nfct_help(ct); if (help == NULL) return NF_DROP; /* rcu_read_lock()ed by nf_hook_thresh */ helper = rcu_dereference(help->helper); if (helper == NULL) return NF_DROP; /* This is a user-space helper not yet configured, skip. */ if ((helper->flags & (NF_CT_HELPER_F_USERSPACE | NF_CT_HELPER_F_CONFIGURED)) == NF_CT_HELPER_F_USERSPACE) return NF_ACCEPT; /* If the user-space helper is not available, don't block traffic. 
*/ return NF_QUEUE_NR(helper->queue_num) | NF_VERDICT_FLAG_QUEUE_BYPASS; } static const struct nla_policy nfnl_cthelper_tuple_pol[NFCTH_TUPLE_MAX+1] = { [NFCTH_TUPLE_L3PROTONUM] = { .type = NLA_U16, }, [NFCTH_TUPLE_L4PROTONUM] = { .type = NLA_U8, }, }; static int nfnl_cthelper_parse_tuple(struct nf_conntrack_tuple *tuple, const struct nlattr *attr) { int err; struct nlattr *tb[NFCTH_TUPLE_MAX+1]; err = nla_parse_nested_deprecated(tb, NFCTH_TUPLE_MAX, attr, nfnl_cthelper_tuple_pol, NULL); if (err < 0) return err; if (!tb[NFCTH_TUPLE_L3PROTONUM] || !tb[NFCTH_TUPLE_L4PROTONUM]) return -EINVAL; /* Not all fields are initialized so first zero the tuple */ memset(tuple, 0, sizeof(struct nf_conntrack_tuple)); tuple->src.l3num = ntohs(nla_get_be16(tb[NFCTH_TUPLE_L3PROTONUM])); tuple->dst.protonum = nla_get_u8(tb[NFCTH_TUPLE_L4PROTONUM]); return 0; } static int nfnl_cthelper_from_nlattr(struct nlattr *attr, struct nf_conn *ct) { struct nf_conn_help *help = nfct_help(ct); const struct nf_conntrack_helper *helper; if (attr == NULL) return -EINVAL; helper = rcu_dereference(help->helper); if (!helper || helper->data_len == 0) return -EINVAL; nla_memcpy(help->data, attr, sizeof(help->data)); return 0; } static int nfnl_cthelper_to_nlattr(struct sk_buff *skb, const struct nf_conn *ct) { const struct nf_conn_help *help = nfct_help(ct); const struct nf_conntrack_helper *helper; helper = rcu_dereference(help->helper); if (helper && helper->data_len && nla_put(skb, CTA_HELP_INFO, helper->data_len, &help->data)) goto nla_put_failure; return 0; nla_put_failure: return -ENOSPC; } static const struct nla_policy nfnl_cthelper_expect_pol[NFCTH_POLICY_MAX+1] = { [NFCTH_POLICY_NAME] = { .type = NLA_NUL_STRING, .len = NF_CT_HELPER_NAME_LEN-1 }, [NFCTH_POLICY_EXPECT_MAX] = { .type = NLA_U32, }, [NFCTH_POLICY_EXPECT_TIMEOUT] = { .type = NLA_U32, }, }; static int nfnl_cthelper_expect_policy(struct nf_conntrack_expect_policy *expect_policy, const struct nlattr *attr) { int err; struct nlattr 
*tb[NFCTH_POLICY_MAX+1]; err = nla_parse_nested_deprecated(tb, NFCTH_POLICY_MAX, attr, nfnl_cthelper_expect_pol, NULL); if (err < 0) return err; if (!tb[NFCTH_POLICY_NAME] || !tb[NFCTH_POLICY_EXPECT_MAX] || !tb[NFCTH_POLICY_EXPECT_TIMEOUT]) return -EINVAL; nla_strscpy(expect_policy->name, tb[NFCTH_POLICY_NAME], NF_CT_HELPER_NAME_LEN); expect_policy->max_expected = ntohl(nla_get_be32(tb[NFCTH_POLICY_EXPECT_MAX])); if (expect_policy->max_expected > NF_CT_EXPECT_MAX_CNT) return -EINVAL; expect_policy->timeout = ntohl(nla_get_be32(tb[NFCTH_POLICY_EXPECT_TIMEOUT])); return 0; } static const struct nla_policy nfnl_cthelper_expect_policy_set[NFCTH_POLICY_SET_MAX+1] = { [NFCTH_POLICY_SET_NUM] = { .type = NLA_U32, }, }; static int nfnl_cthelper_parse_expect_policy(struct nf_conntrack_helper *helper, const struct nlattr *attr) { int i, ret; struct nf_conntrack_expect_policy *expect_policy; struct nlattr *tb[NFCTH_POLICY_SET_MAX+1]; unsigned int class_max; ret = nla_parse_nested_deprecated(tb, NFCTH_POLICY_SET_MAX, attr, nfnl_cthelper_expect_policy_set, NULL); if (ret < 0) return ret; if (!tb[NFCTH_POLICY_SET_NUM]) return -EINVAL; class_max = ntohl(nla_get_be32(tb[NFCTH_POLICY_SET_NUM])); if (class_max == 0) return -EINVAL; if (class_max > NF_CT_MAX_EXPECT_CLASSES) return -EOVERFLOW; expect_policy = kcalloc(class_max, sizeof(struct nf_conntrack_expect_policy), GFP_KERNEL); if (expect_policy == NULL) return -ENOMEM; for (i = 0; i < class_max; i++) { if (!tb[NFCTH_POLICY_SET+i]) goto err; ret = nfnl_cthelper_expect_policy(&expect_policy[i], tb[NFCTH_POLICY_SET+i]); if (ret < 0) goto err; } helper->expect_class_max = class_max - 1; helper->expect_policy = expect_policy; return 0; err: kfree(expect_policy); return -EINVAL; } static int nfnl_cthelper_create(const struct nlattr * const tb[], struct nf_conntrack_tuple *tuple) { struct nf_conntrack_helper *helper; struct nfnl_cthelper *nfcth; unsigned int size; int ret; if (!tb[NFCTH_TUPLE] || !tb[NFCTH_POLICY] || 
!tb[NFCTH_PRIV_DATA_LEN]) return -EINVAL; nfcth = kzalloc(sizeof(*nfcth), GFP_KERNEL); if (nfcth == NULL) return -ENOMEM; helper = &nfcth->helper; ret = nfnl_cthelper_parse_expect_policy(helper, tb[NFCTH_POLICY]); if (ret < 0) goto err1; nla_strscpy(helper->name, tb[NFCTH_NAME], NF_CT_HELPER_NAME_LEN); size = ntohl(nla_get_be32(tb[NFCTH_PRIV_DATA_LEN])); if (size > sizeof_field(struct nf_conn_help, data)) { ret = -ENOMEM; goto err2; } helper->data_len = size; helper->flags |= NF_CT_HELPER_F_USERSPACE; memcpy(&helper->tuple, tuple, sizeof(struct nf_conntrack_tuple)); helper->me = THIS_MODULE; helper->help = nfnl_userspace_cthelper; helper->from_nlattr = nfnl_cthelper_from_nlattr; helper->to_nlattr = nfnl_cthelper_to_nlattr; /* Default to queue number zero, this can be updated at any time. */ if (tb[NFCTH_QUEUE_NUM]) helper->queue_num = ntohl(nla_get_be32(tb[NFCTH_QUEUE_NUM])); if (tb[NFCTH_STATUS]) { int status = ntohl(nla_get_be32(tb[NFCTH_STATUS])); switch(status) { case NFCT_HELPER_STATUS_ENABLED: helper->flags |= NF_CT_HELPER_F_CONFIGURED; break; case NFCT_HELPER_STATUS_DISABLED: helper->flags &= ~NF_CT_HELPER_F_CONFIGURED; break; } } ret = nf_conntrack_helper_register(helper); if (ret < 0) goto err2; list_add_tail(&nfcth->list, &nfnl_cthelper_list); return 0; err2: kfree(helper->expect_policy); err1: kfree(nfcth); return ret; } static int nfnl_cthelper_update_policy_one(const struct nf_conntrack_expect_policy *policy, struct nf_conntrack_expect_policy *new_policy, const struct nlattr *attr) { struct nlattr *tb[NFCTH_POLICY_MAX + 1]; int err; err = nla_parse_nested_deprecated(tb, NFCTH_POLICY_MAX, attr, nfnl_cthelper_expect_pol, NULL); if (err < 0) return err; if (!tb[NFCTH_POLICY_NAME] || !tb[NFCTH_POLICY_EXPECT_MAX] || !tb[NFCTH_POLICY_EXPECT_TIMEOUT]) return -EINVAL; if (nla_strcmp(tb[NFCTH_POLICY_NAME], policy->name)) return -EBUSY; new_policy->max_expected = ntohl(nla_get_be32(tb[NFCTH_POLICY_EXPECT_MAX])); if (new_policy->max_expected > 
NF_CT_EXPECT_MAX_CNT) return -EINVAL; new_policy->timeout = ntohl(nla_get_be32(tb[NFCTH_POLICY_EXPECT_TIMEOUT])); return 0; } static int nfnl_cthelper_update_policy_all(struct nlattr *tb[], struct nf_conntrack_helper *helper) { struct nf_conntrack_expect_policy *new_policy; struct nf_conntrack_expect_policy *policy; int i, ret = 0; new_policy = kmalloc_array(helper->expect_class_max + 1, sizeof(*new_policy), GFP_KERNEL); if (!new_policy) return -ENOMEM; /* Check first that all policy attributes are well-formed, so we don't * leave things in inconsistent state on errors. */ for (i = 0; i < helper->expect_class_max + 1; i++) { if (!tb[NFCTH_POLICY_SET + i]) { ret = -EINVAL; goto err; } ret = nfnl_cthelper_update_policy_one(&helper->expect_policy[i], &new_policy[i], tb[NFCTH_POLICY_SET + i]); if (ret < 0) goto err; } /* Now we can safely update them. */ for (i = 0; i < helper->expect_class_max + 1; i++) { policy = (struct nf_conntrack_expect_policy *) &helper->expect_policy[i]; policy->max_expected = new_policy->max_expected; policy->timeout = new_policy->timeout; } err: kfree(new_policy); return ret; } static int nfnl_cthelper_update_policy(struct nf_conntrack_helper *helper, const struct nlattr *attr) { struct nlattr *tb[NFCTH_POLICY_SET_MAX + 1]; unsigned int class_max; int err; err = nla_parse_nested_deprecated(tb, NFCTH_POLICY_SET_MAX, attr, nfnl_cthelper_expect_policy_set, NULL); if (err < 0) return err; if (!tb[NFCTH_POLICY_SET_NUM]) return -EINVAL; class_max = ntohl(nla_get_be32(tb[NFCTH_POLICY_SET_NUM])); if (helper->expect_class_max + 1 != class_max) return -EBUSY; return nfnl_cthelper_update_policy_all(tb, helper); } static int nfnl_cthelper_update(const struct nlattr * const tb[], struct nf_conntrack_helper *helper) { u32 size; int ret; if (tb[NFCTH_PRIV_DATA_LEN]) { size = ntohl(nla_get_be32(tb[NFCTH_PRIV_DATA_LEN])); if (size != helper->data_len) return -EBUSY; } if (tb[NFCTH_POLICY]) { ret = nfnl_cthelper_update_policy(helper, tb[NFCTH_POLICY]); if (ret 
< 0) return ret; } if (tb[NFCTH_QUEUE_NUM]) helper->queue_num = ntohl(nla_get_be32(tb[NFCTH_QUEUE_NUM])); if (tb[NFCTH_STATUS]) { int status = ntohl(nla_get_be32(tb[NFCTH_STATUS])); switch(status) { case NFCT_HELPER_STATUS_ENABLED: helper->flags |= NF_CT_HELPER_F_CONFIGURED; break; case NFCT_HELPER_STATUS_DISABLED: helper->flags &= ~NF_CT_HELPER_F_CONFIGURED; break; } } return 0; } static int nfnl_cthelper_new(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const tb[]) { const char *helper_name; struct nf_conntrack_helper *cur, *helper = NULL; struct nf_conntrack_tuple tuple; struct nfnl_cthelper *nlcth; int ret = 0; if (!capable(CAP_NET_ADMIN)) return -EPERM; if (!tb[NFCTH_NAME] || !tb[NFCTH_TUPLE]) return -EINVAL; helper_name = nla_data(tb[NFCTH_NAME]); ret = nfnl_cthelper_parse_tuple(&tuple, tb[NFCTH_TUPLE]); if (ret < 0) return ret; list_for_each_entry(nlcth, &nfnl_cthelper_list, list) { cur = &nlcth->helper; if (strncmp(cur->name, helper_name, NF_CT_HELPER_NAME_LEN)) continue; if ((tuple.src.l3num != cur->tuple.src.l3num || tuple.dst.protonum != cur->tuple.dst.protonum)) continue; if (info->nlh->nlmsg_flags & NLM_F_EXCL) return -EEXIST; helper = cur; break; } if (helper == NULL) ret = nfnl_cthelper_create(tb, &tuple); else ret = nfnl_cthelper_update(tb, helper); return ret; } static int nfnl_cthelper_dump_tuple(struct sk_buff *skb, struct nf_conntrack_helper *helper) { struct nlattr *nest_parms; nest_parms = nla_nest_start(skb, NFCTH_TUPLE); if (nest_parms == NULL) goto nla_put_failure; if (nla_put_be16(skb, NFCTH_TUPLE_L3PROTONUM, htons(helper->tuple.src.l3num))) goto nla_put_failure; if (nla_put_u8(skb, NFCTH_TUPLE_L4PROTONUM, helper->tuple.dst.protonum)) goto nla_put_failure; nla_nest_end(skb, nest_parms); return 0; nla_put_failure: return -1; } static int nfnl_cthelper_dump_policy(struct sk_buff *skb, struct nf_conntrack_helper *helper) { int i; struct nlattr *nest_parms1, *nest_parms2; nest_parms1 = nla_nest_start(skb, 
NFCTH_POLICY); if (nest_parms1 == NULL) goto nla_put_failure; if (nla_put_be32(skb, NFCTH_POLICY_SET_NUM, htonl(helper->expect_class_max + 1))) goto nla_put_failure; for (i = 0; i < helper->expect_class_max + 1; i++) { nest_parms2 = nla_nest_start(skb, (NFCTH_POLICY_SET + i)); if (nest_parms2 == NULL) goto nla_put_failure; if (nla_put_string(skb, NFCTH_POLICY_NAME, helper->expect_policy[i].name)) goto nla_put_failure; if (nla_put_be32(skb, NFCTH_POLICY_EXPECT_MAX, htonl(helper->expect_policy[i].max_expected))) goto nla_put_failure; if (nla_put_be32(skb, NFCTH_POLICY_EXPECT_TIMEOUT, htonl(helper->expect_policy[i].timeout))) goto nla_put_failure; nla_nest_end(skb, nest_parms2); } nla_nest_end(skb, nest_parms1); return 0; nla_put_failure: return -1; } static int nfnl_cthelper_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, int event, struct nf_conntrack_helper *helper) { struct nlmsghdr *nlh; unsigned int flags = portid ? NLM_F_MULTI : 0; int status; event = nfnl_msg_type(NFNL_SUBSYS_CTHELPER, event); nlh = nfnl_msg_put(skb, portid, seq, event, flags, AF_UNSPEC, NFNETLINK_V0, 0); if (!nlh) goto nlmsg_failure; if (nla_put_string(skb, NFCTH_NAME, helper->name)) goto nla_put_failure; if (nla_put_be32(skb, NFCTH_QUEUE_NUM, htonl(helper->queue_num))) goto nla_put_failure; if (nfnl_cthelper_dump_tuple(skb, helper) < 0) goto nla_put_failure; if (nfnl_cthelper_dump_policy(skb, helper) < 0) goto nla_put_failure; if (nla_put_be32(skb, NFCTH_PRIV_DATA_LEN, htonl(helper->data_len))) goto nla_put_failure; if (helper->flags & NF_CT_HELPER_F_CONFIGURED) status = NFCT_HELPER_STATUS_ENABLED; else status = NFCT_HELPER_STATUS_DISABLED; if (nla_put_be32(skb, NFCTH_STATUS, htonl(status))) goto nla_put_failure; nlmsg_end(skb, nlh); return skb->len; nlmsg_failure: nla_put_failure: nlmsg_cancel(skb, nlh); return -1; } static int nfnl_cthelper_dump_table(struct sk_buff *skb, struct netlink_callback *cb) { struct nf_conntrack_helper *cur, *last; rcu_read_lock(); last = (struct 
nf_conntrack_helper *)cb->args[1]; for (; cb->args[0] < nf_ct_helper_hsize; cb->args[0]++) { restart: hlist_for_each_entry_rcu(cur, &nf_ct_helper_hash[cb->args[0]], hnode) { /* skip non-userspace conntrack helpers. */ if (!(cur->flags & NF_CT_HELPER_F_USERSPACE)) continue; if (cb->args[1]) { if (cur != last) continue; cb->args[1] = 0; } if (nfnl_cthelper_fill_info(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NFNL_MSG_TYPE(cb->nlh->nlmsg_type), NFNL_MSG_CTHELPER_NEW, cur) < 0) { cb->args[1] = (unsigned long)cur; goto out; } } } if (cb->args[1]) { cb->args[1] = 0; goto restart; } out: rcu_read_unlock(); return skb->len; } static int nfnl_cthelper_get(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const tb[]) { int ret = -ENOENT; struct nf_conntrack_helper *cur; struct sk_buff *skb2; char *helper_name = NULL; struct nf_conntrack_tuple tuple; struct nfnl_cthelper *nlcth; bool tuple_set = false; if (!capable(CAP_NET_ADMIN)) return -EPERM; if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = nfnl_cthelper_dump_table, }; return netlink_dump_start(info->sk, skb, info->nlh, &c); } if (tb[NFCTH_NAME]) helper_name = nla_data(tb[NFCTH_NAME]); if (tb[NFCTH_TUPLE]) { ret = nfnl_cthelper_parse_tuple(&tuple, tb[NFCTH_TUPLE]); if (ret < 0) return ret; tuple_set = true; } list_for_each_entry(nlcth, &nfnl_cthelper_list, list) { cur = &nlcth->helper; if (helper_name && strncmp(cur->name, helper_name, NF_CT_HELPER_NAME_LEN)) continue; if (tuple_set && (tuple.src.l3num != cur->tuple.src.l3num || tuple.dst.protonum != cur->tuple.dst.protonum)) continue; skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (skb2 == NULL) { ret = -ENOMEM; break; } ret = nfnl_cthelper_fill_info(skb2, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, NFNL_MSG_TYPE(info->nlh->nlmsg_type), NFNL_MSG_CTHELPER_NEW, cur); if (ret <= 0) { kfree_skb(skb2); break; } ret = nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); break; } return ret; } 
static int nfnl_cthelper_del(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const tb[]) { char *helper_name = NULL; struct nf_conntrack_helper *cur; struct nf_conntrack_tuple tuple; bool tuple_set = false, found = false; struct nfnl_cthelper *nlcth, *n; int j = 0, ret; if (!capable(CAP_NET_ADMIN)) return -EPERM; if (tb[NFCTH_NAME]) helper_name = nla_data(tb[NFCTH_NAME]); if (tb[NFCTH_TUPLE]) { ret = nfnl_cthelper_parse_tuple(&tuple, tb[NFCTH_TUPLE]); if (ret < 0) return ret; tuple_set = true; } ret = -ENOENT; list_for_each_entry_safe(nlcth, n, &nfnl_cthelper_list, list) { cur = &nlcth->helper; j++; if (helper_name && strncmp(cur->name, helper_name, NF_CT_HELPER_NAME_LEN)) continue; if (tuple_set && (tuple.src.l3num != cur->tuple.src.l3num || tuple.dst.protonum != cur->tuple.dst.protonum)) continue; if (refcount_dec_if_one(&cur->refcnt)) { found = true; nf_conntrack_helper_unregister(cur); kfree(cur->expect_policy); list_del(&nlcth->list); kfree(nlcth); } else { ret = -EBUSY; } } /* Make sure we return success if we flush and there is no helpers */ return (found || j == 0) ? 
0 : ret; } static const struct nla_policy nfnl_cthelper_policy[NFCTH_MAX+1] = { [NFCTH_NAME] = { .type = NLA_NUL_STRING, .len = NF_CT_HELPER_NAME_LEN-1 }, [NFCTH_QUEUE_NUM] = { .type = NLA_U32, }, [NFCTH_PRIV_DATA_LEN] = { .type = NLA_U32, }, [NFCTH_STATUS] = { .type = NLA_U32, }, }; static const struct nfnl_callback nfnl_cthelper_cb[NFNL_MSG_CTHELPER_MAX] = { [NFNL_MSG_CTHELPER_NEW] = { .call = nfnl_cthelper_new, .type = NFNL_CB_MUTEX, .attr_count = NFCTH_MAX, .policy = nfnl_cthelper_policy }, [NFNL_MSG_CTHELPER_GET] = { .call = nfnl_cthelper_get, .type = NFNL_CB_MUTEX, .attr_count = NFCTH_MAX, .policy = nfnl_cthelper_policy }, [NFNL_MSG_CTHELPER_DEL] = { .call = nfnl_cthelper_del, .type = NFNL_CB_MUTEX, .attr_count = NFCTH_MAX, .policy = nfnl_cthelper_policy }, }; static const struct nfnetlink_subsystem nfnl_cthelper_subsys = { .name = "cthelper", .subsys_id = NFNL_SUBSYS_CTHELPER, .cb_count = NFNL_MSG_CTHELPER_MAX, .cb = nfnl_cthelper_cb, }; MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTHELPER); static int __init nfnl_cthelper_init(void) { int ret; ret = nfnetlink_subsys_register(&nfnl_cthelper_subsys); if (ret < 0) { pr_err("nfnl_cthelper: cannot register with nfnetlink.\n"); goto err_out; } return 0; err_out: return ret; } static void __exit nfnl_cthelper_exit(void) { struct nf_conntrack_helper *cur; struct nfnl_cthelper *nlcth, *n; nfnetlink_subsys_unregister(&nfnl_cthelper_subsys); list_for_each_entry_safe(nlcth, n, &nfnl_cthelper_list, list) { cur = &nlcth->helper; nf_conntrack_helper_unregister(cur); kfree(cur->expect_policy); kfree(nlcth); } } module_init(nfnl_cthelper_init); module_exit(nfnl_cthelper_exit); |
24 2 24 22 5 24 24 24 5 20 23 5 20 5 20 5 20 5 20 5 20 5 19 5 20 5 20 5 20 5 20 5 20 5 20 5 20 5 20 5 19 5 20 5 20 5 20 5 20 5 20 5 23 23 1 22 21 22 21 20 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 20 2 21 20 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 
465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 | // SPDX-License-Identifier: GPL-2.0-only #include <linux/dim.h> #include "netlink.h" #include "common.h" struct coalesce_req_info { struct ethnl_req_info base; }; struct coalesce_reply_data { struct ethnl_reply_data base; struct ethtool_coalesce coalesce; struct kernel_ethtool_coalesce kernel_coalesce; u32 supported_params; }; #define COALESCE_REPDATA(__reply_base) \ container_of(__reply_base, struct coalesce_reply_data, base) #define __SUPPORTED_OFFSET ETHTOOL_A_COALESCE_RX_USECS static u32 attr_to_mask(unsigned int attr_type) { return BIT(attr_type - __SUPPORTED_OFFSET); } /* build time check that indices in ethtool_ops::supported_coalesce_params * match corresponding attribute types with an offset */ #define __CHECK_SUPPORTED_OFFSET(x) \ static_assert((ETHTOOL_ ## x) == \ BIT((ETHTOOL_A_ ## x) - __SUPPORTED_OFFSET)) __CHECK_SUPPORTED_OFFSET(COALESCE_RX_USECS); __CHECK_SUPPORTED_OFFSET(COALESCE_RX_MAX_FRAMES); __CHECK_SUPPORTED_OFFSET(COALESCE_RX_USECS_IRQ); __CHECK_SUPPORTED_OFFSET(COALESCE_RX_MAX_FRAMES_IRQ); __CHECK_SUPPORTED_OFFSET(COALESCE_TX_USECS); __CHECK_SUPPORTED_OFFSET(COALESCE_TX_MAX_FRAMES); __CHECK_SUPPORTED_OFFSET(COALESCE_TX_USECS_IRQ); __CHECK_SUPPORTED_OFFSET(COALESCE_TX_MAX_FRAMES_IRQ); 
__CHECK_SUPPORTED_OFFSET(COALESCE_STATS_BLOCK_USECS); __CHECK_SUPPORTED_OFFSET(COALESCE_USE_ADAPTIVE_RX); __CHECK_SUPPORTED_OFFSET(COALESCE_USE_ADAPTIVE_TX); __CHECK_SUPPORTED_OFFSET(COALESCE_PKT_RATE_LOW); __CHECK_SUPPORTED_OFFSET(COALESCE_RX_USECS_LOW); __CHECK_SUPPORTED_OFFSET(COALESCE_RX_MAX_FRAMES_LOW); __CHECK_SUPPORTED_OFFSET(COALESCE_TX_USECS_LOW); __CHECK_SUPPORTED_OFFSET(COALESCE_TX_MAX_FRAMES_LOW); __CHECK_SUPPORTED_OFFSET(COALESCE_PKT_RATE_HIGH); __CHECK_SUPPORTED_OFFSET(COALESCE_RX_USECS_HIGH); __CHECK_SUPPORTED_OFFSET(COALESCE_RX_MAX_FRAMES_HIGH); __CHECK_SUPPORTED_OFFSET(COALESCE_TX_USECS_HIGH); __CHECK_SUPPORTED_OFFSET(COALESCE_TX_MAX_FRAMES_HIGH); __CHECK_SUPPORTED_OFFSET(COALESCE_RATE_SAMPLE_INTERVAL); const struct nla_policy ethnl_coalesce_get_policy[] = { [ETHTOOL_A_COALESCE_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), }; static int coalesce_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, const struct genl_info *info) { struct coalesce_reply_data *data = COALESCE_REPDATA(reply_base); struct net_device *dev = reply_base->dev; int ret; if (!dev->ethtool_ops->get_coalesce) return -EOPNOTSUPP; data->supported_params = dev->ethtool_ops->supported_coalesce_params; ret = ethnl_ops_begin(dev); if (ret < 0) return ret; ret = dev->ethtool_ops->get_coalesce(dev, &data->coalesce, &data->kernel_coalesce, info->extack); ethnl_ops_complete(dev); return ret; } static int coalesce_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { int modersz = nla_total_size(0) + /* _PROFILE_IRQ_MODERATION, nest */ nla_total_size(sizeof(u32)) + /* _IRQ_MODERATION_USEC */ nla_total_size(sizeof(u32)) + /* _IRQ_MODERATION_PKTS */ nla_total_size(sizeof(u32)); /* _IRQ_MODERATION_COMPS */ int total_modersz = nla_total_size(0) + /* _{R,T}X_PROFILE, nest */ modersz * NET_DIM_PARAMS_NUM_PROFILES; return nla_total_size(sizeof(u32)) + /* _RX_USECS */ nla_total_size(sizeof(u32)) + /* _RX_MAX_FRAMES */ 
nla_total_size(sizeof(u32)) + /* _RX_USECS_IRQ */ nla_total_size(sizeof(u32)) + /* _RX_MAX_FRAMES_IRQ */ nla_total_size(sizeof(u32)) + /* _TX_USECS */ nla_total_size(sizeof(u32)) + /* _TX_MAX_FRAMES */ nla_total_size(sizeof(u32)) + /* _TX_USECS_IRQ */ nla_total_size(sizeof(u32)) + /* _TX_MAX_FRAMES_IRQ */ nla_total_size(sizeof(u32)) + /* _STATS_BLOCK_USECS */ nla_total_size(sizeof(u8)) + /* _USE_ADAPTIVE_RX */ nla_total_size(sizeof(u8)) + /* _USE_ADAPTIVE_TX */ nla_total_size(sizeof(u32)) + /* _PKT_RATE_LOW */ nla_total_size(sizeof(u32)) + /* _RX_USECS_LOW */ nla_total_size(sizeof(u32)) + /* _RX_MAX_FRAMES_LOW */ nla_total_size(sizeof(u32)) + /* _TX_USECS_LOW */ nla_total_size(sizeof(u32)) + /* _TX_MAX_FRAMES_LOW */ nla_total_size(sizeof(u32)) + /* _PKT_RATE_HIGH */ nla_total_size(sizeof(u32)) + /* _RX_USECS_HIGH */ nla_total_size(sizeof(u32)) + /* _RX_MAX_FRAMES_HIGH */ nla_total_size(sizeof(u32)) + /* _TX_USECS_HIGH */ nla_total_size(sizeof(u32)) + /* _TX_MAX_FRAMES_HIGH */ nla_total_size(sizeof(u32)) + /* _RATE_SAMPLE_INTERVAL */ nla_total_size(sizeof(u8)) + /* _USE_CQE_MODE_TX */ nla_total_size(sizeof(u8)) + /* _USE_CQE_MODE_RX */ nla_total_size(sizeof(u32)) + /* _TX_AGGR_MAX_BYTES */ nla_total_size(sizeof(u32)) + /* _TX_AGGR_MAX_FRAMES */ nla_total_size(sizeof(u32)) + /* _TX_AGGR_TIME_USECS */ total_modersz * 2; /* _{R,T}X_PROFILE */ } static bool coalesce_put_u32(struct sk_buff *skb, u16 attr_type, u32 val, u32 supported_params) { if (!val && !(supported_params & attr_to_mask(attr_type))) return false; return nla_put_u32(skb, attr_type, val); } static bool coalesce_put_bool(struct sk_buff *skb, u16 attr_type, u32 val, u32 supported_params) { if (!val && !(supported_params & attr_to_mask(attr_type))) return false; return nla_put_u8(skb, attr_type, !!val); } /** * coalesce_put_profile - fill reply with a nla nest with four child nla nests. 
* @skb: socket buffer the message is stored in * @attr_type: nest attr type ETHTOOL_A_COALESCE_*X_PROFILE * @profile: data passed to userspace * @coal_flags: modifiable parameters supported by the driver * * Put a dim profile nest attribute. Refer to ETHTOOL_A_PROFILE_IRQ_MODERATION. * * Return: 0 on success or a negative error code. */ static int coalesce_put_profile(struct sk_buff *skb, u16 attr_type, const struct dim_cq_moder *profile, u8 coal_flags) { struct nlattr *profile_attr, *moder_attr; int i, ret; if (!profile || !coal_flags) return 0; profile_attr = nla_nest_start(skb, attr_type); if (!profile_attr) return -EMSGSIZE; for (i = 0; i < NET_DIM_PARAMS_NUM_PROFILES; i++) { moder_attr = nla_nest_start(skb, ETHTOOL_A_PROFILE_IRQ_MODERATION); if (!moder_attr) { ret = -EMSGSIZE; goto cancel_profile; } if (coal_flags & DIM_COALESCE_USEC) { ret = nla_put_u32(skb, ETHTOOL_A_IRQ_MODERATION_USEC, profile[i].usec); if (ret) goto cancel_moder; } if (coal_flags & DIM_COALESCE_PKTS) { ret = nla_put_u32(skb, ETHTOOL_A_IRQ_MODERATION_PKTS, profile[i].pkts); if (ret) goto cancel_moder; } if (coal_flags & DIM_COALESCE_COMPS) { ret = nla_put_u32(skb, ETHTOOL_A_IRQ_MODERATION_COMPS, profile[i].comps); if (ret) goto cancel_moder; } nla_nest_end(skb, moder_attr); } nla_nest_end(skb, profile_attr); return 0; cancel_moder: nla_nest_cancel(skb, moder_attr); cancel_profile: nla_nest_cancel(skb, profile_attr); return ret; } static int coalesce_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct coalesce_reply_data *data = COALESCE_REPDATA(reply_base); const struct kernel_ethtool_coalesce *kcoal = &data->kernel_coalesce; const struct ethtool_coalesce *coal = &data->coalesce; u32 supported = data->supported_params; struct dim_irq_moder *moder; int ret = 0; if (coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RX_USECS, coal->rx_coalesce_usecs, supported) || coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RX_MAX_FRAMES, 
coal->rx_max_coalesced_frames, supported) || coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RX_USECS_IRQ, coal->rx_coalesce_usecs_irq, supported) || coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RX_MAX_FRAMES_IRQ, coal->rx_max_coalesced_frames_irq, supported) || coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_USECS, coal->tx_coalesce_usecs, supported) || coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_MAX_FRAMES, coal->tx_max_coalesced_frames, supported) || coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_USECS_IRQ, coal->tx_coalesce_usecs_irq, supported) || coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_MAX_FRAMES_IRQ, coal->tx_max_coalesced_frames_irq, supported) || coalesce_put_u32(skb, ETHTOOL_A_COALESCE_STATS_BLOCK_USECS, coal->stats_block_coalesce_usecs, supported) || coalesce_put_bool(skb, ETHTOOL_A_COALESCE_USE_ADAPTIVE_RX, coal->use_adaptive_rx_coalesce, supported) || coalesce_put_bool(skb, ETHTOOL_A_COALESCE_USE_ADAPTIVE_TX, coal->use_adaptive_tx_coalesce, supported) || coalesce_put_u32(skb, ETHTOOL_A_COALESCE_PKT_RATE_LOW, coal->pkt_rate_low, supported) || coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RX_USECS_LOW, coal->rx_coalesce_usecs_low, supported) || coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RX_MAX_FRAMES_LOW, coal->rx_max_coalesced_frames_low, supported) || coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_USECS_LOW, coal->tx_coalesce_usecs_low, supported) || coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_MAX_FRAMES_LOW, coal->tx_max_coalesced_frames_low, supported) || coalesce_put_u32(skb, ETHTOOL_A_COALESCE_PKT_RATE_HIGH, coal->pkt_rate_high, supported) || coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RX_USECS_HIGH, coal->rx_coalesce_usecs_high, supported) || coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RX_MAX_FRAMES_HIGH, coal->rx_max_coalesced_frames_high, supported) || coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_USECS_HIGH, coal->tx_coalesce_usecs_high, supported) || coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_MAX_FRAMES_HIGH, coal->tx_max_coalesced_frames_high, supported) || 
coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RATE_SAMPLE_INTERVAL, coal->rate_sample_interval, supported) || coalesce_put_bool(skb, ETHTOOL_A_COALESCE_USE_CQE_MODE_TX, kcoal->use_cqe_mode_tx, supported) || coalesce_put_bool(skb, ETHTOOL_A_COALESCE_USE_CQE_MODE_RX, kcoal->use_cqe_mode_rx, supported) || coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_AGGR_MAX_BYTES, kcoal->tx_aggr_max_bytes, supported) || coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_AGGR_MAX_FRAMES, kcoal->tx_aggr_max_frames, supported) || coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_AGGR_TIME_USECS, kcoal->tx_aggr_time_usecs, supported)) return -EMSGSIZE; if (!req_base->dev || !req_base->dev->irq_moder) return 0; moder = req_base->dev->irq_moder; rcu_read_lock(); if (moder->profile_flags & DIM_PROFILE_RX) { ret = coalesce_put_profile(skb, ETHTOOL_A_COALESCE_RX_PROFILE, rcu_dereference(moder->rx_profile), moder->coal_flags); if (ret) goto out; } if (moder->profile_flags & DIM_PROFILE_TX) ret = coalesce_put_profile(skb, ETHTOOL_A_COALESCE_TX_PROFILE, rcu_dereference(moder->tx_profile), moder->coal_flags); out: rcu_read_unlock(); return ret; } /* COALESCE_SET */ static const struct nla_policy coalesce_irq_moderation_policy[] = { [ETHTOOL_A_IRQ_MODERATION_USEC] = { .type = NLA_U32 }, [ETHTOOL_A_IRQ_MODERATION_PKTS] = { .type = NLA_U32 }, [ETHTOOL_A_IRQ_MODERATION_COMPS] = { .type = NLA_U32 }, }; static const struct nla_policy coalesce_profile_policy[] = { [ETHTOOL_A_PROFILE_IRQ_MODERATION] = NLA_POLICY_NESTED(coalesce_irq_moderation_policy), }; const struct nla_policy ethnl_coalesce_set_policy[] = { [ETHTOOL_A_COALESCE_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), [ETHTOOL_A_COALESCE_RX_USECS] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_RX_MAX_FRAMES] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_RX_USECS_IRQ] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_RX_MAX_FRAMES_IRQ] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_TX_USECS] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_TX_MAX_FRAMES] = { .type = NLA_U32 }, 
[ETHTOOL_A_COALESCE_TX_USECS_IRQ] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_TX_MAX_FRAMES_IRQ] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_STATS_BLOCK_USECS] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_USE_ADAPTIVE_RX] = { .type = NLA_U8 }, [ETHTOOL_A_COALESCE_USE_ADAPTIVE_TX] = { .type = NLA_U8 }, [ETHTOOL_A_COALESCE_PKT_RATE_LOW] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_RX_USECS_LOW] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_RX_MAX_FRAMES_LOW] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_TX_USECS_LOW] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_TX_MAX_FRAMES_LOW] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_PKT_RATE_HIGH] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_RX_USECS_HIGH] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_RX_MAX_FRAMES_HIGH] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_TX_USECS_HIGH] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_TX_MAX_FRAMES_HIGH] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_RATE_SAMPLE_INTERVAL] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_USE_CQE_MODE_TX] = NLA_POLICY_MAX(NLA_U8, 1), [ETHTOOL_A_COALESCE_USE_CQE_MODE_RX] = NLA_POLICY_MAX(NLA_U8, 1), [ETHTOOL_A_COALESCE_TX_AGGR_MAX_BYTES] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_TX_AGGR_MAX_FRAMES] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_TX_AGGR_TIME_USECS] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_RX_PROFILE] = NLA_POLICY_NESTED(coalesce_profile_policy), [ETHTOOL_A_COALESCE_TX_PROFILE] = NLA_POLICY_NESTED(coalesce_profile_policy), }; static int ethnl_set_coalesce_validate(struct ethnl_req_info *req_info, struct genl_info *info) { const struct ethtool_ops *ops = req_info->dev->ethtool_ops; struct dim_irq_moder *irq_moder = req_info->dev->irq_moder; struct nlattr **tb = info->attrs; u32 supported_params; u16 a; if (!ops->get_coalesce || !ops->set_coalesce) return -EOPNOTSUPP; /* make sure that only supported parameters are present */ supported_params = ops->supported_coalesce_params; if (irq_moder && irq_moder->profile_flags & DIM_PROFILE_RX) supported_params |= 
ETHTOOL_COALESCE_RX_PROFILE; if (irq_moder && irq_moder->profile_flags & DIM_PROFILE_TX) supported_params |= ETHTOOL_COALESCE_TX_PROFILE; for (a = ETHTOOL_A_COALESCE_RX_USECS; a < __ETHTOOL_A_COALESCE_CNT; a++) if (tb[a] && !(supported_params & attr_to_mask(a))) { NL_SET_ERR_MSG_ATTR(info->extack, tb[a], "cannot modify an unsupported parameter"); return -EINVAL; } return 1; } /** * ethnl_update_irq_moder - update a specific field in the given profile * @irq_moder: place that collects dim related information * @irq_field: field in profile to modify * @attr_type: attr type ETHTOOL_A_IRQ_MODERATION_* * @tb: netlink attribute with new values or null * @coal_bit: DIM_COALESCE_* bit from coal_flags * @mod: pointer to bool for modification tracking * @extack: netlink extended ack * * Return: 0 on success or a negative error code. */ static int ethnl_update_irq_moder(struct dim_irq_moder *irq_moder, u16 *irq_field, u16 attr_type, struct nlattr **tb, u8 coal_bit, bool *mod, struct netlink_ext_ack *extack) { int ret = 0; u32 val; if (!tb[attr_type]) return 0; if (irq_moder->coal_flags & coal_bit) { val = nla_get_u32(tb[attr_type]); if (*irq_field == val) return 0; *irq_field = val; *mod = true; } else { NL_SET_BAD_ATTR(extack, tb[attr_type]); ret = -EOPNOTSUPP; } return ret; } /** * ethnl_update_profile - get a profile nest with child nests from userspace. * @dev: netdevice to update the profile * @dst: profile get from the driver and modified by ethnl_update_profile. * @nests: nest attr ETHTOOL_A_COALESCE_*X_PROFILE to set profile. * @mod: pointer to bool for modification tracking * @extack: Netlink extended ack * * Layout of nests: * Nested ETHTOOL_A_COALESCE_*X_PROFILE attr * Nested ETHTOOL_A_PROFILE_IRQ_MODERATION attr * ETHTOOL_A_IRQ_MODERATION_USEC attr * ETHTOOL_A_IRQ_MODERATION_PKTS attr * ETHTOOL_A_IRQ_MODERATION_COMPS attr * ... 
* Nested ETHTOOL_A_PROFILE_IRQ_MODERATION attr * ETHTOOL_A_IRQ_MODERATION_USEC attr * ETHTOOL_A_IRQ_MODERATION_PKTS attr * ETHTOOL_A_IRQ_MODERATION_COMPS attr * * Return: 0 on success or a negative error code. */ static int ethnl_update_profile(struct net_device *dev, struct dim_cq_moder __rcu **dst, const struct nlattr *nests, bool *mod, struct netlink_ext_ack *extack) { int len_irq_moder = ARRAY_SIZE(coalesce_irq_moderation_policy); struct nlattr *tb[ARRAY_SIZE(coalesce_irq_moderation_policy)]; struct dim_irq_moder *irq_moder = dev->irq_moder; struct dim_cq_moder *new_profile, *old_profile; int ret, rem, i = 0, len; struct nlattr *nest; if (!nests) return 0; if (!*dst) return -EOPNOTSUPP; old_profile = rtnl_dereference(*dst); len = NET_DIM_PARAMS_NUM_PROFILES * sizeof(*old_profile); new_profile = kmemdup(old_profile, len, GFP_KERNEL); if (!new_profile) return -ENOMEM; nla_for_each_nested_type(nest, ETHTOOL_A_PROFILE_IRQ_MODERATION, nests, rem) { ret = nla_parse_nested(tb, len_irq_moder - 1, nest, coalesce_irq_moderation_policy, extack); if (ret) goto err_out; ret = ethnl_update_irq_moder(irq_moder, &new_profile[i].usec, ETHTOOL_A_IRQ_MODERATION_USEC, tb, DIM_COALESCE_USEC, mod, extack); if (ret) goto err_out; ret = ethnl_update_irq_moder(irq_moder, &new_profile[i].pkts, ETHTOOL_A_IRQ_MODERATION_PKTS, tb, DIM_COALESCE_PKTS, mod, extack); if (ret) goto err_out; ret = ethnl_update_irq_moder(irq_moder, &new_profile[i].comps, ETHTOOL_A_IRQ_MODERATION_COMPS, tb, DIM_COALESCE_COMPS, mod, extack); if (ret) goto err_out; i++; } /* After the profile is modified, dim itself is a dynamic * mechanism and will quickly fit to the appropriate * coalescing parameters according to the new profile. 
*/ rcu_assign_pointer(*dst, new_profile); kfree_rcu(old_profile, rcu); return 0; err_out: kfree(new_profile); return ret; } static int __ethnl_set_coalesce(struct ethnl_req_info *req_info, struct genl_info *info, bool *dual_change) { struct kernel_ethtool_coalesce kernel_coalesce = {}; struct net_device *dev = req_info->dev; struct ethtool_coalesce coalesce = {}; bool mod_mode = false, mod = false; struct nlattr **tb = info->attrs; int ret; ret = dev->ethtool_ops->get_coalesce(dev, &coalesce, &kernel_coalesce, info->extack); if (ret < 0) return ret; /* Update values */ ethnl_update_u32(&coalesce.rx_coalesce_usecs, tb[ETHTOOL_A_COALESCE_RX_USECS], &mod); ethnl_update_u32(&coalesce.rx_max_coalesced_frames, tb[ETHTOOL_A_COALESCE_RX_MAX_FRAMES], &mod); ethnl_update_u32(&coalesce.rx_coalesce_usecs_irq, tb[ETHTOOL_A_COALESCE_RX_USECS_IRQ], &mod); ethnl_update_u32(&coalesce.rx_max_coalesced_frames_irq, tb[ETHTOOL_A_COALESCE_RX_MAX_FRAMES_IRQ], &mod); ethnl_update_u32(&coalesce.tx_coalesce_usecs, tb[ETHTOOL_A_COALESCE_TX_USECS], &mod); ethnl_update_u32(&coalesce.tx_max_coalesced_frames, tb[ETHTOOL_A_COALESCE_TX_MAX_FRAMES], &mod); ethnl_update_u32(&coalesce.tx_coalesce_usecs_irq, tb[ETHTOOL_A_COALESCE_TX_USECS_IRQ], &mod); ethnl_update_u32(&coalesce.tx_max_coalesced_frames_irq, tb[ETHTOOL_A_COALESCE_TX_MAX_FRAMES_IRQ], &mod); ethnl_update_u32(&coalesce.stats_block_coalesce_usecs, tb[ETHTOOL_A_COALESCE_STATS_BLOCK_USECS], &mod); ethnl_update_u32(&coalesce.pkt_rate_low, tb[ETHTOOL_A_COALESCE_PKT_RATE_LOW], &mod); ethnl_update_u32(&coalesce.rx_coalesce_usecs_low, tb[ETHTOOL_A_COALESCE_RX_USECS_LOW], &mod); ethnl_update_u32(&coalesce.rx_max_coalesced_frames_low, tb[ETHTOOL_A_COALESCE_RX_MAX_FRAMES_LOW], &mod); ethnl_update_u32(&coalesce.tx_coalesce_usecs_low, tb[ETHTOOL_A_COALESCE_TX_USECS_LOW], &mod); ethnl_update_u32(&coalesce.tx_max_coalesced_frames_low, tb[ETHTOOL_A_COALESCE_TX_MAX_FRAMES_LOW], &mod); ethnl_update_u32(&coalesce.pkt_rate_high, 
tb[ETHTOOL_A_COALESCE_PKT_RATE_HIGH], &mod); ethnl_update_u32(&coalesce.rx_coalesce_usecs_high, tb[ETHTOOL_A_COALESCE_RX_USECS_HIGH], &mod); ethnl_update_u32(&coalesce.rx_max_coalesced_frames_high, tb[ETHTOOL_A_COALESCE_RX_MAX_FRAMES_HIGH], &mod); ethnl_update_u32(&coalesce.tx_coalesce_usecs_high, tb[ETHTOOL_A_COALESCE_TX_USECS_HIGH], &mod); ethnl_update_u32(&coalesce.tx_max_coalesced_frames_high, tb[ETHTOOL_A_COALESCE_TX_MAX_FRAMES_HIGH], &mod); ethnl_update_u32(&coalesce.rate_sample_interval, tb[ETHTOOL_A_COALESCE_RATE_SAMPLE_INTERVAL], &mod); ethnl_update_u32(&kernel_coalesce.tx_aggr_max_bytes, tb[ETHTOOL_A_COALESCE_TX_AGGR_MAX_BYTES], &mod); ethnl_update_u32(&kernel_coalesce.tx_aggr_max_frames, tb[ETHTOOL_A_COALESCE_TX_AGGR_MAX_FRAMES], &mod); ethnl_update_u32(&kernel_coalesce.tx_aggr_time_usecs, tb[ETHTOOL_A_COALESCE_TX_AGGR_TIME_USECS], &mod); if (dev->irq_moder && dev->irq_moder->profile_flags & DIM_PROFILE_RX) { ret = ethnl_update_profile(dev, &dev->irq_moder->rx_profile, tb[ETHTOOL_A_COALESCE_RX_PROFILE], &mod, info->extack); if (ret < 0) return ret; } if (dev->irq_moder && dev->irq_moder->profile_flags & DIM_PROFILE_TX) { ret = ethnl_update_profile(dev, &dev->irq_moder->tx_profile, tb[ETHTOOL_A_COALESCE_TX_PROFILE], &mod, info->extack); if (ret < 0) return ret; } /* Update operation modes */ ethnl_update_bool32(&coalesce.use_adaptive_rx_coalesce, tb[ETHTOOL_A_COALESCE_USE_ADAPTIVE_RX], &mod_mode); ethnl_update_bool32(&coalesce.use_adaptive_tx_coalesce, tb[ETHTOOL_A_COALESCE_USE_ADAPTIVE_TX], &mod_mode); ethnl_update_u8(&kernel_coalesce.use_cqe_mode_tx, tb[ETHTOOL_A_COALESCE_USE_CQE_MODE_TX], &mod_mode); ethnl_update_u8(&kernel_coalesce.use_cqe_mode_rx, tb[ETHTOOL_A_COALESCE_USE_CQE_MODE_RX], &mod_mode); *dual_change = mod && mod_mode; if (!mod && !mod_mode) return 0; ret = dev->ethtool_ops->set_coalesce(dev, &coalesce, &kernel_coalesce, info->extack); return ret < 0 ? 
ret : 1; } static int ethnl_set_coalesce(struct ethnl_req_info *req_info, struct genl_info *info) { bool dual_change; int err, ret; /* SET_COALESCE may change operation mode and parameters in one call. * Changing operation mode may cause the driver to reset the parameter * values, and therefore ignore user input (driver does not know which * parameters come from user and which are echoed back from ->get). * To not complicate the drivers if user tries to change both the mode * and parameters at once - call the driver twice. */ err = __ethnl_set_coalesce(req_info, info, &dual_change); if (err < 0) return err; ret = err; if (ret && dual_change) { err = __ethnl_set_coalesce(req_info, info, &dual_change); if (err < 0) return err; } return ret; } const struct ethnl_request_ops ethnl_coalesce_request_ops = { .request_cmd = ETHTOOL_MSG_COALESCE_GET, .reply_cmd = ETHTOOL_MSG_COALESCE_GET_REPLY, .hdr_attr = ETHTOOL_A_COALESCE_HEADER, .req_info_size = sizeof(struct coalesce_req_info), .reply_data_size = sizeof(struct coalesce_reply_data), .prepare_data = coalesce_prepare_data, .reply_size = coalesce_reply_size, .fill_reply = coalesce_fill_reply, .set_validate = ethnl_set_coalesce_validate, .set = ethnl_set_coalesce, .set_ntf_cmd = ETHTOOL_MSG_COALESCE_NTF, }; |
7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 6 7 7 7 7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 | // SPDX-License-Identifier: GPL-2.0 #include <linux/smp.h> #include <linux/timex.h> #include <linux/string.h> #include <linux/seq_file.h> #include <linux/cpufreq.h> #include <asm/prctl.h> #include <linux/proc_fs.h> #include "cpu.h" #ifdef CONFIG_X86_VMX_FEATURE_NAMES extern const char * const x86_vmx_flags[NVMXINTS*32]; #endif /* * Get CPU information for use by the procfs. 
*/ static void show_cpuinfo_core(struct seq_file *m, struct cpuinfo_x86 *c, unsigned int cpu) { #ifdef CONFIG_SMP seq_printf(m, "physical id\t: %d\n", c->topo.pkg_id); seq_printf(m, "siblings\t: %d\n", cpumask_weight(topology_core_cpumask(cpu))); seq_printf(m, "core id\t\t: %d\n", c->topo.core_id); seq_printf(m, "cpu cores\t: %d\n", c->booted_cores); seq_printf(m, "apicid\t\t: %d\n", c->topo.apicid); seq_printf(m, "initial apicid\t: %d\n", c->topo.initial_apicid); #endif } #ifdef CONFIG_X86_32 static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c) { seq_printf(m, "fdiv_bug\t: %s\n" "f00f_bug\t: %s\n" "coma_bug\t: %s\n" "fpu\t\t: %s\n" "fpu_exception\t: %s\n" "cpuid level\t: %d\n" "wp\t\t: yes\n", str_yes_no(boot_cpu_has_bug(X86_BUG_FDIV)), str_yes_no(boot_cpu_has_bug(X86_BUG_F00F)), str_yes_no(boot_cpu_has_bug(X86_BUG_COMA)), str_yes_no(boot_cpu_has(X86_FEATURE_FPU)), str_yes_no(boot_cpu_has(X86_FEATURE_FPU)), c->cpuid_level); } #else static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c) { seq_printf(m, "fpu\t\t: yes\n" "fpu_exception\t: yes\n" "cpuid level\t: %d\n" "wp\t\t: yes\n", c->cpuid_level); } #endif static int show_cpuinfo(struct seq_file *m, void *v) { struct cpuinfo_x86 *c = v; unsigned int cpu; int i; cpu = c->cpu_index; seq_printf(m, "processor\t: %u\n" "vendor_id\t: %s\n" "cpu family\t: %d\n" "model\t\t: %u\n" "model name\t: %s\n", cpu, c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown", c->x86, c->x86_model, c->x86_model_id[0] ? 
c->x86_model_id : "unknown"); if (c->x86_stepping || c->cpuid_level >= 0) seq_printf(m, "stepping\t: %d\n", c->x86_stepping); else seq_puts(m, "stepping\t: unknown\n"); if (c->microcode) seq_printf(m, "microcode\t: 0x%x\n", c->microcode); if (cpu_has(c, X86_FEATURE_TSC)) { int freq = arch_freq_get_on_cpu(cpu); if (freq < 0) seq_puts(m, "cpu MHz\t\t: Unknown\n"); else seq_printf(m, "cpu MHz\t\t: %u.%03u\n", freq / 1000, (freq % 1000)); } /* Cache size */ if (c->x86_cache_size) seq_printf(m, "cache size\t: %u KB\n", c->x86_cache_size); show_cpuinfo_core(m, c, cpu); show_cpuinfo_misc(m, c); seq_puts(m, "flags\t\t:"); for (i = 0; i < 32*NCAPINTS; i++) if (cpu_has(c, i) && x86_cap_flags[i] != NULL) seq_printf(m, " %s", x86_cap_flags[i]); #ifdef CONFIG_X86_VMX_FEATURE_NAMES if (cpu_has(c, X86_FEATURE_VMX) && c->vmx_capability[0]) { seq_puts(m, "\nvmx flags\t:"); for (i = 0; i < 32*NVMXINTS; i++) { if (test_bit(i, (unsigned long *)c->vmx_capability) && x86_vmx_flags[i] != NULL) seq_printf(m, " %s", x86_vmx_flags[i]); } } #endif seq_puts(m, "\nbugs\t\t:"); for (i = 0; i < 32*NBUGINTS; i++) { unsigned int bug_bit = 32*NCAPINTS + i; if (cpu_has_bug(c, bug_bit) && x86_bug_flags[i]) seq_printf(m, " %s", x86_bug_flags[i]); } seq_printf(m, "\nbogomips\t: %lu.%02lu\n", c->loops_per_jiffy/(500000/HZ), (c->loops_per_jiffy/(5000/HZ)) % 100); #ifdef CONFIG_X86_64 if (c->x86_tlbsize > 0) seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize); #endif seq_printf(m, "clflush size\t: %u\n", c->x86_clflush_size); seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment); seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n", c->x86_phys_bits, c->x86_virt_bits); seq_puts(m, "power management:"); for (i = 0; i < 32; i++) { if (c->x86_power & (1 << i)) { if (i < ARRAY_SIZE(x86_power_flags) && x86_power_flags[i]) seq_printf(m, "%s%s", x86_power_flags[i][0] ? 
" " : "", x86_power_flags[i]); else seq_printf(m, " [%d]", i); } } seq_puts(m, "\n\n"); return 0; } static void *c_start(struct seq_file *m, loff_t *pos) { *pos = cpumask_next(*pos - 1, cpu_online_mask); if ((*pos) < nr_cpu_ids) return &cpu_data(*pos); return NULL; } static void *c_next(struct seq_file *m, void *v, loff_t *pos) { (*pos)++; return c_start(m, pos); } static void c_stop(struct seq_file *m, void *v) { } const struct seq_operations cpuinfo_op = { .start = c_start, .next = c_next, .stop = c_stop, .show = show_cpuinfo, }; #ifdef CONFIG_X86_USER_SHADOW_STACK static void dump_x86_features(struct seq_file *m, unsigned long features) { if (features & ARCH_SHSTK_SHSTK) seq_puts(m, "shstk "); if (features & ARCH_SHSTK_WRSS) seq_puts(m, "wrss "); } void arch_proc_pid_thread_features(struct seq_file *m, struct task_struct *task) { seq_puts(m, "x86_Thread_features:\t"); dump_x86_features(m, task->thread.features); seq_putc(m, '\n'); seq_puts(m, "x86_Thread_features_locked:\t"); dump_x86_features(m, task->thread.features_locked); seq_putc(m, '\n'); } #endif /* CONFIG_X86_USER_SHADOW_STACK */ |
19 12 32 2 31 31 19 13 5 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 | // SPDX-License-Identifier: GPL-2.0-only /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> */ #include <linux/module.h> #include <net/ip.h> #include <net/tcp.h> #include <net/route.h> #include <net/dst.h> #include <net/netfilter/ipv4/nf_reject.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter_bridge.h> static int nf_reject_iphdr_validate(struct sk_buff *skb) { struct iphdr *iph; u32 len; if (!pskb_may_pull(skb, sizeof(struct iphdr))) return 0; iph = ip_hdr(skb); if (iph->ihl < 5 || iph->version != 4) return 0; len = ntohs(iph->tot_len); if (skb->len < len) return 0; else if (len < (iph->ihl*4)) return 0; if (!pskb_may_pull(skb, iph->ihl*4)) 
return 0; return 1; }

/**
 * nf_reject_skb_v4_tcp_reset - build a TCP RST skb in reply to @oldskb
 * @net:    namespace whose sysctl_ip_default_ttl is used as the TTL
 * @oldskb: the offending packet
 * @dev:    device stored in the new skb's ->dev
 * @hook:   netfilter hook, passed on to nf_reject_ip_tcphdr_get()
 *
 * The reply is only built here, not transmitted.
 *
 * Return: the new skb, or NULL if the IP header fails validation,
 * nf_reject_ip_tcphdr_get() rejects the packet, or allocation fails.
 */
struct sk_buff *nf_reject_skb_v4_tcp_reset(struct net *net,
					   struct sk_buff *oldskb,
					   const struct net_device *dev,
					   int hook)
{
	const struct tcphdr *oth;
	struct sk_buff *nskb;
	struct iphdr *niph;
	struct tcphdr _oth;

	if (!nf_reject_iphdr_validate(oldskb))
		return NULL;

	oth = nf_reject_ip_tcphdr_get(oldskb, &_oth, hook);
	if (!oth)
		return NULL;

	nskb = alloc_skb(sizeof(struct iphdr) + sizeof(struct tcphdr) +
			 LL_MAX_HEADER, GFP_ATOMIC);
	if (!nskb)
		return NULL;

	nskb->dev = (struct net_device *)dev;

	skb_reserve(nskb, LL_MAX_HEADER);
	niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_TCP,
				   READ_ONCE(net->ipv4.sysctl_ip_default_ttl));
	nf_reject_ip_tcphdr_put(nskb, oldskb, oth);
	/* The helper leaves tot_len and the IP checksum for the caller to fill in. */
	niph->tot_len = htons(nskb->len);
	ip_send_check(niph);

	return nskb;
}
EXPORT_SYMBOL_GPL(nf_reject_skb_v4_tcp_reset);

/**
 * nf_reject_skb_v4_unreach - build an ICMP destination-unreachable skb
 * @net:    namespace whose sysctl_ip_default_ttl is used as the TTL
 * @oldskb: the offending packet
 * @dev:    device stored in the new skb's ->dev
 * @hook:   netfilter hook, passed on to nf_ip_checksum()
 * @code:   ICMP code placed in the reply
 *
 * The reply quotes up to 536 bytes of @oldskb starting at its network
 * header.  It is only built here, not transmitted.
 *
 * Return: the new skb, or NULL if @oldskb fails validation, is a non-first
 * fragment, cannot be pulled/trimmed, fails checksum verification, or
 * allocation fails.
 */
struct sk_buff *nf_reject_skb_v4_unreach(struct net *net,
					 struct sk_buff *oldskb,
					 const struct net_device *dev,
					 int hook, u8 code)
{
	struct sk_buff *nskb;
	struct iphdr *niph;
	struct icmphdr *icmph;
	unsigned int len;
	int dataoff;
	__wsum csum;
	u8 proto;

	if (!nf_reject_iphdr_validate(oldskb))
		return NULL;

	/* IP header checks: fragment. */
	if (ip_hdr(oldskb)->frag_off & htons(IP_OFFSET))
		return NULL;

	/* RFC says return as much as we can without exceeding 576 bytes. */
	len = min_t(unsigned int, 536, oldskb->len);

	if (!pskb_may_pull(oldskb, len))
		return NULL;

	/*
	 * NOTE(review): len was derived from oldskb->len before this trim to
	 * tot_len; if the skb was longer than tot_len, the copy below may
	 * quote bytes past the IP datagram -- confirm whether that is intended.
	 */
	if (pskb_trim_rcsum(oldskb, ntohs(ip_hdr(oldskb)->tot_len)))
		return NULL;

	dataoff = ip_hdrlen(oldskb);
	proto = ip_hdr(oldskb)->protocol;

	/* Reject only when a checksum check is both required and fails. */
	if (!skb_csum_unnecessary(oldskb) &&
	    nf_reject_verify_csum(oldskb, dataoff, proto) &&
	    nf_ip_checksum(oldskb, hook, ip_hdrlen(oldskb), proto))
		return NULL;

	nskb = alloc_skb(sizeof(struct iphdr) + sizeof(struct icmphdr) +
			 LL_MAX_HEADER + len, GFP_ATOMIC);
	if (!nskb)
		return NULL;

	nskb->dev = (struct net_device *)dev;

	skb_reserve(nskb, LL_MAX_HEADER);
	niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_ICMP,
				   READ_ONCE(net->ipv4.sysctl_ip_default_ttl));

	skb_reset_transport_header(nskb);
	icmph = skb_put_zero(nskb, sizeof(struct icmphdr));
	icmph->type = ICMP_DEST_UNREACH;
	icmph->code = code;

	skb_put_data(nskb, skb_network_header(oldskb), len);

	/* The ICMP checksum covers the ICMP header plus the quoted bytes. */
	csum = csum_partial((void *)icmph, len + sizeof(struct icmphdr), 0);
	icmph->checksum = csum_fold(csum);

	niph->tot_len = htons(nskb->len);
	ip_send_check(niph);

	return nskb;
}
EXPORT_SYMBOL_GPL(nf_reject_skb_v4_unreach);

/**
 * nf_reject_ip_tcphdr_get - fetch and vet the TCP header of @oldskb
 * @oldskb: the offending packet
 * @_oth:   caller-provided buffer handed to skb_header_pointer()
 * @hook:   netfilter hook, passed on to nf_ip_checksum()
 *
 * Return: the TCP header, or NULL if @oldskb is a non-first fragment, is
 * not TCP, is too short, is itself an RST, or fails the checksum check.
 */
const struct tcphdr *nf_reject_ip_tcphdr_get(struct sk_buff *oldskb,
					     struct tcphdr *_oth, int hook)
{
	const struct tcphdr *oth;

	/* IP header checks: fragment. */
	if (ip_hdr(oldskb)->frag_off & htons(IP_OFFSET))
		return NULL;

	if (ip_hdr(oldskb)->protocol != IPPROTO_TCP)
		return NULL;

	oth = skb_header_pointer(oldskb, ip_hdrlen(oldskb),
				 sizeof(struct tcphdr), _oth);
	if (oth == NULL)
		return NULL;

	/* No RST for RST. */
	if (oth->rst)
		return NULL;

	/* Check checksum */
	if (nf_ip_checksum(oldskb, hook, ip_hdrlen(oldskb), IPPROTO_TCP))
		return NULL;

	return oth;
}
EXPORT_SYMBOL_GPL(nf_reject_ip_tcphdr_get);

/**
 * nf_reject_iphdr_put - append a reply IPv4 header to @nskb
 * @nskb:     skb under construction
 * @oldskb:   the offending packet; its addresses are swapped into the reply
 * @protocol: value for the header's protocol field
 * @ttl:      value for the header's ttl field
 *
 * Sets DF, zeroes tos/id/check and marks @nskb as ETH_P_IP.  tot_len is not
 * written and check is left 0; the caller fills both in once the packet
 * length is known.
 *
 * Return: pointer to the new header inside @nskb.
 */
struct iphdr *nf_reject_iphdr_put(struct sk_buff *nskb,
				  const struct sk_buff *oldskb,
				  __u8 protocol, int ttl)
{
	struct iphdr *niph, *oiph = ip_hdr(oldskb);

	skb_reset_network_header(nskb);
	niph = skb_put(nskb, sizeof(struct iphdr));
	niph->version	= 4;
	niph->ihl	= sizeof(struct iphdr) / 4;
	niph->tos	= 0;
	niph->id	= 0;
	niph->frag_off	= htons(IP_DF);
	niph->protocol	= protocol;
	niph->check	= 0;
	niph->saddr	= oiph->daddr;
	niph->daddr	= oiph->saddr;
	niph->ttl	= ttl;

	nskb->protocol = htons(ETH_P_IP);

	return niph;
}
EXPORT_SYMBOL_GPL(nf_reject_iphdr_put);

/**
 * nf_reject_ip_tcphdr_put - append a TCP RST header to @nskb
 * @nskb:   skb under construction; its IP header must already be in place
 * @oldskb: the offending packet
 * @oth:    TCP header of @oldskb
 *
 * Ports are swapped.  If the original segment carried ACK, the RST takes
 * its sequence number from that ack_seq; otherwise the RST acknowledges the
 * original sequence number advanced by SYN, FIN and the payload length.
 * The checksum is left for CHECKSUM_PARTIAL completion.
 */
void nf_reject_ip_tcphdr_put(struct sk_buff *nskb, const struct sk_buff *oldskb,
			     const struct tcphdr *oth)
{
	struct iphdr *niph = ip_hdr(nskb);
	struct tcphdr *tcph;

	skb_reset_transport_header(nskb);
	tcph = skb_put_zero(nskb, sizeof(struct tcphdr));
	tcph->source	= oth->dest;
	tcph->dest	= oth->source;
	tcph->doff	= sizeof(struct tcphdr) / 4;

	if (oth->ack) {
		tcph->seq = oth->ack_seq;
	} else {
		/* payload length = skb length - IP header - TCP header */
		tcph->ack_seq = htonl(ntohl(oth->seq) + oth->syn + oth->fin +
				      oldskb->len - ip_hdrlen(oldskb) -
				      (oth->doff << 2));
		tcph->ack = 1;
	}

	tcph->rst	= 1;
	/* Seed with the pseudo-header sum; csum_start/csum_offset locate the field. */
	tcph->check = ~tcp_v4_check(sizeof(struct tcphdr), niph->saddr,
				    niph->daddr, 0);
	nskb->ip_summed = CHECKSUM_PARTIAL;
	nskb->csum_start = (unsigned char *)tcph - nskb->head;
	nskb->csum_offset = offsetof(struct tcphdr, check);
}
EXPORT_SYMBOL_GPL(nf_reject_ip_tcphdr_put);

/*
 * nf_reject_fill_skb_dst - attach a route towards the sender to @skb_in
 *
 * Looks up a route whose destination is @skb_in's source address and sets
 * it as the skb's dst.
 *
 * Return: 0 on success, -1 if the lookup produced no dst.
 */
static int nf_reject_fill_skb_dst(struct sk_buff *skb_in)
{
	struct dst_entry *dst = NULL;
	struct flowi fl;

	memset(&fl, 0, sizeof(struct flowi));
	fl.u.ip4.daddr = ip_hdr(skb_in)->saddr;
	nf_ip_route(dev_net(skb_in->dev), &dst, &fl, false);
	if (!dst)
		return -1;

	skb_dst_set(skb_in, dst);
	return 0;
}

/* Send RST reply */
/*
 * @net:    namespace used for routing, the reply mark and transmission
 * @sk:     socket passed to ip_route_me_harder()
 * @oldskb: the offending packet
 * @hook:   netfilter hook the packet was seen at
 *
 * Silently does nothing if the packet does not qualify for an RST, has no
 * route back (PRE_ROUTING/INGRESS), was broadcast/multicast, or if building
 * or routing the reply fails.
 */
void nf_send_reset(struct net *net, struct sock *sk, struct sk_buff *oldskb,
		   int hook)
{
	const struct tcphdr *oth;
	struct sk_buff *nskb;
	struct tcphdr _oth;

	oth = nf_reject_ip_tcphdr_get(oldskb, &_oth, hook);
	if (!oth)
		return;

	/* At these hooks a dst is looked up here before skb_rtable() is used. */
	if ((hook == NF_INET_PRE_ROUTING || hook == NF_INET_INGRESS) &&
	    nf_reject_fill_skb_dst(oldskb) < 0)
		return;

	if (skb_rtable(oldskb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		return;

	nskb = alloc_skb(sizeof(struct iphdr) + sizeof(struct tcphdr) +
			 LL_MAX_HEADER, GFP_ATOMIC);
	if (!nskb)
		return;

	/* ip_route_me_harder expects skb->dst to be set */
	skb_dst_set_noref(nskb, skb_dst(oldskb));

	nskb->mark = IP4_REPLY_MARK(net, oldskb->mark);

	skb_reserve(nskb, LL_MAX_HEADER);
	nf_reject_iphdr_put(nskb, oldskb, IPPROTO_TCP,
			    ip4_dst_hoplimit(skb_dst(nskb)));
	nf_reject_ip_tcphdr_put(nskb, oldskb, oth);
	if (ip_route_me_harder(net, sk, nskb, RTN_UNSPEC))
		goto free_nskb;

	/* "Never happens" */
	if (nskb->len > dst_mtu(skb_dst(nskb)))
		goto free_nskb;

	nf_ct_attach(nskb, oldskb);
	nf_ct_set_closing(skb_nfct(oldskb));

#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
	/* If we use ip_local_out for bridged traffic, the MAC source on
	 * the RST will be ours, instead of the destination's.  This confuses
	 * some routers/firewalls, and they drop the packet.  So we need to
	 * build the eth header using the original destination's MAC as the
	 * source, and send the RST packet directly.
	 */
	if (nf_bridge_info_exists(oldskb)) {
		struct ethhdr *oeth = eth_hdr(oldskb);
		struct iphdr *niph = ip_hdr(nskb);
		struct net_device *br_indev;

		br_indev = nf_bridge_get_physindev(oldskb, net);
		if (!br_indev)
			goto free_nskb;

		nskb->dev = br_indev;
		/* This path bypasses ip_local_out(), so finish the IP header here. */
		niph->tot_len = htons(nskb->len);
		ip_send_check(niph);
		if (dev_hard_header(nskb, nskb->dev, ntohs(nskb->protocol),
				    oeth->h_source, oeth->h_dest, nskb->len) < 0)
			goto free_nskb;
		dev_queue_xmit(nskb);
	} else
#endif
		ip_local_out(net, nskb->sk, nskb);

	return;

 free_nskb:
	kfree_skb(nskb);
}
EXPORT_SYMBOL_GPL(nf_send_reset);

/**
 * nf_send_unreach - send an ICMP destination-unreachable for @skb_in
 * @skb_in: the offending packet
 * @code:   ICMP code to send
 * @hook:   netfilter hook the packet was seen at
 *
 * Nothing is sent for non-first fragments, when no route back can be found
 * (PRE_ROUTING/INGRESS), or when a required checksum check fails.
 */
void nf_send_unreach(struct sk_buff *skb_in, int code, int hook)
{
	struct iphdr *iph = ip_hdr(skb_in);
	int dataoff = ip_hdrlen(skb_in);
	u8 proto = iph->protocol;

	if (iph->frag_off & htons(IP_OFFSET))
		return;

	if ((hook == NF_INET_PRE_ROUTING || hook == NF_INET_INGRESS) &&
	    nf_reject_fill_skb_dst(skb_in) < 0)
		return;

	/* No checksum to verify: reply straight away. */
	if (skb_csum_unnecessary(skb_in) ||
	    !nf_reject_verify_csum(skb_in, dataoff, proto)) {
		icmp_send(skb_in, ICMP_DEST_UNREACH, code, 0);
		return;
	}

	if (nf_ip_checksum(skb_in, hook, dataoff, proto) == 0)
		icmp_send(skb_in, ICMP_DEST_UNREACH, code, 0);
}
EXPORT_SYMBOL_GPL(nf_send_unreach);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("IPv4 packet rejection core");
|
52 15326 4790 12886 7567 196 624 4868 1 10753 33 6780 6369 137 21751 8054 37 37 82 6 272 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 
504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_RCULIST_H #define _LINUX_RCULIST_H #ifdef __KERNEL__ /* * RCU-protected list version */ #include <linux/list.h> #include <linux/rcupdate.h> /* * INIT_LIST_HEAD_RCU - Initialize a list_head visible to RCU readers * @list: list to be initialized * * You should instead use INIT_LIST_HEAD() for normal initialization and * cleanup tasks, when readers have no access to the list being initialized. * However, if the list being initialized is visible to readers, you * need to keep the compiler from being too mischievous. 
*/
static inline void INIT_LIST_HEAD_RCU(struct list_head *list)
{
	/* Both links are stored with WRITE_ONCE() and point back at @list. */
	WRITE_ONCE(list->next, list);
	WRITE_ONCE(list->prev, list);
}

/*
 * return the ->next pointer of a list_head in an rcu safe
 * way, we must not access it directly
 *
 * The macro expands to an lvalue: the ->next field viewed through a
 * __rcu-annotated pointer type.
 */
#define list_next_rcu(list)	(*((struct list_head __rcu **)(&(list)->next)))

/*
 * Return the ->prev pointer of a list_head in an rcu safe way. Don't
 * access it directly.
 *
 * Any list traversed with list_bidir_prev_rcu() must never use
 * list_del_rcu().  Doing so will poison the ->prev pointer that
 * list_bidir_prev_rcu() relies on, which will result in segfaults.
 * To prevent these segfaults, use list_bidir_del_rcu() instead
 * of list_del_rcu().
 */
#define list_bidir_prev_rcu(list) (*((struct list_head __rcu **)(&(list)->prev)))

/**
 * list_tail_rcu - returns the prev pointer of the head of the list
 * @head: the head of the list
 *
 * Note: This should only be used with the list header, and even then
 * only if list_del() and similar primitives are not also used on the
 * list header.
 */
#define list_tail_rcu(head)	(*((struct list_head __rcu **)(&(head)->prev)))

/*
 * Check during list traversal that we are within an RCU reader
 */

/*
 * Expands to nothing.  Because it takes exactly one parameter, passing it
 * the "extra" arguments of __list_check_rcu() turns more than one extra
 * argument into a compile error.
 */
#define check_arg_count_one(dummy)

#ifdef CONFIG_PROVE_RCU_LIST
/* Warn unless @cond holds or an RCU read-side lock of any flavour is held. */
#define __list_check_rcu(dummy, cond, extra...)				\
	({								\
	check_arg_count_one(extra);					\
	RCU_LOCKDEP_WARN(!(cond) && !rcu_read_lock_any_held(),		\
			 "RCU-list traversed in non-reader section!");	\
	})

/* Warn unless @cond holds; there is no implicit RCU-reader fallback here. */
#define __list_check_srcu(cond)					 \
	({								 \
	RCU_LOCKDEP_WARN(!(cond),					 \
		"RCU-list traversed without holding the required lock!");\
	})
#else
/* Without CONFIG_PROVE_RCU_LIST only the argument-count check remains. */
#define __list_check_rcu(dummy, cond, extra...)				\
	({ check_arg_count_one(extra); })

#define __list_check_srcu(cond) ({ })
#endif

/*
 * Insert a new entry between two known consecutive entries.
 *
 * This is only for internal list manipulation where we know
 * the prev/next entries already!
*/
static inline void __list_add_rcu(struct list_head *new,
		struct list_head *prev, struct list_head *next)
{
	/* Nothing is linked if the validity check rejects the insertion. */
	if (!__list_add_valid(new, prev, next))
		return;

	/* Set up the new entry's own links before it is made reachable... */
	new->next = next;
	new->prev = prev;
	/* ...then publish it through prev->next with rcu_assign_pointer(). */
	rcu_assign_pointer(list_next_rcu(prev), new);
	/* The backward link is a plain store made after publication. */
	next->prev = new;
}

/**
 * list_add_rcu - add a new entry to rcu-protected list
 * @new: new entry to be added
 * @head: list head to add it after
 *
 * Insert a new entry after the specified head.
 * This is good for implementing stacks.
 *
 * The caller must take whatever precautions are necessary
 * (such as holding appropriate locks) to avoid racing
 * with another list-mutation primitive, such as list_add_rcu()
 * or list_del_rcu(), running on this same list.
 * However, it is perfectly legal to run concurrently with
 * the _rcu list-traversal primitives, such as
 * list_for_each_entry_rcu().
 */
static inline void list_add_rcu(struct list_head *new, struct list_head *head)
{
	/* Insert between @head and the current first entry. */
	__list_add_rcu(new, head, head->next);
}

/**
 * list_add_tail_rcu - add a new entry to rcu-protected list
 * @new: new entry to be added
 * @head: list head to add it before
 *
 * Insert a new entry before the specified head.
 * This is useful for implementing queues.
 *
 * The caller must take whatever precautions are necessary
 * (such as holding appropriate locks) to avoid racing
 * with another list-mutation primitive, such as list_add_tail_rcu()
 * or list_del_rcu(), running on this same list.
 * However, it is perfectly legal to run concurrently with
 * the _rcu list-traversal primitives, such as
 * list_for_each_entry_rcu().
 */
static inline void list_add_tail_rcu(struct list_head *new,
					struct list_head *head)
{
	/* Insert between the current last entry and @head. */
	__list_add_rcu(new, head->prev, head);
}

/**
 * list_del_rcu - deletes entry from list without re-initialization
 * @entry: the element to delete from the list.
 *
 * Note: list_empty() on entry does not return true after this,
 * the entry is in an undefined state. It is useful for RCU based
 * lockfree traversal.
*
 * In particular, it means that we can not poison the forward
 * pointers that may still be used for walking the list.
 *
 * The caller must take whatever precautions are necessary
 * (such as holding appropriate locks) to avoid racing
 * with another list-mutation primitive, such as list_del_rcu()
 * or list_add_rcu(), running on this same list.
 * However, it is perfectly legal to run concurrently with
 * the _rcu list-traversal primitives, such as
 * list_for_each_entry_rcu().
 *
 * Note that the caller is not permitted to immediately free
 * the newly deleted entry.  Instead, either synchronize_rcu()
 * or call_rcu() must be used to defer freeing until an RCU
 * grace period has elapsed.
 */
static inline void list_del_rcu(struct list_head *entry)
{
	__list_del_entry(entry);
	/* Only ->prev is poisoned; ->next is left as it was for readers. */
	entry->prev = LIST_POISON2;
}

/**
 * list_bidir_del_rcu - deletes entry from list without re-initialization
 * @entry: the element to delete from the list.
 *
 * In contrast to list_del_rcu() doesn't poison the prev pointer thus
 * allowing backwards traversal via list_bidir_prev_rcu().
 *
 * Note: list_empty() on entry does not return true after this because
 * the entry is in a special undefined state that permits RCU-based
 * lockfree reverse traversal. In particular this means that we can not
 * poison the forward and backwards pointers that may still be used for
 * walking the list.
 *
 * The caller must take whatever precautions are necessary (such as
 * holding appropriate locks) to avoid racing with another list-mutation
 * primitive, such as list_bidir_del_rcu() or list_add_rcu(), running on
 * this same list. However, it is perfectly legal to run concurrently
 * with the _rcu list-traversal primitives, such as
 * list_for_each_entry_rcu().
 *
 * Note that list_del_rcu() and list_bidir_del_rcu() must not be used on
 * the same list.
 *
 * Note that the caller is not permitted to immediately free
 * the newly deleted entry.
Instead, either synchronize_rcu()
 * or call_rcu() must be used to defer freeing until an RCU
 * grace period has elapsed.
 */
static inline void list_bidir_del_rcu(struct list_head *entry)
{
	/* Unlink only; neither pointer of @entry is poisoned afterwards. */
	__list_del_entry(entry);
}

/**
 * hlist_del_init_rcu - deletes entry from hash list with re-initialization
 * @n: the element to delete from the hash list.
 *
 * Note: hlist_unhashed() on the node returns true after this. It is
 * useful for RCU based read lockfree traversal if the writer side
 * must know if the list entry is still hashed or already unhashed.
 *
 * In particular, it means that we can not poison the forward pointers
 * that may still be used for walking the hash list and we can only
 * zero the pprev pointer so hlist_unhashed() will return true after
 * this.
 *
 * The caller must take whatever precautions are necessary (such as
 * holding appropriate locks) to avoid racing with another
 * list-mutation primitive, such as hlist_add_head_rcu() or
 * hlist_del_rcu(), running on this same list.  However, it is
 * perfectly legal to run concurrently with the _rcu list-traversal
 * primitives, such as hlist_for_each_entry_rcu().
 */
static inline void hlist_del_init_rcu(struct hlist_node *n)
{
	/* A node that is already unhashed is left untouched. */
	if (!hlist_unhashed(n)) {
		__hlist_del(n);
		WRITE_ONCE(n->pprev, NULL);
	}
}

/**
 * list_replace_rcu - replace old entry by new one
 * @old : the element to be replaced
 * @new : the new element to insert
 *
 * The @old entry will be replaced with the @new entry atomically from
 * the perspective of concurrent readers.  It is the caller's responsibility
 * to synchronize with concurrent updaters, if any.
 *
 * Note: @old should not be empty.
 */
static inline void list_replace_rcu(struct list_head *old,
				struct list_head *new)
{
	/* Give @new the same neighbours as @old before it becomes reachable. */
	new->next = old->next;
	new->prev = old->prev;
	/* Publish: the predecessor's ->next now leads to @new instead of @old. */
	rcu_assign_pointer(list_next_rcu(new->prev), new);
	new->next->prev = new;
	/* @old keeps its ->next; only ->prev is poisoned. */
	old->prev = LIST_POISON2;
}

/**
 * __list_splice_init_rcu - join an RCU-protected list into an existing list.
* @list:	the RCU-protected list to splice
 * @prev:	points to the last element of the existing list
 * @next:	points to the first element of the existing list
 * @sync:	synchronize_rcu, synchronize_rcu_expedited, ...
 *
 * The list pointed to by @prev and @next can be RCU-read traversed
 * concurrently with this function.
 *
 * Note that this function blocks.
 *
 * Important note: the caller must take whatever action is necessary to prevent
 * any other updates to the existing list.  In principle, it is possible to
 * modify the list as soon as sync() begins execution. If this sort of thing
 * becomes necessary, an alternative version based on call_rcu() could be
 * created.  But only if -really- needed -- there is no shortage of RCU API
 * members.
 */
static inline void __list_splice_init_rcu(struct list_head *list,
					  struct list_head *prev,
					  struct list_head *next,
					  void (*sync)(void))
{
	/* Capture both ends of the source list before its head is reset. */
	struct list_head *first = list->next;
	struct list_head *last = list->prev;

	/*
	 * "first" and "last" tracking list, so initialize it.  RCU readers
	 * have access to this list, so we must use INIT_LIST_HEAD_RCU()
	 * instead of INIT_LIST_HEAD().
	 */

	INIT_LIST_HEAD_RCU(list);

	/*
	 * At this point, the list body still points to the source list.
	 * Wait for any readers to finish using the list before splicing
	 * the list body into the new list.  Any new readers will see
	 * an empty list.
	 */

	sync();
	ASSERT_EXCLUSIVE_ACCESS(*first);
	ASSERT_EXCLUSIVE_ACCESS(*last);

	/*
	 * Readers are finished with the source list, so perform splice.
	 * The order is important if the new list is global and accessible
	 * to concurrent RCU readers.  Note that RCU readers are not
	 * permitted to traverse the prev pointers without excluding
	 * this function.
	 */

	/* Terminate the spliced body first, then publish its head via @prev. */
	last->next = next;
	rcu_assign_pointer(list_next_rcu(prev), first);
	first->prev = prev;
	next->prev = last;
}

/**
 * list_splice_init_rcu - splice an RCU-protected list into an existing list,
 *                        designed for stacks.
* @list: the RCU-protected list to splice * @head: the place in the existing list to splice the first list into * @sync: synchronize_rcu, synchronize_rcu_expedited, ... */ static inline void list_splice_init_rcu(struct list_head *list, struct list_head *head, void (*sync)(void)) { if (!list_empty(list)) __list_splice_init_rcu(list, head, head->next, sync); } /** * list_splice_tail_init_rcu - splice an RCU-protected list into an existing * list, designed for queues. * @list: the RCU-protected list to splice * @head: the place in the existing list to splice the first list into * @sync: synchronize_rcu, synchronize_rcu_expedited, ... */ static inline void list_splice_tail_init_rcu(struct list_head *list, struct list_head *head, void (*sync)(void)) { if (!list_empty(list)) __list_splice_init_rcu(list, head->prev, head, sync); } /** * list_entry_rcu - get the struct for this entry * @ptr: the &struct list_head pointer. * @type: the type of the struct this is embedded in. * @member: the name of the list_head within the struct. * * This primitive may safely run concurrently with the _rcu list-mutation * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock(). */ #define list_entry_rcu(ptr, type, member) \ container_of(READ_ONCE(ptr), type, member) /* * Where are list_empty_rcu() and list_first_entry_rcu()? * * They do not exist because they would lead to subtle race conditions: * * if (!list_empty_rcu(mylist)) { * struct foo *bar = list_first_entry_rcu(mylist, struct foo, list_member); * do_something(bar); * } * * The list might be non-empty when list_empty_rcu() checks it, but it * might have become empty by the time that list_first_entry_rcu() rereads * the ->next pointer, which would result in a SEGV. * * When not using RCU, it is OK for list_first_entry() to re-read that * pointer because both functions should be protected by some lock that * blocks writers. 
*
 * When using RCU, list_empty() uses READ_ONCE() to fetch the
 * RCU-protected ->next pointer and then compares it to the address of the
 * list head.  However, it neither dereferences this pointer nor provides
 * this pointer to its caller.  Thus, READ_ONCE() suffices (that is,
 * rcu_dereference() is not needed), which means that list_empty() can be
 * used anywhere you would want to use list_empty_rcu().  Just don't
 * expect anything useful to happen if you do a subsequent lockless
 * call to list_first_entry_rcu()!!!
 *
 * See list_first_or_null_rcu for an alternative.
 */

/**
 * list_first_or_null_rcu - get the first element from a list
 * @ptr:        the list head to take the element from.
 * @type:       the type of the struct this is embedded in.
 * @member:     the name of the list_head within the struct.
 *
 * Note that if the list is empty, it returns NULL.
 *
 * ->next is sampled once into a local; that same snapshot is used both for
 * the emptiness test and for the entry that is returned.
 *
 * This primitive may safely run concurrently with the _rcu list-mutation
 * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock().
 */
#define list_first_or_null_rcu(ptr, type, member) \
({ \
	struct list_head *__ptr = (ptr); \
	struct list_head *__next = READ_ONCE(__ptr->next); \
	likely(__ptr != __next) ? list_entry_rcu(__next, type, member) : NULL; \
})

/**
 * list_next_or_null_rcu - get the next element from a list
 * @head:	the head for the list.
 * @ptr:        the list head to take the next element from.
 * @type:       the type of the struct this is embedded in.
 * @member:     the name of the list_head within the struct.
 *
 * Note that if the ptr is at the end of the list, NULL is returned.
 *
 * As above, ->next is sampled once and the snapshot is compared with @head.
 *
 * This primitive may safely run concurrently with the _rcu list-mutation
 * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock().
 */
#define list_next_or_null_rcu(head, ptr, type, member) \
({ \
	struct list_head *__head = (head); \
	struct list_head *__ptr = (ptr); \
	struct list_head *__next = READ_ONCE(__ptr->next); \
	likely(__next != __head) ? list_entry_rcu(__next, type, \
						  member) : NULL; \
})

/**
 * list_for_each_entry_rcu	-	iterate over rcu list of given type
 * @pos:	the type * to use as a loop cursor.
 * @head:	the head for your list.
 * @member:	the name of the list_head within the struct.
 * @cond:	optional lockdep expression if called from non-RCU protection.
 *
 * This list-traversal primitive may safely run concurrently with
 * the _rcu list-mutation primitives such as list_add_rcu()
 * as long as the traversal is guarded by rcu_read_lock().
 *
 * The check is invoked as __list_check_rcu(dummy, ## cond, 0): when @cond
 * is omitted the trailing 0 is taken as the condition, and when @cond is
 * given the 0 becomes the single permitted extra argument.
 */
#define list_for_each_entry_rcu(pos, head, member, cond...)		\
	for (__list_check_rcu(dummy, ## cond, 0),			\
	     pos = list_entry_rcu((head)->next, typeof(*pos), member);	\
		&pos->member != (head);					\
		pos = list_entry_rcu(pos->member.next, typeof(*pos), member))

/**
 * list_for_each_entry_srcu	-	iterate over rcu list of given type
 * @pos:	the type * to use as a loop cursor.
 * @head:	the head for your list.
 * @member:	the name of the list_head within the struct.
 * @cond:	lockdep expression for the lock required to traverse the list.
 *
 * This list-traversal primitive may safely run concurrently with
 * the _rcu list-mutation primitives such as list_add_rcu()
 * as long as the traversal is guarded by srcu_read_lock().
 * The lockdep expression srcu_read_lock_held() can be passed as the
 * cond argument from read side.
 *
 * Unlike list_for_each_entry_rcu(), @cond is mandatory here.
 */
#define list_for_each_entry_srcu(pos, head, member, cond)		\
	for (__list_check_srcu(cond),					\
	     pos = list_entry_rcu((head)->next, typeof(*pos), member);	\
		&pos->member != (head);					\
		pos = list_entry_rcu(pos->member.next, typeof(*pos), member))

/**
 * list_entry_lockless - get the struct for this entry
 * @ptr:        the &struct list_head pointer.
 * @type:       the type of the struct this is embedded in.
 * @member:     the name of the list_head within the struct.
 *
 * This primitive may safely run concurrently with the _rcu
 * list-mutation primitives such as list_add_rcu(), but requires some
 * implicit RCU read-side guarding.
One example is running within a special * exception-time environment where preemption is disabled and where lockdep * cannot be invoked. Another example is when items are added to the list, * but never deleted. */ #define list_entry_lockless(ptr, type, member) \ container_of((typeof(ptr))READ_ONCE(ptr), type, member) /** * list_for_each_entry_lockless - iterate over rcu list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_struct within the struct. * * This primitive may safely run concurrently with the _rcu * list-mutation primitives such as list_add_rcu(), but requires some * implicit RCU read-side guarding. One example is running within a special * exception-time environment where preemption is disabled and where lockdep * cannot be invoked. Another example is when items are added to the list, * but never deleted. */ #define list_for_each_entry_lockless(pos, head, member) \ for (pos = list_entry_lockless((head)->next, typeof(*pos), member); \ &pos->member != (head); \ pos = list_entry_lockless(pos->member.next, typeof(*pos), member)) /** * list_for_each_entry_continue_rcu - continue iteration over list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. * * Continue to iterate over list of given type, continuing after * the current position which must have been in the list when the RCU read * lock was taken. * This would typically require either that you obtained the node from a * previous walk of the list in the same RCU read-side critical section, or * that you held some sort of non-RCU reference (such as a reference count) * to keep the node alive *and* in the list. * * This iterator is similar to list_for_each_entry_from_rcu() except * this starts after the given position and that one starts at the given * position. 
*/ #define list_for_each_entry_continue_rcu(pos, head, member) \ for (pos = list_entry_rcu(pos->member.next, typeof(*pos), member); \ &pos->member != (head); \ pos = list_entry_rcu(pos->member.next, typeof(*pos), member)) /** * list_for_each_entry_from_rcu - iterate over a list from current point * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_node within the struct. * * Iterate over the tail of a list starting from a given position, * which must have been in the list when the RCU read lock was taken. * This would typically require either that you obtained the node from a * previous walk of the list in the same RCU read-side critical section, or * that you held some sort of non-RCU reference (such as a reference count) * to keep the node alive *and* in the list. * * This iterator is similar to list_for_each_entry_continue_rcu() except * this starts from the given position and that one starts from the position * after the given position. */ #define list_for_each_entry_from_rcu(pos, head, member) \ for (; &(pos)->member != (head); \ pos = list_entry_rcu(pos->member.next, typeof(*(pos)), member)) /** * hlist_del_rcu - deletes entry from hash list without re-initialization * @n: the element to delete from the hash list. * * Note: list_unhashed() on entry does not return true after this, * the entry is in an undefined state. It is useful for RCU based * lockfree traversal. * * In particular, it means that we can not poison the forward * pointers that may still be used for walking the hash list. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as hlist_add_head_rcu() * or hlist_del_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * hlist_for_each_entry(). 
*/ static inline void hlist_del_rcu(struct hlist_node *n) { __hlist_del(n); WRITE_ONCE(n->pprev, LIST_POISON2); } /** * hlist_replace_rcu - replace old entry by new one * @old : the element to be replaced * @new : the new element to insert * * The @old entry will be replaced with the @new entry atomically from * the perspective of concurrent readers. It is the caller's responsibility * to synchronize with concurrent updaters, if any. */ static inline void hlist_replace_rcu(struct hlist_node *old, struct hlist_node *new) { struct hlist_node *next = old->next; new->next = next; WRITE_ONCE(new->pprev, old->pprev); rcu_assign_pointer(*(struct hlist_node __rcu **)new->pprev, new); if (next) WRITE_ONCE(new->next->pprev, &new->next); WRITE_ONCE(old->pprev, LIST_POISON2); } /** * hlists_swap_heads_rcu - swap the lists the hlist heads point to * @left: The hlist head on the left * @right: The hlist head on the right * * The lists start out as [@left ][node1 ... ] and * [@right ][node2 ... ] * The lists end up as [@left ][node2 ... ] * [@right ][node1 ... ] */ static inline void hlists_swap_heads_rcu(struct hlist_head *left, struct hlist_head *right) { struct hlist_node *node1 = left->first; struct hlist_node *node2 = right->first; rcu_assign_pointer(left->first, node2); rcu_assign_pointer(right->first, node1); WRITE_ONCE(node2->pprev, &left->first); WRITE_ONCE(node1->pprev, &right->first); } /* * return the first or the next element in an RCU protected hlist */ #define hlist_first_rcu(head) (*((struct hlist_node __rcu **)(&(head)->first))) #define hlist_next_rcu(node) (*((struct hlist_node __rcu **)(&(node)->next))) #define hlist_pprev_rcu(node) (*((struct hlist_node __rcu **)((node)->pprev))) /** * hlist_add_head_rcu * @n: the element to add to the hash list. * @h: the list to add to. * * Description: * Adds the specified element to the specified hlist, * while permitting racing traversals. 
* * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as hlist_add_head_rcu() * or hlist_del_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * hlist_for_each_entry_rcu(), used to prevent memory-consistency * problems on Alpha CPUs. Regardless of the type of CPU, the * list-traversal primitive must be guarded by rcu_read_lock(). */ static inline void hlist_add_head_rcu(struct hlist_node *n, struct hlist_head *h) { struct hlist_node *first = h->first; n->next = first; WRITE_ONCE(n->pprev, &h->first); rcu_assign_pointer(hlist_first_rcu(h), n); if (first) WRITE_ONCE(first->pprev, &n->next); } /** * hlist_add_tail_rcu * @n: the element to add to the hash list. * @h: the list to add to. * * Description: * Adds the specified element to the specified hlist, * while permitting racing traversals. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as hlist_add_head_rcu() * or hlist_del_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * hlist_for_each_entry_rcu(), used to prevent memory-consistency * problems on Alpha CPUs. Regardless of the type of CPU, the * list-traversal primitive must be guarded by rcu_read_lock(). */ static inline void hlist_add_tail_rcu(struct hlist_node *n, struct hlist_head *h) { struct hlist_node *i, *last = NULL; /* Note: write side code, so rcu accessors are not needed. */ for (i = h->first; i; i = i->next) last = i; if (last) { n->next = last->next; WRITE_ONCE(n->pprev, &last->next); rcu_assign_pointer(hlist_next_rcu(last), n); } else { hlist_add_head_rcu(n, h); } } /** * hlist_add_before_rcu * @n: the new element to add to the hash list. 
* @next: the existing element to add the new element before. * * Description: * Adds the specified element to the specified hlist * before the specified node while permitting racing traversals. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as hlist_add_head_rcu() * or hlist_del_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * hlist_for_each_entry_rcu(), used to prevent memory-consistency * problems on Alpha CPUs. */ static inline void hlist_add_before_rcu(struct hlist_node *n, struct hlist_node *next) { WRITE_ONCE(n->pprev, next->pprev); n->next = next; rcu_assign_pointer(hlist_pprev_rcu(n), n); WRITE_ONCE(next->pprev, &n->next); } /** * hlist_add_behind_rcu * @n: the new element to add to the hash list. * @prev: the existing element to add the new element after. * * Description: * Adds the specified element to the specified hlist * after the specified node while permitting racing traversals. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as hlist_add_head_rcu() * or hlist_del_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * hlist_for_each_entry_rcu(), used to prevent memory-consistency * problems on Alpha CPUs. 
*/
static inline void hlist_add_behind_rcu(struct hlist_node *n,
					struct hlist_node *prev)
{
	/* Fully initialise @n before it is linked in after @prev. */
	n->next = prev->next;
	WRITE_ONCE(n->pprev, &prev->next);
	rcu_assign_pointer(hlist_next_rcu(prev), n);
	/* Fix the back-pointer of the old successor, if there is one. */
	if (n->next)
		WRITE_ONCE(n->next->pprev, &n->next);
}

/*
 * Iterate over the raw hlist_node entries of @head, fetching the first and
 * every following node pointer with rcu_dereference().
 */
#define __hlist_for_each_rcu(pos, head)				\
	for (pos = rcu_dereference(hlist_first_rcu(head));	\
	     pos;						\
	     pos = rcu_dereference(hlist_next_rcu(pos)))

/**
 * hlist_for_each_entry_rcu - iterate over rcu list of given type
 * @pos:	the type * to use as a loop cursor.
 * @head:	the head for your list.
 * @member:	the name of the hlist_node within the struct.
 * @cond:	optional lockdep expression if called from non-RCU protection.
 *
 * This list-traversal primitive may safely run concurrently with
 * the _rcu list-mutation primitives such as hlist_add_head_rcu()
 * as long as the traversal is guarded by rcu_read_lock().
 */
#define hlist_for_each_entry_rcu(pos, head, member, cond...)		\
	for (__list_check_rcu(dummy, ## cond, 0),			\
	     pos = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),\
			typeof(*(pos)), member);			\
		pos;							\
		pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(\
			&(pos)->member)), typeof(*(pos)), member))

/**
 * hlist_for_each_entry_srcu - iterate over rcu list of given type
 * @pos:	the type * to use as a loop cursor.
 * @head:	the head for your list.
 * @member:	the name of the hlist_node within the struct.
 * @cond:	lockdep expression for the lock required to traverse the list.
 *
 * This list-traversal primitive may safely run concurrently with
 * the _rcu list-mutation primitives such as hlist_add_head_rcu()
 * as long as the traversal is guarded by srcu_read_lock().
 * The lockdep expression srcu_read_lock_held() can be passed as the
 * cond argument from read side.
 */
#define hlist_for_each_entry_srcu(pos, head, member, cond)		\
	for (__list_check_srcu(cond),					\
	     pos = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),\
			typeof(*(pos)), member);			\
		pos;							\
		pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(\
			&(pos)->member)), typeof(*(pos)), member))

/**
 * hlist_for_each_entry_rcu_notrace - iterate over rcu list of given type (for tracing)
 * @pos:	the type * to use as a loop cursor.
 * @head:	the head for your list.
 * @member:	the name of the hlist_node within the struct.
 *
 * This list-traversal primitive may safely run concurrently with
 * the _rcu list-mutation primitives such as hlist_add_head_rcu()
 * as long as the traversal is guarded by rcu_read_lock().
 *
 * This is the same as hlist_for_each_entry_rcu() except that it does
 * not do any RCU debugging or tracing.
 */
#define hlist_for_each_entry_rcu_notrace(pos, head, member)			\
	for (pos = hlist_entry_safe(rcu_dereference_raw_check(hlist_first_rcu(head)),\
			typeof(*(pos)), member);			\
		pos;							\
		pos = hlist_entry_safe(rcu_dereference_raw_check(hlist_next_rcu(\
			&(pos)->member)), typeof(*(pos)), member))

/**
 * hlist_for_each_entry_rcu_bh - iterate over rcu list of given type
 * @pos:	the type * to use as a loop cursor.
 * @head:	the head for your list.
 * @member:	the name of the hlist_node within the struct.
 *
 * This list-traversal primitive may safely run concurrently with
 * the _rcu list-mutation primitives such as hlist_add_head_rcu()
 * as long as the traversal is guarded by rcu_read_lock().
 *
 * NOTE(review): the pointers are fetched with rcu_dereference_bh(), so the
 * read-side section is presumably expected to be the _bh flavour -- confirm.
 */
#define hlist_for_each_entry_rcu_bh(pos, head, member)			\
	for (pos = hlist_entry_safe(rcu_dereference_bh(hlist_first_rcu(head)),\
			typeof(*(pos)), member);			\
		pos;							\
		pos = hlist_entry_safe(rcu_dereference_bh(hlist_next_rcu(\
			&(pos)->member)), typeof(*(pos)), member))

/**
 * hlist_for_each_entry_continue_rcu - iterate over a hlist continuing after current point
 * @pos:	the type * to use as a loop cursor.
 * @member:	the name of the hlist_node within the struct.
 *
 * The first entry visited is the one following @pos; @pos itself is skipped.
 */
#define hlist_for_each_entry_continue_rcu(pos, member)			\
	for (pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu( \
			&(pos)->member)), typeof(*(pos)), member);	\
	     pos;							\
	     pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(	\
			&(pos)->member)), typeof(*(pos)), member))

/**
 * hlist_for_each_entry_continue_rcu_bh - iterate over a hlist continuing after current point
 * @pos:	the type * to use as a loop cursor.
 * @member:	the name of the hlist_node within the struct.
 *
 * The first entry visited is the one following @pos; @pos itself is skipped.
 */
#define hlist_for_each_entry_continue_rcu_bh(pos, member)		\
	for (pos = hlist_entry_safe(rcu_dereference_bh(hlist_next_rcu(  \
			&(pos)->member)), typeof(*(pos)), member);	\
	     pos;							\
	     pos = hlist_entry_safe(rcu_dereference_bh(hlist_next_rcu(	\
			&(pos)->member)), typeof(*(pos)), member))

/**
 * hlist_for_each_entry_from_rcu - iterate over a hlist continuing from current point
 * @pos:	the type * to use as a loop cursor.
 * @member:	the name of the hlist_node within the struct.
 *
 * The first entry visited is @pos itself; nothing is visited if @pos is NULL.
 */
#define hlist_for_each_entry_from_rcu(pos, member)			\
	for (; pos;							\
	     pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(	\
			&(pos)->member)), typeof(*(pos)), member))

#endif	/* __KERNEL__ */
#endif |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 | /* SPDX-License-Identifier: GPL-2.0 */ /* * linux/include/linux/sunrpc/addr.h * * Various routines for copying and comparing sockaddrs and for * converting them to and from presentation format. */ #ifndef _LINUX_SUNRPC_ADDR_H #define _LINUX_SUNRPC_ADDR_H #include <linux/socket.h> #include <linux/in.h> #include <linux/in6.h> #include <net/ipv6.h> size_t rpc_ntop(const struct sockaddr *, char *, const size_t); size_t rpc_pton(struct net *, const char *, const size_t, struct sockaddr *, const size_t); char * rpc_sockaddr2uaddr(const struct sockaddr *, gfp_t); size_t rpc_uaddr2sockaddr(struct net *, const char *, const size_t, struct sockaddr *, const size_t); static inline unsigned short rpc_get_port(const struct sockaddr *sap) { switch (sap->sa_family) { case AF_INET: return ntohs(((struct sockaddr_in *)sap)->sin_port); case AF_INET6: return ntohs(((struct sockaddr_in6 *)sap)->sin6_port); } return 0; } static inline void rpc_set_port(struct sockaddr *sap, const unsigned short port) { switch (sap->sa_family) { case AF_INET: ((struct sockaddr_in *)sap)->sin_port = htons(port); break; case AF_INET6: ((struct sockaddr_in6 *)sap)->sin6_port = htons(port); break; } } #define IPV6_SCOPE_DELIMITER '%' #define IPV6_SCOPE_ID_LEN sizeof("%nnnnnnnnnn") static inline bool rpc_cmp_addr4(const struct sockaddr *sap1, const struct sockaddr *sap2) { const struct 
sockaddr_in *sin1 = (const struct sockaddr_in *)sap1; const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sap2; return sin1->sin_addr.s_addr == sin2->sin_addr.s_addr; } static inline bool __rpc_copy_addr4(struct sockaddr *dst, const struct sockaddr *src) { const struct sockaddr_in *ssin = (struct sockaddr_in *) src; struct sockaddr_in *dsin = (struct sockaddr_in *) dst; dsin->sin_family = ssin->sin_family; dsin->sin_addr.s_addr = ssin->sin_addr.s_addr; return true; } #if IS_ENABLED(CONFIG_IPV6) static inline bool rpc_cmp_addr6(const struct sockaddr *sap1, const struct sockaddr *sap2) { const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sap1; const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sap2; if (!ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr)) return false; else if (ipv6_addr_type(&sin1->sin6_addr) & IPV6_ADDR_LINKLOCAL) return sin1->sin6_scope_id == sin2->sin6_scope_id; return true; } static inline bool __rpc_copy_addr6(struct sockaddr *dst, const struct sockaddr *src) { const struct sockaddr_in6 *ssin6 = (const struct sockaddr_in6 *) src; struct sockaddr_in6 *dsin6 = (struct sockaddr_in6 *) dst; dsin6->sin6_family = ssin6->sin6_family; dsin6->sin6_addr = ssin6->sin6_addr; dsin6->sin6_scope_id = ssin6->sin6_scope_id; return true; } #else /* !(IS_ENABLED(CONFIG_IPV6) */ static inline bool rpc_cmp_addr6(const struct sockaddr *sap1, const struct sockaddr *sap2) { return false; } static inline bool __rpc_copy_addr6(struct sockaddr *dst, const struct sockaddr *src) { return false; } #endif /* !(IS_ENABLED(CONFIG_IPV6) */ /** * rpc_cmp_addr - compare the address portion of two sockaddrs. * @sap1: first sockaddr * @sap2: second sockaddr * * Just compares the family and address portion. Ignores port, but * compares the scope if it's a link-local address. * * Returns true if the addrs are equal, false if they aren't. 
*/ static inline bool rpc_cmp_addr(const struct sockaddr *sap1, const struct sockaddr *sap2) { if (sap1->sa_family == sap2->sa_family) { switch (sap1->sa_family) { case AF_INET: return rpc_cmp_addr4(sap1, sap2); case AF_INET6: return rpc_cmp_addr6(sap1, sap2); } } return false; } /** * rpc_cmp_addr_port - compare the address and port number of two sockaddrs. * @sap1: first sockaddr * @sap2: second sockaddr */ static inline bool rpc_cmp_addr_port(const struct sockaddr *sap1, const struct sockaddr *sap2) { if (!rpc_cmp_addr(sap1, sap2)) return false; return rpc_get_port(sap1) == rpc_get_port(sap2); } /** * rpc_copy_addr - copy the address portion of one sockaddr to another * @dst: destination sockaddr * @src: source sockaddr * * Just copies the address portion and family. Ignores port, scope, etc. * Caller is responsible for making certain that dst is large enough to hold * the address in src. Returns true if address family is supported. Returns * false otherwise. */ static inline bool rpc_copy_addr(struct sockaddr *dst, const struct sockaddr *src) { switch (src->sa_family) { case AF_INET: return __rpc_copy_addr4(dst, src); case AF_INET6: return __rpc_copy_addr6(dst, src); } return false; } /** * rpc_get_scope_id - return scopeid for a given sockaddr * @sa: sockaddr to get scopeid from * * Returns the value of the sin6_scope_id for AF_INET6 addrs, or 0 if * not an AF_INET6 address. */ static inline u32 rpc_get_scope_id(const struct sockaddr *sa) { if (sa->sa_family != AF_INET6) return 0; return ((struct sockaddr_in6 *) sa)->sin6_scope_id; } #endif /* _LINUX_SUNRPC_ADDR_H */ |
469 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 
526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 | // SPDX-License-Identifier: GPL-2.0 /* Bareudp: UDP tunnel encasulation for different Payload types like * MPLS, NSH, IP, etc. * Copyright (c) 2019 Nokia, Inc. 
* Authors: Martin Varghese, <martin.varghese@nokia.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> #include <linux/module.h> #include <linux/etherdevice.h> #include <linux/hash.h> #include <net/dst_metadata.h> #include <net/gro_cells.h> #include <net/rtnetlink.h> #include <net/protocol.h> #include <net/ip6_tunnel.h> #include <net/ip_tunnels.h> #include <net/udp_tunnel.h> #include <net/bareudp.h> #define BAREUDP_BASE_HLEN sizeof(struct udphdr) #define BAREUDP_IPV4_HLEN (sizeof(struct iphdr) + \ sizeof(struct udphdr)) #define BAREUDP_IPV6_HLEN (sizeof(struct ipv6hdr) + \ sizeof(struct udphdr)) static bool log_ecn_error = true; module_param(log_ecn_error, bool, 0644); MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); /* per-network namespace private data for this module */ static unsigned int bareudp_net_id; struct bareudp_net { struct list_head bareudp_list; }; struct bareudp_conf { __be16 ethertype; __be16 port; u16 sport_min; bool multi_proto_mode; }; /* Pseudo network device */ struct bareudp_dev { struct net *net; /* netns for packet i/o */ struct net_device *dev; /* netdev for bareudp tunnel */ __be16 ethertype; __be16 port; u16 sport_min; bool multi_proto_mode; struct socket __rcu *sock; struct list_head next; /* bareudp node on namespace list */ struct gro_cells gro_cells; }; static int bareudp_udp_encap_recv(struct sock *sk, struct sk_buff *skb) { struct metadata_dst *tun_dst = NULL; IP_TUNNEL_DECLARE_FLAGS(key) = { }; struct bareudp_dev *bareudp; unsigned short family; unsigned int len; __be16 proto; void *oiph; int err; int nh; bareudp = rcu_dereference_sk_user_data(sk); if (!bareudp) goto drop; if (skb->protocol == htons(ETH_P_IP)) family = AF_INET; else family = AF_INET6; if (bareudp->ethertype == htons(ETH_P_IP)) { __u8 ipversion; if (skb_copy_bits(skb, BAREUDP_BASE_HLEN, &ipversion, sizeof(ipversion))) { dev_dstats_rx_dropped(bareudp->dev); goto drop; } ipversion >>= 4; if (ipversion == 4) { proto 
= htons(ETH_P_IP); } else if (ipversion == 6 && bareudp->multi_proto_mode) { proto = htons(ETH_P_IPV6); } else { dev_dstats_rx_dropped(bareudp->dev); goto drop; } } else if (bareudp->ethertype == htons(ETH_P_MPLS_UC)) { struct iphdr *tunnel_hdr; tunnel_hdr = (struct iphdr *)skb_network_header(skb); if (tunnel_hdr->version == 4) { if (!ipv4_is_multicast(tunnel_hdr->daddr)) { proto = bareudp->ethertype; } else if (bareudp->multi_proto_mode && ipv4_is_multicast(tunnel_hdr->daddr)) { proto = htons(ETH_P_MPLS_MC); } else { dev_dstats_rx_dropped(bareudp->dev); goto drop; } } else { int addr_type; struct ipv6hdr *tunnel_hdr_v6; tunnel_hdr_v6 = (struct ipv6hdr *)skb_network_header(skb); addr_type = ipv6_addr_type((struct in6_addr *)&tunnel_hdr_v6->daddr); if (!(addr_type & IPV6_ADDR_MULTICAST)) { proto = bareudp->ethertype; } else if (bareudp->multi_proto_mode && (addr_type & IPV6_ADDR_MULTICAST)) { proto = htons(ETH_P_MPLS_MC); } else { dev_dstats_rx_dropped(bareudp->dev); goto drop; } } } else { proto = bareudp->ethertype; } if (iptunnel_pull_header(skb, BAREUDP_BASE_HLEN, proto, !net_eq(bareudp->net, dev_net(bareudp->dev)))) { dev_dstats_rx_dropped(bareudp->dev); goto drop; } __set_bit(IP_TUNNEL_KEY_BIT, key); tun_dst = udp_tun_rx_dst(skb, family, key, 0, 0); if (!tun_dst) { dev_dstats_rx_dropped(bareudp->dev); goto drop; } skb_dst_set(skb, &tun_dst->dst); skb->dev = bareudp->dev; skb_reset_mac_header(skb); /* Save offset of outer header relative to skb->head, * because we are going to reset the network header to the inner header * and might change skb->head. */ nh = skb_network_header(skb) - skb->head; skb_reset_network_header(skb); if (!pskb_inet_may_pull(skb)) { DEV_STATS_INC(bareudp->dev, rx_length_errors); DEV_STATS_INC(bareudp->dev, rx_errors); goto drop; } /* Get the outer header. 
*/ oiph = skb->head + nh; if (!ipv6_mod_enabled() || family == AF_INET) err = IP_ECN_decapsulate(oiph, skb); else err = IP6_ECN_decapsulate(oiph, skb); if (unlikely(err)) { if (log_ecn_error) { if (!ipv6_mod_enabled() || family == AF_INET) net_info_ratelimited("non-ECT from %pI4 " "with TOS=%#x\n", &((struct iphdr *)oiph)->saddr, ((struct iphdr *)oiph)->tos); else net_info_ratelimited("non-ECT from %pI6\n", &((struct ipv6hdr *)oiph)->saddr); } if (err > 1) { DEV_STATS_INC(bareudp->dev, rx_frame_errors); DEV_STATS_INC(bareudp->dev, rx_errors); goto drop; } } len = skb->len; err = gro_cells_receive(&bareudp->gro_cells, skb); if (likely(err == NET_RX_SUCCESS)) dev_dstats_rx_add(bareudp->dev, len); return 0; drop: /* Consume bad packet */ kfree_skb(skb); return 0; } static int bareudp_err_lookup(struct sock *sk, struct sk_buff *skb) { return 0; } static int bareudp_init(struct net_device *dev) { struct bareudp_dev *bareudp = netdev_priv(dev); int err; err = gro_cells_init(&bareudp->gro_cells, dev); if (err) return err; return 0; } static void bareudp_uninit(struct net_device *dev) { struct bareudp_dev *bareudp = netdev_priv(dev); gro_cells_destroy(&bareudp->gro_cells); } static struct socket *bareudp_create_sock(struct net *net, __be16 port) { struct udp_port_cfg udp_conf; struct socket *sock; int err; memset(&udp_conf, 0, sizeof(udp_conf)); if (ipv6_mod_enabled()) udp_conf.family = AF_INET6; else udp_conf.family = AF_INET; udp_conf.local_udp_port = port; /* Open UDP socket */ err = udp_sock_create(net, &udp_conf, &sock); if (err < 0) return ERR_PTR(err); udp_allow_gso(sock->sk); return sock; } /* Create new listen socket if needed */ static int bareudp_socket_create(struct bareudp_dev *bareudp, __be16 port) { struct udp_tunnel_sock_cfg tunnel_cfg; struct socket *sock; sock = bareudp_create_sock(bareudp->net, port); if (IS_ERR(sock)) return PTR_ERR(sock); /* Mark socket as an encapsulation socket */ memset(&tunnel_cfg, 0, sizeof(tunnel_cfg)); tunnel_cfg.sk_user_data = 
bareudp; tunnel_cfg.encap_type = 1; tunnel_cfg.encap_rcv = bareudp_udp_encap_recv; tunnel_cfg.encap_err_lookup = bareudp_err_lookup; tunnel_cfg.encap_destroy = NULL; setup_udp_tunnel_sock(bareudp->net, sock, &tunnel_cfg); rcu_assign_pointer(bareudp->sock, sock); return 0; } static int bareudp_open(struct net_device *dev) { struct bareudp_dev *bareudp = netdev_priv(dev); int ret = 0; ret = bareudp_socket_create(bareudp, bareudp->port); return ret; } static void bareudp_sock_release(struct bareudp_dev *bareudp) { struct socket *sock; sock = bareudp->sock; rcu_assign_pointer(bareudp->sock, NULL); synchronize_net(); udp_tunnel_sock_release(sock); } static int bareudp_stop(struct net_device *dev) { struct bareudp_dev *bareudp = netdev_priv(dev); bareudp_sock_release(bareudp); return 0; } static int bareudp_xmit_skb(struct sk_buff *skb, struct net_device *dev, struct bareudp_dev *bareudp, const struct ip_tunnel_info *info) { bool udp_sum = test_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags); bool xnet = !net_eq(bareudp->net, dev_net(bareudp->dev)); bool use_cache = ip_tunnel_dst_cache_usable(skb, info); struct socket *sock = rcu_dereference(bareudp->sock); const struct ip_tunnel_key *key = &info->key; struct rtable *rt; __be16 sport, df; int min_headroom; __u8 tos, ttl; __be32 saddr; int err; if (skb_vlan_inet_prepare(skb, skb->protocol != htons(ETH_P_TEB))) return -EINVAL; if (!sock) return -ESHUTDOWN; sport = udp_flow_src_port(bareudp->net, skb, bareudp->sport_min, USHRT_MAX, true); rt = udp_tunnel_dst_lookup(skb, dev, bareudp->net, 0, &saddr, &info->key, sport, bareudp->port, key->tos, use_cache ? (struct dst_cache *)&info->dst_cache : NULL); if (IS_ERR(rt)) return PTR_ERR(rt); skb_tunnel_check_pmtu(skb, &rt->dst, BAREUDP_IPV4_HLEN + info->options_len, false); tos = ip_tunnel_ecn_encap(key->tos, ip_hdr(skb), skb); ttl = key->ttl; df = test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, key->tun_flags) ? 
htons(IP_DF) : 0; skb_scrub_packet(skb, xnet); err = -ENOSPC; if (!skb_pull(skb, skb_network_offset(skb))) goto free_dst; min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len + BAREUDP_BASE_HLEN + info->options_len + sizeof(struct iphdr); err = skb_cow_head(skb, min_headroom); if (unlikely(err)) goto free_dst; err = udp_tunnel_handle_offloads(skb, udp_sum); if (err) goto free_dst; skb_set_inner_protocol(skb, bareudp->ethertype); udp_tunnel_xmit_skb(rt, sock->sk, skb, saddr, info->key.u.ipv4.dst, tos, ttl, df, sport, bareudp->port, !net_eq(bareudp->net, dev_net(bareudp->dev)), !test_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags)); return 0; free_dst: dst_release(&rt->dst); return err; } static int bareudp6_xmit_skb(struct sk_buff *skb, struct net_device *dev, struct bareudp_dev *bareudp, const struct ip_tunnel_info *info) { bool udp_sum = test_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags); bool xnet = !net_eq(bareudp->net, dev_net(bareudp->dev)); bool use_cache = ip_tunnel_dst_cache_usable(skb, info); struct socket *sock = rcu_dereference(bareudp->sock); const struct ip_tunnel_key *key = &info->key; struct dst_entry *dst = NULL; struct in6_addr saddr, daddr; int min_headroom; __u8 prio, ttl; __be16 sport; int err; if (skb_vlan_inet_prepare(skb, skb->protocol != htons(ETH_P_TEB))) return -EINVAL; if (!sock) return -ESHUTDOWN; sport = udp_flow_src_port(bareudp->net, skb, bareudp->sport_min, USHRT_MAX, true); dst = udp_tunnel6_dst_lookup(skb, dev, bareudp->net, sock, 0, &saddr, key, sport, bareudp->port, key->tos, use_cache ? 
(struct dst_cache *) &info->dst_cache : NULL); if (IS_ERR(dst)) return PTR_ERR(dst); skb_tunnel_check_pmtu(skb, dst, BAREUDP_IPV6_HLEN + info->options_len, false); prio = ip_tunnel_ecn_encap(key->tos, ip_hdr(skb), skb); ttl = key->ttl; skb_scrub_packet(skb, xnet); err = -ENOSPC; if (!skb_pull(skb, skb_network_offset(skb))) goto free_dst; min_headroom = LL_RESERVED_SPACE(dst->dev) + dst->header_len + BAREUDP_BASE_HLEN + info->options_len + sizeof(struct ipv6hdr); err = skb_cow_head(skb, min_headroom); if (unlikely(err)) goto free_dst; err = udp_tunnel_handle_offloads(skb, udp_sum); if (err) goto free_dst; daddr = info->key.u.ipv6.dst; udp_tunnel6_xmit_skb(dst, sock->sk, skb, dev, &saddr, &daddr, prio, ttl, info->key.label, sport, bareudp->port, !test_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags)); return 0; free_dst: dst_release(dst); return err; } static bool bareudp_proto_valid(struct bareudp_dev *bareudp, __be16 proto) { if (bareudp->ethertype == proto) return true; if (!bareudp->multi_proto_mode) return false; if (bareudp->ethertype == htons(ETH_P_MPLS_UC) && proto == htons(ETH_P_MPLS_MC)) return true; if (bareudp->ethertype == htons(ETH_P_IP) && proto == htons(ETH_P_IPV6)) return true; return false; } static netdev_tx_t bareudp_xmit(struct sk_buff *skb, struct net_device *dev) { struct bareudp_dev *bareudp = netdev_priv(dev); struct ip_tunnel_info *info = NULL; int err; if (!bareudp_proto_valid(bareudp, skb->protocol)) { err = -EINVAL; goto tx_error; } info = skb_tunnel_info(skb); if (unlikely(!info || !(info->mode & IP_TUNNEL_INFO_TX))) { err = -EINVAL; goto tx_error; } rcu_read_lock(); if (ipv6_mod_enabled() && info->mode & IP_TUNNEL_INFO_IPV6) err = bareudp6_xmit_skb(skb, dev, bareudp, info); else err = bareudp_xmit_skb(skb, dev, bareudp, info); rcu_read_unlock(); if (likely(!err)) return NETDEV_TX_OK; tx_error: dev_kfree_skb(skb); if (err == -ELOOP) DEV_STATS_INC(dev, collisions); else if (err == -ENETUNREACH) DEV_STATS_INC(dev, tx_carrier_errors); 
DEV_STATS_INC(dev, tx_errors); return NETDEV_TX_OK; } static int bareudp_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) { struct ip_tunnel_info *info = skb_tunnel_info(skb); struct bareudp_dev *bareudp = netdev_priv(dev); bool use_cache; __be16 sport; use_cache = ip_tunnel_dst_cache_usable(skb, info); sport = udp_flow_src_port(bareudp->net, skb, bareudp->sport_min, USHRT_MAX, true); if (!ipv6_mod_enabled() || ip_tunnel_info_af(info) == AF_INET) { struct rtable *rt; __be32 saddr; rt = udp_tunnel_dst_lookup(skb, dev, bareudp->net, 0, &saddr, &info->key, sport, bareudp->port, info->key.tos, use_cache ? &info->dst_cache : NULL); if (IS_ERR(rt)) return PTR_ERR(rt); ip_rt_put(rt); info->key.u.ipv4.src = saddr; } else if (ip_tunnel_info_af(info) == AF_INET6) { struct dst_entry *dst; struct in6_addr saddr; struct socket *sock = rcu_dereference(bareudp->sock); dst = udp_tunnel6_dst_lookup(skb, dev, bareudp->net, sock, 0, &saddr, &info->key, sport, bareudp->port, info->key.tos, use_cache ? &info->dst_cache : NULL); if (IS_ERR(dst)) return PTR_ERR(dst); dst_release(dst); info->key.u.ipv6.src = saddr; } else { return -EINVAL; } info->key.tp_src = sport; info->key.tp_dst = bareudp->port; return 0; } static const struct net_device_ops bareudp_netdev_ops = { .ndo_init = bareudp_init, .ndo_uninit = bareudp_uninit, .ndo_open = bareudp_open, .ndo_stop = bareudp_stop, .ndo_start_xmit = bareudp_xmit, .ndo_fill_metadata_dst = bareudp_fill_metadata_dst, }; static const struct nla_policy bareudp_policy[IFLA_BAREUDP_MAX + 1] = { [IFLA_BAREUDP_PORT] = { .type = NLA_U16 }, [IFLA_BAREUDP_ETHERTYPE] = { .type = NLA_U16 }, [IFLA_BAREUDP_SRCPORT_MIN] = { .type = NLA_U16 }, [IFLA_BAREUDP_MULTIPROTO_MODE] = { .type = NLA_FLAG }, }; /* Info for udev, that this is a virtual tunnel endpoint */ static const struct device_type bareudp_type = { .name = "bareudp", }; /* Initialize the device structure. 
*/ static void bareudp_setup(struct net_device *dev) { dev->netdev_ops = &bareudp_netdev_ops; dev->needs_free_netdev = true; SET_NETDEV_DEVTYPE(dev, &bareudp_type); dev->features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_FRAGLIST; dev->features |= NETIF_F_RXCSUM; dev->features |= NETIF_F_GSO_SOFTWARE; dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_FRAGLIST; dev->hw_features |= NETIF_F_RXCSUM; dev->hw_features |= NETIF_F_GSO_SOFTWARE; dev->hard_header_len = 0; dev->addr_len = 0; dev->mtu = ETH_DATA_LEN; dev->min_mtu = IPV4_MIN_MTU; dev->max_mtu = IP_MAX_MTU - BAREUDP_BASE_HLEN; dev->type = ARPHRD_NONE; netif_keep_dst(dev); dev->priv_flags |= IFF_NO_QUEUE; dev->lltx = true; dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST; dev->pcpu_stat_type = NETDEV_PCPU_STAT_DSTATS; } static int bareudp_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { if (!data) { NL_SET_ERR_MSG(extack, "Not enough attributes provided to perform the operation"); return -EINVAL; } return 0; } static int bareudp2info(struct nlattr *data[], struct bareudp_conf *conf, struct netlink_ext_ack *extack) { memset(conf, 0, sizeof(*conf)); if (!data[IFLA_BAREUDP_PORT]) { NL_SET_ERR_MSG(extack, "port not specified"); return -EINVAL; } if (!data[IFLA_BAREUDP_ETHERTYPE]) { NL_SET_ERR_MSG(extack, "ethertype not specified"); return -EINVAL; } conf->port = nla_get_u16(data[IFLA_BAREUDP_PORT]); conf->ethertype = nla_get_u16(data[IFLA_BAREUDP_ETHERTYPE]); if (data[IFLA_BAREUDP_SRCPORT_MIN]) conf->sport_min = nla_get_u16(data[IFLA_BAREUDP_SRCPORT_MIN]); if (data[IFLA_BAREUDP_MULTIPROTO_MODE]) conf->multi_proto_mode = true; return 0; } static struct bareudp_dev *bareudp_find_dev(struct bareudp_net *bn, const struct bareudp_conf *conf) { struct bareudp_dev *bareudp, *t = NULL; list_for_each_entry(bareudp, &bn->bareudp_list, next) { if (conf->port == bareudp->port) t = bareudp; } return t; } static int bareudp_configure(struct net *net, struct net_device *dev, 
struct bareudp_conf *conf, struct netlink_ext_ack *extack) { struct bareudp_net *bn = net_generic(net, bareudp_net_id); struct bareudp_dev *t, *bareudp = netdev_priv(dev); int err; bareudp->net = net; bareudp->dev = dev; t = bareudp_find_dev(bn, conf); if (t) { NL_SET_ERR_MSG(extack, "Another bareudp device using the same port already exists"); return -EBUSY; } if (conf->multi_proto_mode && (conf->ethertype != htons(ETH_P_MPLS_UC) && conf->ethertype != htons(ETH_P_IP))) { NL_SET_ERR_MSG(extack, "Cannot set multiproto mode for this ethertype (only IPv4 and unicast MPLS are supported)"); return -EINVAL; } bareudp->port = conf->port; bareudp->ethertype = conf->ethertype; bareudp->sport_min = conf->sport_min; bareudp->multi_proto_mode = conf->multi_proto_mode; err = register_netdevice(dev); if (err) return err; list_add(&bareudp->next, &bn->bareudp_list); return 0; } static int bareudp_link_config(struct net_device *dev, struct nlattr *tb[]) { int err; if (tb[IFLA_MTU]) { err = dev_set_mtu(dev, nla_get_u32(tb[IFLA_MTU])); if (err) return err; } return 0; } static void bareudp_dellink(struct net_device *dev, struct list_head *head) { struct bareudp_dev *bareudp = netdev_priv(dev); list_del(&bareudp->next); unregister_netdevice_queue(dev, head); } static int bareudp_newlink(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct net *link_net = rtnl_newlink_link_net(params); struct nlattr **data = params->data; struct nlattr **tb = params->tb; struct bareudp_conf conf; int err; err = bareudp2info(data, &conf, extack); if (err) return err; err = bareudp_configure(link_net, dev, &conf, extack); if (err) return err; err = bareudp_link_config(dev, tb); if (err) goto err_unconfig; return 0; err_unconfig: bareudp_dellink(dev, NULL); return err; } static size_t bareudp_get_size(const struct net_device *dev) { return nla_total_size(sizeof(__be16)) + /* IFLA_BAREUDP_PORT */ nla_total_size(sizeof(__be16)) + /* IFLA_BAREUDP_ETHERTYPE */ 
nla_total_size(sizeof(__u16)) + /* IFLA_BAREUDP_SRCPORT_MIN */ nla_total_size(0) + /* IFLA_BAREUDP_MULTIPROTO_MODE */ 0; } static int bareudp_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct bareudp_dev *bareudp = netdev_priv(dev); if (nla_put_be16(skb, IFLA_BAREUDP_PORT, bareudp->port)) goto nla_put_failure; if (nla_put_be16(skb, IFLA_BAREUDP_ETHERTYPE, bareudp->ethertype)) goto nla_put_failure; if (nla_put_u16(skb, IFLA_BAREUDP_SRCPORT_MIN, bareudp->sport_min)) goto nla_put_failure; if (bareudp->multi_proto_mode && nla_put_flag(skb, IFLA_BAREUDP_MULTIPROTO_MODE)) goto nla_put_failure; return 0; nla_put_failure: return -EMSGSIZE; } static struct rtnl_link_ops bareudp_link_ops __read_mostly = { .kind = "bareudp", .maxtype = IFLA_BAREUDP_MAX, .policy = bareudp_policy, .priv_size = sizeof(struct bareudp_dev), .setup = bareudp_setup, .validate = bareudp_validate, .newlink = bareudp_newlink, .dellink = bareudp_dellink, .get_size = bareudp_get_size, .fill_info = bareudp_fill_info, }; static __net_init int bareudp_init_net(struct net *net) { struct bareudp_net *bn = net_generic(net, bareudp_net_id); INIT_LIST_HEAD(&bn->bareudp_list); return 0; } static void bareudp_destroy_tunnels(struct net *net, struct list_head *head) { struct bareudp_net *bn = net_generic(net, bareudp_net_id); struct bareudp_dev *bareudp, *next; list_for_each_entry_safe(bareudp, next, &bn->bareudp_list, next) unregister_netdevice_queue(bareudp->dev, head); } static void __net_exit bareudp_exit_batch_rtnl(struct list_head *net_list, struct list_head *dev_kill_list) { struct net *net; list_for_each_entry(net, net_list, exit_list) bareudp_destroy_tunnels(net, dev_kill_list); } static struct pernet_operations bareudp_net_ops = { .init = bareudp_init_net, .exit_batch_rtnl = bareudp_exit_batch_rtnl, .id = &bareudp_net_id, .size = sizeof(struct bareudp_net), }; static int __init bareudp_init_module(void) { int rc; rc = register_pernet_subsys(&bareudp_net_ops); if (rc) goto out1; rc = 
rtnl_link_register(&bareudp_link_ops); if (rc) goto out2; return 0; out2: unregister_pernet_subsys(&bareudp_net_ops); out1: return rc; } late_initcall(bareudp_init_module); static void __exit bareudp_cleanup_module(void) { rtnl_link_unregister(&bareudp_link_ops); unregister_pernet_subsys(&bareudp_net_ops); } module_exit(bareudp_cleanup_module); MODULE_ALIAS_RTNL_LINK("bareudp"); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Martin Varghese <martin.varghese@nokia.com>"); MODULE_DESCRIPTION("Interface driver for UDP encapsulated traffic"); |
18 175 2924 163 163 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/bad_inode.c * * Copyright (C) 1997, Stephen Tweedie * * Provide stub functions for unreadable inodes * * Fabian Frederick : August 2003 - All file operations assigned to EIO */ #include <linux/fs.h> #include <linux/export.h> #include <linux/stat.h> #include <linux/time.h> #include <linux/namei.h> #include <linux/poll.h> #include <linux/fiemap.h> static int bad_file_open(struct inode *inode, struct file *filp) { return -EIO; } static const struct file_operations bad_file_ops = { .open = bad_file_open, }; static int bad_inode_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { return -EIO; } static struct dentry *bad_inode_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { return ERR_PTR(-EIO); } static int bad_inode_link (struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { return -EIO; } static int bad_inode_unlink(struct inode *dir, struct dentry *dentry) { return -EIO; } static int bad_inode_symlink(struct mnt_idmap *idmap, struct 
inode *dir, struct dentry *dentry, const char *symname) { return -EIO; } static struct dentry *bad_inode_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { return ERR_PTR(-EIO); } static int bad_inode_rmdir (struct inode *dir, struct dentry *dentry) { return -EIO; } static int bad_inode_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { return -EIO; } static int bad_inode_rename2(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { return -EIO; } static int bad_inode_readlink(struct dentry *dentry, char __user *buffer, int buflen) { return -EIO; } static int bad_inode_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { return -EIO; } static int bad_inode_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { return -EIO; } static int bad_inode_setattr(struct mnt_idmap *idmap, struct dentry *direntry, struct iattr *attrs) { return -EIO; } static ssize_t bad_inode_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) { return -EIO; } static const char *bad_inode_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { return ERR_PTR(-EIO); } static struct posix_acl *bad_inode_get_acl(struct inode *inode, int type, bool rcu) { return ERR_PTR(-EIO); } static int bad_inode_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { return -EIO; } static int bad_inode_update_time(struct inode *inode, int flags) { return -EIO; } static int bad_inode_atomic_open(struct inode *inode, struct dentry *dentry, struct file *file, unsigned int open_flag, umode_t create_mode) { return -EIO; } static int bad_inode_tmpfile(struct mnt_idmap *idmap, struct inode *inode, struct file *file, umode_t mode) { return -EIO; } static int bad_inode_set_acl(struct 
mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { return -EIO; } static const struct inode_operations bad_inode_ops = { .create = bad_inode_create, .lookup = bad_inode_lookup, .link = bad_inode_link, .unlink = bad_inode_unlink, .symlink = bad_inode_symlink, .mkdir = bad_inode_mkdir, .rmdir = bad_inode_rmdir, .mknod = bad_inode_mknod, .rename = bad_inode_rename2, .readlink = bad_inode_readlink, .permission = bad_inode_permission, .getattr = bad_inode_getattr, .setattr = bad_inode_setattr, .listxattr = bad_inode_listxattr, .get_link = bad_inode_get_link, .get_inode_acl = bad_inode_get_acl, .fiemap = bad_inode_fiemap, .update_time = bad_inode_update_time, .atomic_open = bad_inode_atomic_open, .tmpfile = bad_inode_tmpfile, .set_acl = bad_inode_set_acl, }; /* * When a filesystem is unable to read an inode due to an I/O error in * its read_inode() function, it can call make_bad_inode() to return a * set of stubs which will return EIO errors as required. * * We only need to do limited initialisation: all other fields are * preinitialised to zero automatically. */ /** * make_bad_inode - mark an inode bad due to an I/O error * @inode: Inode to mark bad * * When an inode cannot be read due to a media or remote network * failure this function makes the inode "bad" and causes I/O operations * on it to fail from this point on. */ void make_bad_inode(struct inode *inode) { remove_inode_hash(inode); inode->i_mode = S_IFREG; simple_inode_init_ts(inode); inode->i_op = &bad_inode_ops; inode->i_opflags &= ~IOP_XATTR; inode->i_fop = &bad_file_ops; } EXPORT_SYMBOL(make_bad_inode); /* * This tests whether an inode has been flagged as bad. The test uses * &bad_inode_ops to cover the case of invalidated inodes as well as * those created by make_bad_inode() above. */ /** * is_bad_inode - is an inode errored * @inode: inode to test * * Returns true if the inode in question has been marked as bad. 
*/ bool is_bad_inode(struct inode *inode) { return (inode->i_op == &bad_inode_ops); } EXPORT_SYMBOL(is_bad_inode); /** * iget_failed - Mark an under-construction inode as dead and release it * @inode: The inode to discard * * Mark an under-construction inode as dead and release it. */ void iget_failed(struct inode *inode) { make_bad_inode(inode); unlock_new_inode(inode); iput(inode); } EXPORT_SYMBOL(iget_failed); |
149 18 138 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NF_CONNTRACK_TIMEOUT_H #define _NF_CONNTRACK_TIMEOUT_H #include <net/net_namespace.h> #include <linux/netfilter/nf_conntrack_common.h> #include <linux/netfilter/nf_conntrack_tuple_common.h> #include <linux/refcount.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_extend.h> #define CTNL_TIMEOUT_NAME_MAX 32 struct nf_ct_timeout { __u16 l3num; const struct nf_conntrack_l4proto *l4proto; char data[]; }; struct nf_conn_timeout { struct nf_ct_timeout __rcu *timeout; }; static inline unsigned int * nf_ct_timeout_data(const struct nf_conn_timeout *t) { #ifdef CONFIG_NF_CONNTRACK_TIMEOUT struct nf_ct_timeout *timeout; timeout = rcu_dereference(t->timeout); if (timeout == NULL) return NULL; return (unsigned int *)timeout->data; #else return NULL; #endif } static inline struct nf_conn_timeout *nf_ct_timeout_find(const struct nf_conn *ct) { #ifdef CONFIG_NF_CONNTRACK_TIMEOUT return nf_ct_ext_find(ct, NF_CT_EXT_TIMEOUT); #else return NULL; #endif } static inline struct nf_conn_timeout *nf_ct_timeout_ext_add(struct nf_conn *ct, struct nf_ct_timeout *timeout, gfp_t gfp) { #ifdef CONFIG_NF_CONNTRACK_TIMEOUT struct nf_conn_timeout *timeout_ext; timeout_ext = nf_ct_ext_add(ct, NF_CT_EXT_TIMEOUT, gfp); if (timeout_ext == NULL) return NULL; rcu_assign_pointer(timeout_ext->timeout, timeout); return timeout_ext; #else return NULL; #endif }; static inline unsigned int *nf_ct_timeout_lookup(const struct nf_conn *ct) { unsigned int *timeouts = NULL; #ifdef CONFIG_NF_CONNTRACK_TIMEOUT struct nf_conn_timeout *timeout_ext; timeout_ext = nf_ct_timeout_find(ct); if 
(timeout_ext) timeouts = nf_ct_timeout_data(timeout_ext); #endif return timeouts; } #ifdef CONFIG_NF_CONNTRACK_TIMEOUT void nf_ct_untimeout(struct net *net, struct nf_ct_timeout *timeout); int nf_ct_set_timeout(struct net *net, struct nf_conn *ct, u8 l3num, u8 l4num, const char *timeout_name); void nf_ct_destroy_timeout(struct nf_conn *ct); #else static inline int nf_ct_set_timeout(struct net *net, struct nf_conn *ct, u8 l3num, u8 l4num, const char *timeout_name) { return -EOPNOTSUPP; } static inline void nf_ct_destroy_timeout(struct nf_conn *ct) { return; } #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ #ifdef CONFIG_NF_CONNTRACK_TIMEOUT struct nf_ct_timeout_hooks { struct nf_ct_timeout *(*timeout_find_get)(struct net *net, const char *name); void (*timeout_put)(struct nf_ct_timeout *timeout); }; extern const struct nf_ct_timeout_hooks __rcu *nf_ct_timeout_hook; #endif #endif /* _NF_CONNTRACK_TIMEOUT_H */ |
1322 251 2338 2002 2006 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PGALLOC_H #define _ASM_X86_PGALLOC_H #include <linux/threads.h> #include <linux/mm.h> /* for struct page */ #include <linux/pagemap.h> #include <asm/cpufeature.h> #define __HAVE_ARCH_PTE_ALLOC_ONE #define __HAVE_ARCH_PGD_FREE #include <asm-generic/pgalloc.h> static inline int __paravirt_pgd_alloc(struct mm_struct *mm) { return 0; } #ifdef CONFIG_PARAVIRT_XXL #include <asm/paravirt.h> #else #define paravirt_pgd_alloc(mm) __paravirt_pgd_alloc(mm) static inline void paravirt_pgd_free(struct mm_struct *mm, pgd_t *pgd) {} static inline void paravirt_alloc_pte(struct mm_struct *mm, unsigned long pfn) {} static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn) {} static inline void paravirt_alloc_pmd_clone(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count) {} static inline void paravirt_alloc_pud(struct mm_struct *mm, unsigned long pfn) {} static inline void paravirt_alloc_p4d(struct mm_struct *mm, unsigned long pfn) {} static inline void paravirt_release_pte(unsigned long pfn) {} static inline void paravirt_release_pmd(unsigned long pfn) {} static inline void paravirt_release_pud(unsigned long pfn) {} static inline void paravirt_release_p4d(unsigned long pfn) {} #endif /* * In case of Page Table Isolation active, we acquire two PGDs instead of one. * Being order-1, it is both 8k in size and 8k-aligned. 
That lets us just * flip bit 12 in a pointer to swap between the two 4k halves. */ static inline unsigned int pgd_allocation_order(void) { if (cpu_feature_enabled(X86_FEATURE_PTI)) return 1; return 0; } /* * Allocate and free page tables. */ extern pgd_t *pgd_alloc(struct mm_struct *); extern void pgd_free(struct mm_struct *mm, pgd_t *pgd); extern pgtable_t pte_alloc_one(struct mm_struct *); extern void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte); static inline void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte, unsigned long address) { ___pte_free_tlb(tlb, pte); } static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte) { paravirt_alloc_pte(mm, __pa(pte) >> PAGE_SHIFT); set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE)); } static inline void pmd_populate_kernel_safe(struct mm_struct *mm, pmd_t *pmd, pte_t *pte) { paravirt_alloc_pte(mm, __pa(pte) >> PAGE_SHIFT); set_pmd_safe(pmd, __pmd(__pa(pte) | _PAGE_TABLE)); } static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte) { unsigned long pfn = page_to_pfn(pte); paravirt_alloc_pte(mm, pfn); set_pmd(pmd, __pmd(((pteval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE)); } #if CONFIG_PGTABLE_LEVELS > 2 extern void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd); static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd, unsigned long address) { ___pmd_free_tlb(tlb, pmd); } #ifdef CONFIG_X86_PAE extern void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd); #else /* !CONFIG_X86_PAE */ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) { paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT); set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd))); } static inline void pud_populate_safe(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) { paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT); set_pud_safe(pud, __pud(_PAGE_TABLE | __pa(pmd))); } #endif /* CONFIG_X86_PAE */ #if CONFIG_PGTABLE_LEVELS > 3 static inline void p4d_populate(struct 
mm_struct *mm, p4d_t *p4d, pud_t *pud) { paravirt_alloc_pud(mm, __pa(pud) >> PAGE_SHIFT); set_p4d(p4d, __p4d(_PAGE_TABLE | __pa(pud))); } static inline void p4d_populate_safe(struct mm_struct *mm, p4d_t *p4d, pud_t *pud) { paravirt_alloc_pud(mm, __pa(pud) >> PAGE_SHIFT); set_p4d_safe(p4d, __p4d(_PAGE_TABLE | __pa(pud))); } extern void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud); static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud, unsigned long address) { ___pud_free_tlb(tlb, pud); } #if CONFIG_PGTABLE_LEVELS > 4 static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, p4d_t *p4d) { if (!pgtable_l5_enabled()) return; paravirt_alloc_p4d(mm, __pa(p4d) >> PAGE_SHIFT); set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(p4d))); } static inline void pgd_populate_safe(struct mm_struct *mm, pgd_t *pgd, p4d_t *p4d) { if (!pgtable_l5_enabled()) return; paravirt_alloc_p4d(mm, __pa(p4d) >> PAGE_SHIFT); set_pgd_safe(pgd, __pgd(_PAGE_TABLE | __pa(p4d))); } extern void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d); static inline void __p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d, unsigned long address) { if (pgtable_l5_enabled()) ___p4d_free_tlb(tlb, p4d); } #endif /* CONFIG_PGTABLE_LEVELS > 4 */ #endif /* CONFIG_PGTABLE_LEVELS > 3 */ #endif /* CONFIG_PGTABLE_LEVELS > 2 */ #endif /* _ASM_X86_PGALLOC_H */ |
22 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _PFXLEN_H #define _PFXLEN_H #include <asm/byteorder.h> #include <linux/netfilter.h> #include <net/tcp.h> /* Prefixlen maps, by Jan Engelhardt */ extern const union nf_inet_addr ip_set_netmask_map[]; extern const union nf_inet_addr ip_set_hostmask_map[]; static inline __be32 ip_set_netmask(u8 pfxlen) { return ip_set_netmask_map[pfxlen].ip; } static inline const __be32 * ip_set_netmask6(u8 pfxlen) { return &ip_set_netmask_map[pfxlen].ip6[0]; } static inline u32 ip_set_hostmask(u8 pfxlen) { return (__force u32) ip_set_hostmask_map[pfxlen].ip; } static inline const __be32 * ip_set_hostmask6(u8 pfxlen) { return &ip_set_hostmask_map[pfxlen].ip6[0]; } extern u32 ip_set_range_to_cidr(u32 from, u32 to, u8 *cidr); #define ip_set_mask_from_to(from, to, cidr) \ do { \ from &= ip_set_hostmask(cidr); \ to = from | ~ip_set_hostmask(cidr); \ } while (0) static inline void ip6_netmask(union nf_inet_addr *ip, u8 prefix) { ip->ip6[0] &= ip_set_netmask6(prefix)[0]; ip->ip6[1] &= ip_set_netmask6(prefix)[1]; ip->ip6[2] &= ip_set_netmask6(prefix)[2]; ip->ip6[3] &= ip_set_netmask6(prefix)[3]; } #endif /*_PFXLEN_H */ |
3 23 1 18 4 22 3 10 13 10 10 9 8 8 2 7 2 1 6 9 22 14 15 9 6 15 1 7 14 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 | // SPDX-License-Identifier: GPL-2.0-or-later /* * udp_diag.c Module for monitoring UDP transport protocols sockets. 
*
 * Authors: Pavel Emelyanov, <xemul@parallels.com>
 */

#include <linux/module.h>
#include <linux/inet_diag.h>
#include <linux/udp.h>
#include <net/udp.h>
#include <net/udplite.h>
#include <linux/sock_diag.h>

/*
 * Emit one socket into a multi-part dump.
 *
 * Returns 0 without writing anything when inet_diag_bc_sk() rejects @sk for
 * the filter attribute @bc; otherwise returns the result of
 * inet_sk_diag_fill(), which is called with NLM_F_MULTI.
 */
static int sk_diag_dump(struct sock *sk, struct sk_buff *skb,
			struct netlink_callback *cb,
			const struct inet_diag_req_v2 *req,
			struct nlattr *bc, bool net_admin)
{
	if (!inet_diag_bc_sk(bc, sk))
		return 0;

	return inet_sk_diag_fill(sk, NULL, skb, cb, req, NLM_F_MULTI,
				 net_admin);
}

/*
 * Look up the single socket described by @req in @tbl and unicast one
 * diag message about it back to the requester.
 *
 * Returns -ENOENT when no socket is found (or its refcount is already zero),
 * the sock_diag_check_cookie() error on a cookie mismatch, -ENOMEM when the
 * reply skb cannot be allocated, a negative inet_sk_diag_fill() error, or
 * otherwise the result of nlmsg_unicast().
 */
static int udp_dump_one(struct udp_table *tbl,
			struct netlink_callback *cb,
			const struct inet_diag_req_v2 *req)
{
	struct sk_buff *in_skb = cb->skb;
	int err;
	struct sock *sk = NULL;
	struct sk_buff *rep;
	struct net *net = sock_net(in_skb->sk);

	rcu_read_lock();
	if (req->sdiag_family == AF_INET)
		/* src and dst are swapped for historical reasons */
		sk = __udp4_lib_lookup(net,
				req->id.idiag_src[0], req->id.idiag_sport,
				req->id.idiag_dst[0], req->id.idiag_dport,
				req->id.idiag_if, 0, tbl, NULL);
#if IS_ENABLED(CONFIG_IPV6)
	else if (req->sdiag_family == AF_INET6)
		sk = __udp6_lib_lookup(net,
				(struct in6_addr *)req->id.idiag_src,
				req->id.idiag_sport,
				(struct in6_addr *)req->id.idiag_dst,
				req->id.idiag_dport,
				req->id.idiag_if, 0, tbl, NULL);
#endif
	/* Take a reference before leaving the RCU section; drop the hit if
	 * the refcount has already reached zero.
	 */
	if (sk && !refcount_inc_not_zero(&sk->sk_refcnt))
		sk = NULL;
	rcu_read_unlock();
	err = -ENOENT;
	if (!sk)
		goto out_nosk;

	err = sock_diag_check_cookie(sk, req->id.idiag_cookie);
	if (err)
		goto out;

	err = -ENOMEM;
	rep = nlmsg_new(nla_total_size(sizeof(struct inet_diag_msg)) +
			inet_diag_msg_attrs_size() +
			nla_total_size(sizeof(struct inet_diag_meminfo)) + 64,
			GFP_KERNEL);
	if (!rep)
		goto out;

	err = inet_sk_diag_fill(sk, NULL, rep, cb, req, 0,
				netlink_net_capable(in_skb, CAP_NET_ADMIN));
	if (err < 0) {
		/* -EMSGSIZE here means the size estimate above was too small */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(rep);
		goto out;
	}
	err = nlmsg_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid);

out:
	/* Drop the reference taken under RCU above */
	if (sk)
		sock_put(sk);
out_nosk:
	return err;
}

/*
 * Walk every hash slot of @table and dump the sockets that match @r.
 *
 * The walk is resumable: cb->args[0] holds the slot to start from and
 * cb->args[1] the number of entries of that slot that were already handled.
 * Both are written back on exit, so a later call continues where
 * sk_diag_dump() last failed.
 */
static void udp_dump(struct udp_table *table, struct sk_buff *skb,
		     struct netlink_callback *cb,
		     const struct inet_diag_req_v2 *r)
{
	bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
	struct net *net = sock_net(skb->sk);
	struct inet_diag_dump_data *cb_data;
	int num, s_num, slot, s_slot;
	struct nlattr *bc;

	cb_data = cb->data;
	bc = cb_data->inet_diag_nla_bc;
	s_slot = cb->args[0];
	num = s_num = cb->args[1];

	/* s_num only applies to the first slot visited; it is reset to 0 by
	 * the loop increment for every following slot.
	 */
	for (slot = s_slot; slot <= table->mask; s_num = 0, slot++) {
		struct udp_hslot *hslot = &table->hash[slot];
		struct sock *sk;

		num = 0;

		if (hlist_empty(&hslot->head))
			continue;

		spin_lock_bh(&hslot->lock);
		sk_for_each(sk, &hslot->head) {
			struct inet_sock *inet = inet_sk(sk);

			/* Sockets of other namespaces are not counted in num */
			if (!net_eq(sock_net(sk), net))
				continue;
			/* Already handled by a previous call */
			if (num < s_num)
				goto next;
			if (!(r->idiag_states & (1 << sk->sk_state)))
				goto next;
			if (r->sdiag_family != AF_UNSPEC &&
					sk->sk_family != r->sdiag_family)
				goto next;
			/* A zero port in the request matches any port */
			if (r->id.idiag_sport != inet->inet_sport &&
			    r->id.idiag_sport)
				goto next;
			if (r->id.idiag_dport != inet->inet_dport &&
			    r->id.idiag_dport)
				goto next;

			if (sk_diag_dump(sk, skb, cb, r, bc, net_admin) < 0) {
				spin_unlock_bh(&hslot->lock);
				goto done;
			}
next:
			num++;
		}
		spin_unlock_bh(&hslot->lock);
	}
done:
	cb->args[0] = slot;
	cb->args[1] = num;
}

/* inet_diag_handler.dump for UDP: dump the requester's per-netns UDP table. */
static void udp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
			  const struct inet_diag_req_v2 *r)
{
	udp_dump(sock_net(cb->skb->sk)->ipv4.udp_table, skb, cb, r);
}

/* inet_diag_handler.dump_one for UDP: single lookup in the per-netns table. */
static int udp_diag_dump_one(struct netlink_callback *cb,
			     const struct inet_diag_req_v2 *req)
{
	return udp_dump_one(sock_net(cb->skb->sk)->ipv4.udp_table, cb, req);
}

/*
 * Fill the queue fields of @r from udp_rqueue_get() and sk_wmem_alloc_get().
 * @info is not used.
 */
static void udp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
		void *info)
{
	r->idiag_rqueue = udp_rqueue_get(sk);
	r->idiag_wqueue = sk_wmem_alloc_get(sk);
}

#ifdef CONFIG_INET_DIAG_DESTROY
/*
 * Find the socket described by @req in @tbl and destroy it with
 * sock_diag_destroy(sk, ECONNABORTED).
 *
 * An AF_INET6 request whose source and destination are both v4-mapped is
 * looked up in the IPv4 path using the last 32-bit word of each address.
 *
 * Returns -EINVAL for an unsupported family, -ENOENT when no socket is found
 * or the cookie does not match, otherwise the sock_diag_destroy() result.
 */
static int __udp_diag_destroy(struct sk_buff *in_skb,
			      const struct inet_diag_req_v2 *req,
			      struct udp_table *tbl)
{
	struct net *net = sock_net(in_skb->sk);
	struct sock *sk;
	int err;

	rcu_read_lock();

	if (req->sdiag_family == AF_INET)
		sk = __udp4_lib_lookup(net,
				req->id.idiag_dst[0], req->id.idiag_dport,
				req->id.idiag_src[0], req->id.idiag_sport,
				req->id.idiag_if, 0, tbl, NULL);
#if IS_ENABLED(CONFIG_IPV6)
	else if (req->sdiag_family == AF_INET6) {
		if (ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_dst) &&
		    ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_src))
			sk = __udp4_lib_lookup(net,
					req->id.idiag_dst[3], req->id.idiag_dport,
					req->id.idiag_src[3], req->id.idiag_sport,
					req->id.idiag_if, 0, tbl, NULL);

		else
			sk = __udp6_lib_lookup(net,
					(struct in6_addr *)req->id.idiag_dst,
					req->id.idiag_dport,
					(struct in6_addr *)req->id.idiag_src,
					req->id.idiag_sport,
					req->id.idiag_if, 0, tbl, NULL);
	}
#endif
	else {
		rcu_read_unlock();
		return -EINVAL;
	}

	/* Pin the socket before leaving the RCU section */
	if (sk && !refcount_inc_not_zero(&sk->sk_refcnt))
		sk = NULL;

	rcu_read_unlock();

	if (!sk)
		return -ENOENT;

	if (sock_diag_check_cookie(sk, req->id.idiag_cookie)) {
		sock_put(sk);
		return -ENOENT;
	}

	err = sock_diag_destroy(sk, ECONNABORTED);

	sock_put(sk);

	return err;
}

/* inet_diag_handler.destroy for UDP: operate on the per-netns UDP table. */
static int udp_diag_destroy(struct sk_buff *in_skb,
			    const struct inet_diag_req_v2 *req)
{
	return __udp_diag_destroy(in_skb, req, sock_net(in_skb->sk)->ipv4.udp_table);
}

/* inet_diag_handler.destroy for UDP-Lite: operate on udplite_table. */
static int udplite_diag_destroy(struct sk_buff *in_skb,
				const struct inet_diag_req_v2 *req)
{
	return __udp_diag_destroy(in_skb, req, &udplite_table);
}

#endif

/* Handler registered for IPPROTO_UDP; .destroy only with CONFIG_INET_DIAG_DESTROY. */
static const struct inet_diag_handler udp_diag_handler = {
	.owner		 = THIS_MODULE,
	.dump		 = udp_diag_dump,
	.dump_one	 = udp_diag_dump_one,
	.idiag_get_info  = udp_diag_get_info,
	.idiag_type	 = IPPROTO_UDP,
	.idiag_info_size = 0,
#ifdef CONFIG_INET_DIAG_DESTROY
	.destroy	 = udp_diag_destroy,
#endif
};

/* inet_diag_handler.dump for UDP-Lite: dump udplite_table. */
static void udplite_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
			      const struct inet_diag_req_v2 *r)
{
	udp_dump(&udplite_table, skb, cb, r);
}

/* inet_diag_handler.dump_one for UDP-Lite: single lookup in udplite_table. */
static int udplite_diag_dump_one(struct netlink_callback *cb,
				 const struct inet_diag_req_v2 *req)
{
	return udp_dump_one(&udplite_table, cb, req);
}

/* Handler registered for IPPROTO_UDPLITE; shares udp_diag_get_info with UDP. */
static const struct inet_diag_handler udplite_diag_handler = {
	.owner		 = THIS_MODULE,
	.dump		 = udplite_diag_dump,
	.dump_one	 = udplite_diag_dump_one,
	.idiag_get_info  = udp_diag_get_info,
	.idiag_type	 = IPPROTO_UDPLITE,
	.idiag_info_size = 0,
#ifdef CONFIG_INET_DIAG_DESTROY
	.destroy	 = udplite_diag_destroy,
#endif
};

/*
 * Register the UDP handler, then the UDP-Lite handler. If the second
 * registration fails the first one is unregistered again before the error
 * is returned.
 */
static int __init udp_diag_init(void)
{
	int err;

	err = inet_diag_register(&udp_diag_handler);
	if (err)
		goto out;
	err = inet_diag_register(&udplite_diag_handler);
	if (err)
		goto out_lite;
out:
	return err;
out_lite:
	inet_diag_unregister(&udp_diag_handler);
	goto out;
}

/* Unregister both handlers in the reverse order of registration. */
static void __exit udp_diag_exit(void)
{
	inet_diag_unregister(&udplite_diag_handler);
	inet_diag_unregister(&udp_diag_handler);
}

module_init(udp_diag_init);
module_exit(udp_diag_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("UDP socket monitoring via SOCK_DIAG");
MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-17 /* AF_INET - IPPROTO_UDP */);
MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-136 /* AF_INET - IPPROTO_UDPLITE */);
|
314 64 257 316 27 15 15 52 18 13 13 76 2 4 4 52 47 4 31 8 18 7 18 52 14 23 67 3601 2514 1 1 144 1530 60 798 63 782 776 782 780 1 390 376 270 165 320 780 781 33 2 32 2 2 71 71 1 317 318 317 317 319 319 2 317 318 318 317 30 294 5 5 3 2 3 2 3 3 3 3 304 304 304 301 275 29 304 13 2 11 11 2 11 304 29 277 304 7 42 16 16 3 1 12 2 6 3 7 1 5 5 3 3 3 3 2 5 1 5 5 5 3 9 9 1 1 7 2 2 3 1 1 3 1 2 2 1 1 1 18 18 18 44 43 5 2 37 37 36 32 13 30 11 8 3 13 26 7 9 25 29 8 1 36 23 32 5 35 35 14 17 17 17 7 15 17 17 26 2 8 6 2 17 13 18 18 37 36 36 26 26 5 4 13 4 4 2 98 2 99 100 97 96 97 1 1 16 9 1 44 26 4064 3943 396 1253 1240 1240 1019 1004 1003 716 354 865 293 1184 3 469 469 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 
362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 
862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 
1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 
1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 | // SPDX-License-Identifier: GPL-2.0-or-later /* * 
Neighbour Discovery for IPv6 * Linux INET6 implementation * * Authors: * Pedro Roque <roque@di.fc.ul.pt> * Mike Shaver <shaver@ingenia.com> */ /* * Changes: * * Alexey I. Froloff : RFC6106 (DNSSL) support * Pierre Ynard : export userland ND options * through netlink (RDNSS support) * Lars Fenneberg : fixed MTU setting on receipt * of an RA. * Janos Farkas : kmalloc failure checks * Alexey Kuznetsov : state machine reworked * and moved to net/core. * Pekka Savola : RFC2461 validation * YOSHIFUJI Hideaki @USAGI : Verify ND options properly */ #define pr_fmt(fmt) "ICMPv6: " fmt #include <linux/module.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/sched.h> #include <linux/net.h> #include <linux/in6.h> #include <linux/route.h> #include <linux/init.h> #include <linux/rcupdate.h> #include <linux/slab.h> #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif #include <linux/if_addr.h> #include <linux/if_ether.h> #include <linux/if_arp.h> #include <linux/ipv6.h> #include <linux/icmpv6.h> #include <linux/jhash.h> #include <net/sock.h> #include <net/snmp.h> #include <net/ipv6.h> #include <net/protocol.h> #include <net/ndisc.h> #include <net/ip6_route.h> #include <net/addrconf.h> #include <net/icmp.h> #include <net/netlink.h> #include <linux/rtnetlink.h> #include <net/flow.h> #include <net/ip6_checksum.h> #include <net/inet_common.h> #include <linux/proc_fs.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv6.h> static u32 ndisc_hash(const void *pkey, const struct net_device *dev, __u32 *hash_rnd); static bool ndisc_key_eq(const struct neighbour *neigh, const void *pkey); static bool ndisc_allow_add(const struct net_device *dev, struct netlink_ext_ack *extack); static int ndisc_constructor(struct neighbour *neigh); static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb); static void ndisc_error_report(struct neighbour *neigh, struct sk_buff *skb); static int 
pndisc_constructor(struct pneigh_entry *n); static void pndisc_destructor(struct pneigh_entry *n); static void pndisc_redo(struct sk_buff *skb); static int ndisc_is_multicast(const void *pkey); static const struct neigh_ops ndisc_generic_ops = { .family = AF_INET6, .solicit = ndisc_solicit, .error_report = ndisc_error_report, .output = neigh_resolve_output, .connected_output = neigh_connected_output, }; static const struct neigh_ops ndisc_hh_ops = { .family = AF_INET6, .solicit = ndisc_solicit, .error_report = ndisc_error_report, .output = neigh_resolve_output, .connected_output = neigh_resolve_output, }; static const struct neigh_ops ndisc_direct_ops = { .family = AF_INET6, .output = neigh_direct_output, .connected_output = neigh_direct_output, }; struct neigh_table nd_tbl = { .family = AF_INET6, .key_len = sizeof(struct in6_addr), .protocol = cpu_to_be16(ETH_P_IPV6), .hash = ndisc_hash, .key_eq = ndisc_key_eq, .constructor = ndisc_constructor, .pconstructor = pndisc_constructor, .pdestructor = pndisc_destructor, .proxy_redo = pndisc_redo, .is_multicast = ndisc_is_multicast, .allow_add = ndisc_allow_add, .id = "ndisc_cache", .parms = { .tbl = &nd_tbl, .reachable_time = ND_REACHABLE_TIME, .data = { [NEIGH_VAR_MCAST_PROBES] = 3, [NEIGH_VAR_UCAST_PROBES] = 3, [NEIGH_VAR_RETRANS_TIME] = ND_RETRANS_TIMER, [NEIGH_VAR_BASE_REACHABLE_TIME] = ND_REACHABLE_TIME, [NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ, [NEIGH_VAR_INTERVAL_PROBE_TIME_MS] = 5 * HZ, [NEIGH_VAR_GC_STALETIME] = 60 * HZ, [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_MAX, [NEIGH_VAR_PROXY_QLEN] = 64, [NEIGH_VAR_ANYCAST_DELAY] = 1 * HZ, [NEIGH_VAR_PROXY_DELAY] = (8 * HZ) / 10, }, }, .gc_interval = 30 * HZ, .gc_thresh1 = 128, .gc_thresh2 = 512, .gc_thresh3 = 1024, }; EXPORT_SYMBOL_GPL(nd_tbl); void __ndisc_fill_addr_option(struct sk_buff *skb, int type, const void *data, int data_len, int pad) { int space = __ndisc_opt_addr_space(data_len, pad); u8 *opt = skb_put(skb, space); opt[0] = type; opt[1] = space>>3; memset(opt + 2, 
0, pad); opt += pad; space -= pad; memcpy(opt+2, data, data_len); data_len += 2; opt += data_len; space -= data_len; if (space > 0) memset(opt, 0, space); } EXPORT_SYMBOL_GPL(__ndisc_fill_addr_option); static inline void ndisc_fill_addr_option(struct sk_buff *skb, int type, const void *data, u8 icmp6_type) { __ndisc_fill_addr_option(skb, type, data, skb->dev->addr_len, ndisc_addr_option_pad(skb->dev->type)); ndisc_ops_fill_addr_option(skb->dev, skb, icmp6_type); } static inline void ndisc_fill_redirect_addr_option(struct sk_buff *skb, void *ha, const u8 *ops_data) { ndisc_fill_addr_option(skb, ND_OPT_TARGET_LL_ADDR, ha, NDISC_REDIRECT); ndisc_ops_fill_redirect_addr_option(skb->dev, skb, ops_data); } static struct nd_opt_hdr *ndisc_next_option(struct nd_opt_hdr *cur, struct nd_opt_hdr *end) { int type; if (!cur || !end || cur >= end) return NULL; type = cur->nd_opt_type; do { cur = ((void *)cur) + (cur->nd_opt_len << 3); } while (cur < end && cur->nd_opt_type != type); return cur <= end && cur->nd_opt_type == type ? cur : NULL; } static inline int ndisc_is_useropt(const struct net_device *dev, struct nd_opt_hdr *opt) { return opt->nd_opt_type == ND_OPT_PREFIX_INFO || opt->nd_opt_type == ND_OPT_RDNSS || opt->nd_opt_type == ND_OPT_DNSSL || opt->nd_opt_type == ND_OPT_6CO || opt->nd_opt_type == ND_OPT_CAPTIVE_PORTAL || opt->nd_opt_type == ND_OPT_PREF64; } static struct nd_opt_hdr *ndisc_next_useropt(const struct net_device *dev, struct nd_opt_hdr *cur, struct nd_opt_hdr *end) { if (!cur || !end || cur >= end) return NULL; do { cur = ((void *)cur) + (cur->nd_opt_len << 3); } while (cur < end && !ndisc_is_useropt(dev, cur)); return cur <= end && ndisc_is_useropt(dev, cur) ? 
cur : NULL; } struct ndisc_options *ndisc_parse_options(const struct net_device *dev, u8 *opt, int opt_len, struct ndisc_options *ndopts) { struct nd_opt_hdr *nd_opt = (struct nd_opt_hdr *)opt; if (!nd_opt || opt_len < 0 || !ndopts) return NULL; memset(ndopts, 0, sizeof(*ndopts)); while (opt_len) { bool unknown = false; int l; if (opt_len < sizeof(struct nd_opt_hdr)) return NULL; l = nd_opt->nd_opt_len << 3; if (opt_len < l || l == 0) return NULL; if (ndisc_ops_parse_options(dev, nd_opt, ndopts)) goto next_opt; switch (nd_opt->nd_opt_type) { case ND_OPT_SOURCE_LL_ADDR: case ND_OPT_TARGET_LL_ADDR: case ND_OPT_MTU: case ND_OPT_NONCE: case ND_OPT_REDIRECT_HDR: if (ndopts->nd_opt_array[nd_opt->nd_opt_type]) { ND_PRINTK(2, warn, "%s: duplicated ND6 option found: type=%d\n", __func__, nd_opt->nd_opt_type); } else { ndopts->nd_opt_array[nd_opt->nd_opt_type] = nd_opt; } break; case ND_OPT_PREFIX_INFO: ndopts->nd_opts_pi_end = nd_opt; if (!ndopts->nd_opt_array[nd_opt->nd_opt_type]) ndopts->nd_opt_array[nd_opt->nd_opt_type] = nd_opt; break; #ifdef CONFIG_IPV6_ROUTE_INFO case ND_OPT_ROUTE_INFO: ndopts->nd_opts_ri_end = nd_opt; if (!ndopts->nd_opts_ri) ndopts->nd_opts_ri = nd_opt; break; #endif default: unknown = true; } if (ndisc_is_useropt(dev, nd_opt)) { ndopts->nd_useropts_end = nd_opt; if (!ndopts->nd_useropts) ndopts->nd_useropts = nd_opt; } else if (unknown) { /* * Unknown options must be silently ignored, * to accommodate future extension to the * protocol. */ ND_PRINTK(2, notice, "%s: ignored unsupported option; type=%d, len=%d\n", __func__, nd_opt->nd_opt_type, nd_opt->nd_opt_len); } next_opt: opt_len -= l; nd_opt = ((void *)nd_opt) + l; } return ndopts; } int ndisc_mc_map(const struct in6_addr *addr, char *buf, struct net_device *dev, int dir) { switch (dev->type) { case ARPHRD_ETHER: case ARPHRD_IEEE802: /* Not sure. Check it later. 
--ANK */ case ARPHRD_FDDI: ipv6_eth_mc_map(addr, buf); return 0; case ARPHRD_ARCNET: ipv6_arcnet_mc_map(addr, buf); return 0; case ARPHRD_INFINIBAND: ipv6_ib_mc_map(addr, dev->broadcast, buf); return 0; case ARPHRD_IPGRE: return ipv6_ipgre_mc_map(addr, dev->broadcast, buf); default: if (dir) { memcpy(buf, dev->broadcast, dev->addr_len); return 0; } } return -EINVAL; } EXPORT_SYMBOL(ndisc_mc_map); static u32 ndisc_hash(const void *pkey, const struct net_device *dev, __u32 *hash_rnd) { return ndisc_hashfn(pkey, dev, hash_rnd); } static bool ndisc_key_eq(const struct neighbour *n, const void *pkey) { return neigh_key_eq128(n, pkey); } static int ndisc_constructor(struct neighbour *neigh) { struct in6_addr *addr = (struct in6_addr *)&neigh->primary_key; struct net_device *dev = neigh->dev; struct inet6_dev *in6_dev; struct neigh_parms *parms; bool is_multicast = ipv6_addr_is_multicast(addr); in6_dev = in6_dev_get(dev); if (!in6_dev) { return -EINVAL; } parms = in6_dev->nd_parms; __neigh_parms_put(neigh->parms); neigh->parms = neigh_parms_clone(parms); neigh->type = is_multicast ? 
RTN_MULTICAST : RTN_UNICAST; if (!dev->header_ops) { neigh->nud_state = NUD_NOARP; neigh->ops = &ndisc_direct_ops; neigh->output = neigh_direct_output; } else { if (is_multicast) { neigh->nud_state = NUD_NOARP; ndisc_mc_map(addr, neigh->ha, dev, 1); } else if (dev->flags&(IFF_NOARP|IFF_LOOPBACK)) { neigh->nud_state = NUD_NOARP; memcpy(neigh->ha, dev->dev_addr, dev->addr_len); if (dev->flags&IFF_LOOPBACK) neigh->type = RTN_LOCAL; } else if (dev->flags&IFF_POINTOPOINT) { neigh->nud_state = NUD_NOARP; memcpy(neigh->ha, dev->broadcast, dev->addr_len); } if (dev->header_ops->cache) neigh->ops = &ndisc_hh_ops; else neigh->ops = &ndisc_generic_ops; if (neigh->nud_state&NUD_VALID) neigh->output = neigh->ops->connected_output; else neigh->output = neigh->ops->output; } in6_dev_put(in6_dev); return 0; } static int pndisc_constructor(struct pneigh_entry *n) { struct in6_addr *addr = (struct in6_addr *)&n->key; struct in6_addr maddr; struct net_device *dev = n->dev; if (!dev || !__in6_dev_get(dev)) return -EINVAL; addrconf_addr_solict_mult(addr, &maddr); ipv6_dev_mc_inc(dev, &maddr); return 0; } static void pndisc_destructor(struct pneigh_entry *n) { struct in6_addr *addr = (struct in6_addr *)&n->key; struct in6_addr maddr; struct net_device *dev = n->dev; if (!dev || !__in6_dev_get(dev)) return; addrconf_addr_solict_mult(addr, &maddr); ipv6_dev_mc_dec(dev, &maddr); } /* called with rtnl held */ static bool ndisc_allow_add(const struct net_device *dev, struct netlink_ext_ack *extack) { struct inet6_dev *idev = __in6_dev_get(dev); if (!idev || idev->cnf.disable_ipv6) { NL_SET_ERR_MSG(extack, "IPv6 is disabled on this device"); return false; } return true; } static struct sk_buff *ndisc_alloc_skb(struct net_device *dev, int len) { int hlen = LL_RESERVED_SPACE(dev); int tlen = dev->needed_tailroom; struct sk_buff *skb; skb = alloc_skb(hlen + sizeof(struct ipv6hdr) + len + tlen, GFP_ATOMIC); if (!skb) return NULL; skb->protocol = htons(ETH_P_IPV6); skb->dev = dev; skb_reserve(skb, 
hlen + sizeof(struct ipv6hdr)); skb_reset_transport_header(skb); /* Manually assign socket ownership as we avoid calling * sock_alloc_send_pskb() to bypass wmem buffer limits */ rcu_read_lock(); skb_set_owner_w(skb, dev_net_rcu(dev)->ipv6.ndisc_sk); rcu_read_unlock(); return skb; } static void ip6_nd_hdr(struct sk_buff *skb, const struct in6_addr *saddr, const struct in6_addr *daddr, int hop_limit, int len) { struct ipv6hdr *hdr; struct inet6_dev *idev; unsigned tclass; rcu_read_lock(); idev = __in6_dev_get(skb->dev); tclass = idev ? READ_ONCE(idev->cnf.ndisc_tclass) : 0; rcu_read_unlock(); skb_push(skb, sizeof(*hdr)); skb_reset_network_header(skb); hdr = ipv6_hdr(skb); ip6_flow_hdr(hdr, tclass, 0); hdr->payload_len = htons(len); hdr->nexthdr = IPPROTO_ICMPV6; hdr->hop_limit = hop_limit; hdr->saddr = *saddr; hdr->daddr = *daddr; } void ndisc_send_skb(struct sk_buff *skb, const struct in6_addr *daddr, const struct in6_addr *saddr) { struct icmp6hdr *icmp6h = icmp6_hdr(skb); struct dst_entry *dst = skb_dst(skb); struct inet6_dev *idev; struct net *net; struct sock *sk; int err; u8 type; type = icmp6h->icmp6_type; rcu_read_lock(); net = dev_net_rcu(skb->dev); sk = net->ipv6.ndisc_sk; if (!dst) { struct flowi6 fl6; int oif = skb->dev->ifindex; icmpv6_flow_init(sk, &fl6, type, saddr, daddr, oif); dst = icmp6_dst_alloc(skb->dev, &fl6); if (IS_ERR(dst)) { rcu_read_unlock(); kfree_skb(skb); return; } skb_dst_set(skb, dst); } icmp6h->icmp6_cksum = csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_ICMPV6, csum_partial(icmp6h, skb->len, 0)); ip6_nd_hdr(skb, saddr, daddr, READ_ONCE(inet6_sk(sk)->hop_limit), skb->len); idev = __in6_dev_get(dst->dev); IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS); err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb, NULL, dst->dev, dst_output); if (!err) { ICMP6MSGOUT_INC_STATS(net, idev, type); ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS); } rcu_read_unlock(); } EXPORT_SYMBOL(ndisc_send_skb); void ndisc_send_na(struct net_device 
*dev, const struct in6_addr *daddr, const struct in6_addr *solicited_addr, bool router, bool solicited, bool override, bool inc_opt) { struct sk_buff *skb; struct in6_addr tmpaddr; struct inet6_ifaddr *ifp; const struct in6_addr *src_addr; struct nd_msg *msg; int optlen = 0; /* for anycast or proxy, solicited_addr != src_addr */ ifp = ipv6_get_ifaddr(dev_net(dev), solicited_addr, dev, 1); if (ifp) { src_addr = solicited_addr; if (ifp->flags & IFA_F_OPTIMISTIC) override = false; inc_opt |= READ_ONCE(ifp->idev->cnf.force_tllao); in6_ifa_put(ifp); } else { if (ipv6_dev_get_saddr(dev_net(dev), dev, daddr, inet6_sk(dev_net(dev)->ipv6.ndisc_sk)->srcprefs, &tmpaddr)) return; src_addr = &tmpaddr; } if (!dev->addr_len) inc_opt = false; if (inc_opt) optlen += ndisc_opt_addr_space(dev, NDISC_NEIGHBOUR_ADVERTISEMENT); skb = ndisc_alloc_skb(dev, sizeof(*msg) + optlen); if (!skb) return; msg = skb_put(skb, sizeof(*msg)); *msg = (struct nd_msg) { .icmph = { .icmp6_type = NDISC_NEIGHBOUR_ADVERTISEMENT, .icmp6_router = router, .icmp6_solicited = solicited, .icmp6_override = override, }, .target = *solicited_addr, }; if (inc_opt) ndisc_fill_addr_option(skb, ND_OPT_TARGET_LL_ADDR, dev->dev_addr, NDISC_NEIGHBOUR_ADVERTISEMENT); ndisc_send_skb(skb, daddr, src_addr); } static void ndisc_send_unsol_na(struct net_device *dev) { struct inet6_dev *idev; struct inet6_ifaddr *ifa; idev = in6_dev_get(dev); if (!idev) return; read_lock_bh(&idev->lock); list_for_each_entry(ifa, &idev->addr_list, if_list) { /* skip tentative addresses until dad completes */ if (ifa->flags & IFA_F_TENTATIVE && !(ifa->flags & IFA_F_OPTIMISTIC)) continue; ndisc_send_na(dev, &in6addr_linklocal_allnodes, &ifa->addr, /*router=*/ !!idev->cnf.forwarding, /*solicited=*/ false, /*override=*/ true, /*inc_opt=*/ true); } read_unlock_bh(&idev->lock); in6_dev_put(idev); } struct sk_buff *ndisc_ns_create(struct net_device *dev, const struct in6_addr *solicit, const struct in6_addr *saddr, u64 nonce) { int inc_opt = 
dev->addr_len; struct sk_buff *skb; struct nd_msg *msg; int optlen = 0; if (!saddr) return NULL; if (ipv6_addr_any(saddr)) inc_opt = false; if (inc_opt) optlen += ndisc_opt_addr_space(dev, NDISC_NEIGHBOUR_SOLICITATION); if (nonce != 0) optlen += 8; skb = ndisc_alloc_skb(dev, sizeof(*msg) + optlen); if (!skb) return NULL; msg = skb_put(skb, sizeof(*msg)); *msg = (struct nd_msg) { .icmph = { .icmp6_type = NDISC_NEIGHBOUR_SOLICITATION, }, .target = *solicit, }; if (inc_opt) ndisc_fill_addr_option(skb, ND_OPT_SOURCE_LL_ADDR, dev->dev_addr, NDISC_NEIGHBOUR_SOLICITATION); if (nonce != 0) { u8 *opt = skb_put(skb, 8); opt[0] = ND_OPT_NONCE; opt[1] = 8 >> 3; memcpy(opt + 2, &nonce, 6); } return skb; } EXPORT_SYMBOL(ndisc_ns_create); void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit, const struct in6_addr *daddr, const struct in6_addr *saddr, u64 nonce) { struct in6_addr addr_buf; struct sk_buff *skb; if (!saddr) { if (ipv6_get_lladdr(dev, &addr_buf, (IFA_F_TENTATIVE | IFA_F_OPTIMISTIC))) return; saddr = &addr_buf; } skb = ndisc_ns_create(dev, solicit, saddr, nonce); if (skb) ndisc_send_skb(skb, daddr, saddr); } void ndisc_send_rs(struct net_device *dev, const struct in6_addr *saddr, const struct in6_addr *daddr) { struct sk_buff *skb; struct rs_msg *msg; int send_sllao = dev->addr_len; int optlen = 0; #ifdef CONFIG_IPV6_OPTIMISTIC_DAD /* * According to section 2.2 of RFC 4429, we must not * send router solicitations with a sllao from * optimistic addresses, but we may send the solicitation * if we don't include the sllao. So here we check * if our address is optimistic, and if so, we * suppress the inclusion of the sllao. 
*/ if (send_sllao) { struct inet6_ifaddr *ifp = ipv6_get_ifaddr(dev_net(dev), saddr, dev, 1); if (ifp) { if (ifp->flags & IFA_F_OPTIMISTIC) { send_sllao = 0; } in6_ifa_put(ifp); } else { send_sllao = 0; } } #endif if (send_sllao) optlen += ndisc_opt_addr_space(dev, NDISC_ROUTER_SOLICITATION); skb = ndisc_alloc_skb(dev, sizeof(*msg) + optlen); if (!skb) return; msg = skb_put(skb, sizeof(*msg)); *msg = (struct rs_msg) { .icmph = { .icmp6_type = NDISC_ROUTER_SOLICITATION, }, }; if (send_sllao) ndisc_fill_addr_option(skb, ND_OPT_SOURCE_LL_ADDR, dev->dev_addr, NDISC_ROUTER_SOLICITATION); ndisc_send_skb(skb, daddr, saddr); } static void ndisc_error_report(struct neighbour *neigh, struct sk_buff *skb) { /* * "The sender MUST return an ICMP * destination unreachable" */ dst_link_failure(skb); kfree_skb(skb); } /* Called with locked neigh: either read or both */ static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb) { struct in6_addr *saddr = NULL; struct in6_addr mcaddr; struct net_device *dev = neigh->dev; struct in6_addr *target = (struct in6_addr *)&neigh->primary_key; int probes = atomic_read(&neigh->probes); if (skb && ipv6_chk_addr_and_flags(dev_net(dev), &ipv6_hdr(skb)->saddr, dev, false, 1, IFA_F_TENTATIVE|IFA_F_OPTIMISTIC)) saddr = &ipv6_hdr(skb)->saddr; probes -= NEIGH_VAR(neigh->parms, UCAST_PROBES); if (probes < 0) { if (!(READ_ONCE(neigh->nud_state) & NUD_VALID)) { ND_PRINTK(1, dbg, "%s: trying to ucast probe in NUD_INVALID: %pI6\n", __func__, target); } ndisc_send_ns(dev, target, target, saddr, 0); } else if ((probes -= NEIGH_VAR(neigh->parms, APP_PROBES)) < 0) { neigh_app_ns(neigh); } else { addrconf_addr_solict_mult(target, &mcaddr); ndisc_send_ns(dev, target, &mcaddr, saddr, 0); } } static int pndisc_is_router(const void *pkey, struct net_device *dev) { struct pneigh_entry *n; int ret = -1; read_lock_bh(&nd_tbl.lock); n = __pneigh_lookup(&nd_tbl, dev_net(dev), pkey, dev); if (n) ret = !!(n->flags & NTF_ROUTER); 
read_unlock_bh(&nd_tbl.lock); return ret; } void ndisc_update(const struct net_device *dev, struct neighbour *neigh, const u8 *lladdr, u8 new, u32 flags, u8 icmp6_type, struct ndisc_options *ndopts) { neigh_update(neigh, lladdr, new, flags, 0); /* report ndisc ops about neighbour update */ ndisc_ops_update(dev, neigh, flags, icmp6_type, ndopts); } static enum skb_drop_reason ndisc_recv_ns(struct sk_buff *skb) { struct nd_msg *msg = (struct nd_msg *)skb_transport_header(skb); const struct in6_addr *saddr = &ipv6_hdr(skb)->saddr; const struct in6_addr *daddr = &ipv6_hdr(skb)->daddr; u8 *lladdr = NULL; u32 ndoptlen = skb_tail_pointer(skb) - (skb_transport_header(skb) + offsetof(struct nd_msg, opt)); struct ndisc_options ndopts; struct net_device *dev = skb->dev; struct inet6_ifaddr *ifp; struct inet6_dev *idev = NULL; struct neighbour *neigh; int dad = ipv6_addr_any(saddr); int is_router = -1; SKB_DR(reason); u64 nonce = 0; bool inc; if (skb->len < sizeof(struct nd_msg)) return SKB_DROP_REASON_PKT_TOO_SMALL; if (ipv6_addr_is_multicast(&msg->target)) { ND_PRINTK(2, warn, "NS: multicast target address\n"); return reason; } /* * RFC2461 7.1.1: * DAD has to be destined for solicited node multicast address. */ if (dad && !ipv6_addr_is_solict_mult(daddr)) { ND_PRINTK(2, warn, "NS: bad DAD packet (wrong destination)\n"); return reason; } if (!ndisc_parse_options(dev, msg->opt, ndoptlen, &ndopts)) return SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS; if (ndopts.nd_opts_src_lladdr) { lladdr = ndisc_opt_addr_data(ndopts.nd_opts_src_lladdr, dev); if (!lladdr) { ND_PRINTK(2, warn, "NS: invalid link-layer address length\n"); return reason; } /* RFC2461 7.1.1: * If the IP source address is the unspecified address, * there MUST NOT be source link-layer address option * in the message. 
*/
		if (dad) {
			ND_PRINTK(2, warn,
				  "NS: bad DAD packet (link-layer address option)\n");
			return reason;
		}
	}
	/* a nonce option of length 1 carries 6 bytes after its header */
	if (ndopts.nd_opts_nonce && ndopts.nd_opts_nonce->nd_opt_len == 1)
		memcpy(&nonce, (u8 *)(ndopts.nd_opts_nonce + 1), 6);

	inc = ipv6_addr_is_multicast(daddr);

	ifp = ipv6_get_ifaddr(dev_net(dev), &msg->target, dev, 1);
	if (ifp) {
have_ifp:
		if (ifp->flags & (IFA_F_TENTATIVE|IFA_F_OPTIMISTIC)) {
			if (dad) {
				if (nonce != 0 && ifp->dad_nonce == nonce) {
					u8 *np = (u8 *)&nonce;
					/* Matching nonce if looped back */
					ND_PRINTK(2, notice,
						  "%s: IPv6 DAD loopback for address %pI6c nonce %pM ignored\n",
						  ifp->idev->dev->name,
						  &ifp->addr, np);
					goto out;
				}
				/*
				 * We are colliding with another node
				 * who is doing DAD
				 * so fail our DAD process
				 */
				/* NOTE(review): returns without in6_ifa_put(); presumably addrconf_dad_failure() drops the reference - confirm */
				addrconf_dad_failure(skb, ifp);
				return reason;
			} else {
				/*
				 * This is not a dad solicitation.
				 * If we are an optimistic node,
				 * we should respond.
				 * Otherwise, we should ignore it.
				 */
				if (!(ifp->flags & IFA_F_OPTIMISTIC))
					goto out;
			}
		}

		idev = ifp->idev;
	} else {
		struct net *net = dev_net(dev);

		/* perhaps an address on the master device */
		if (netif_is_l3_slave(dev)) {
			struct net_device *mdev;

			mdev = netdev_master_upper_dev_get_rcu(dev);
			if (mdev) {
				ifp = ipv6_get_ifaddr(net, &msg->target, mdev, 1);
				if (ifp)
					goto have_ifp;
			}
		}

		idev = in6_dev_get(dev);
		if (!idev) {
			/* XXX: count this drop? */
			return reason;
		}

		/*
		 * Continue only for a target accepted by ipv6_chk_acast_addr()
		 * or, with forwarding and proxy_ndp enabled, one for which
		 * pndisc_is_router() finds an entry (its result is kept in
		 * is_router).
		 */
		if (ipv6_chk_acast_addr(net, dev, &msg->target) ||
		    (READ_ONCE(idev->cnf.forwarding) &&
		     (READ_ONCE(net->ipv6.devconf_all->proxy_ndp) ||
		      READ_ONCE(idev->cnf.proxy_ndp)) &&
		     (is_router = pndisc_is_router(&msg->target, dev)) >= 0)) {
			if (!(NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED) &&
			    skb->pkt_type != PACKET_HOST &&
			    inc &&
			    NEIGH_VAR(idev->nd_parms, PROXY_DELAY) != 0) {
				/*
				 * for anycast or proxy,
				 * sender should delay its response
				 * by a random time between 0 and
				 * MAX_ANYCAST_DELAY_TIME seconds.
* (RFC2461) -- yoshfuji
				 */
				/* queue a clone for delayed handling; this skb is done */
				struct sk_buff *n = skb_clone(skb, GFP_ATOMIC);
				if (n)
					pneigh_enqueue(&nd_tbl, idev->nd_parms, n);
				goto out;
			}
		} else {
			SKB_DR_SET(reason, IPV6_NDISC_NS_OTHERHOST);
			goto out;
		}
	}

	/* no value from pndisc_is_router(): use the device forwarding setting */
	if (is_router < 0)
		is_router = READ_ONCE(idev->cnf.forwarding);

	if (dad) {
		ndisc_send_na(dev, &in6addr_linklocal_allnodes, &msg->target,
			      !!is_router, false, (ifp != NULL), true);
		goto out;
	}

	if (inc)
		NEIGH_CACHE_STAT_INC(&nd_tbl, rcv_probes_mcast);
	else
		NEIGH_CACHE_STAT_INC(&nd_tbl, rcv_probes_ucast);

	/*
	 *	update / create cache entry
	 *	for the source address
	 */
	neigh = __neigh_lookup(&nd_tbl, saddr, dev,
			       !inc || lladdr || !dev->addr_len);
	if (neigh)
		ndisc_update(dev, neigh, lladdr, NUD_STALE,
			     NEIGH_UPDATE_F_WEAK_OVERRIDE|
			     NEIGH_UPDATE_F_OVERRIDE,
			     NDISC_NEIGHBOUR_SOLICITATION, &ndopts);
	if (neigh || !dev->header_ops) {
		ndisc_send_na(dev, saddr, &msg->target, !!is_router,
			      true, (ifp != NULL && inc), inc);
		if (neigh)
			neigh_release(neigh);
		reason = SKB_CONSUMED;
	}

out:
	/* exactly one of the two references is held on this path */
	if (ifp)
		in6_ifa_put(ifp);
	else
		in6_dev_put(idev);
	return reason;
}

/*
 * Decide from the accept_untracked_na setting of @dev whether an NA whose
 * target has no neighbour entry may create one.  Returns non-zero to accept.
 *
 * idev is dereferenced without a NULL check; the caller visible in this
 * file only calls this after testing its own idev pointer.
 */
static int accept_untracked_na(struct net_device *dev, struct in6_addr *saddr)
{
	struct inet6_dev *idev = __in6_dev_get(dev);

	switch (READ_ONCE(idev->cnf.accept_untracked_na)) {
	case 0: /* Don't accept untracked na (absent in neighbor cache) */
		return 0;
	case 1: /* Create new entries from na if currently untracked */
		return 1;
	case 2: /* Create new entries from untracked na only if saddr is in the
		 * same subnet as an address configured on the interface that
		 * received the na
		 */
		return !!ipv6_chk_prefix(saddr, dev);
	default:
		return 0;
	}
}

/*
 * Handle a received Neighbour Advertisement.
 *
 * Returns SKB_CONSUMED once the neighbour entry for the target has been
 * passed to ndisc_update(); every other exit returns a drop reason.
 */
static enum skb_drop_reason ndisc_recv_na(struct sk_buff *skb)
{
	struct nd_msg *msg = (struct nd_msg *)skb_transport_header(skb);
	struct in6_addr *saddr = &ipv6_hdr(skb)->saddr;
	const struct in6_addr *daddr = &ipv6_hdr(skb)->daddr;
	u8 *lladdr = NULL;
	u32 ndoptlen = skb_tail_pointer(skb) - (skb_transport_header(skb) +
				    offsetof(struct nd_msg, opt));
	struct ndisc_options ndopts;
	struct net_device *dev =
skb->dev;
	struct inet6_dev *idev = __in6_dev_get(dev);
	struct inet6_ifaddr *ifp;
	struct neighbour *neigh;
	SKB_DR(reason);
	u8 new_state;

	if (skb->len < sizeof(struct nd_msg))
		return SKB_DROP_REASON_PKT_TOO_SMALL;

	if (ipv6_addr_is_multicast(&msg->target)) {
		ND_PRINTK(2, warn, "NA: target address is multicast\n");
		return reason;
	}

	if (ipv6_addr_is_multicast(daddr) &&
	    msg->icmph.icmp6_solicited) {
		ND_PRINTK(2, warn, "NA: solicited NA is multicasted\n");
		return reason;
	}

	/* For some 802.11 wireless deployments (and possibly other networks),
	 * there will be a NA proxy and unsolicited packets are attacks
	 * and thus should not be accepted.
	 * drop_unsolicited_na takes precedence over accept_untracked_na
	 */
	if (!msg->icmph.icmp6_solicited && idev &&
	    READ_ONCE(idev->cnf.drop_unsolicited_na))
		return reason;

	if (!ndisc_parse_options(dev, msg->opt, ndoptlen, &ndopts))
		return SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS;

	if (ndopts.nd_opts_tgt_lladdr) {
		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, dev);
		if (!lladdr) {
			ND_PRINTK(2, warn,
				  "NA: invalid link-layer address length\n");
			return reason;
		}
	}
	/* an NA whose target is found by ipv6_get_ifaddr() on this device is never accepted */
	ifp = ipv6_get_ifaddr(dev_net(dev), &msg->target, dev, 1);
	if (ifp) {
		if (skb->pkt_type != PACKET_LOOPBACK
		    && (ifp->flags & IFA_F_TENTATIVE)) {
				/* NOTE(review): no in6_ifa_put() on this path; presumably addrconf_dad_failure() drops the reference - confirm */
				addrconf_dad_failure(skb, ifp);
				return reason;
		}
		/* What should we make now? The advertisement
		   is invalid, but ndisc specs say nothing
		   about it. It could be misconfiguration, or
		   a smart proxy agent tries to help us :-)

		   We should not print the error if NA has been
		   received from loopback - it is just our own
		   unsolicited advertisement.
		 */
		if (skb->pkt_type != PACKET_LOOPBACK)
			ND_PRINTK(1, warn,
				  "NA: %pM advertised our address %pI6c on %s!\n",
				  eth_hdr(skb)->h_source, &ifp->addr,
				  ifp->idev->dev->name);
		in6_ifa_put(ifp);
		return reason;
	}

	neigh = neigh_lookup(&nd_tbl, &msg->target, dev);

	/* RFC 9131 updates original Neighbour Discovery RFC 4861.
* NAs with Target LL Address option without a corresponding
	 * entry in the neighbour cache can now create a STALE neighbour
	 * cache entry on routers.
	 *
	 *   entry accept  fwding  solicited        behaviour
	 * ------- ------  ------  ---------    ----------------------
	 * present      X       X         0     Set state to STALE
	 * present      X       X         1     Set state to REACHABLE
	 *  absent      0       X         X     Do nothing
	 *  absent      1       0         X     Do nothing
	 *  absent      1       1         X     Add a new STALE entry
	 *
	 * Note that we don't do a (daddr == all-routers-mcast) check.
	 */
	new_state = msg->icmph.icmp6_solicited ? NUD_REACHABLE : NUD_STALE;
	if (!neigh && lladdr && idev && READ_ONCE(idev->cnf.forwarding)) {
		if (accept_untracked_na(dev, saddr)) {
			/* the result may be an error pointer; it is tested with IS_ERR() below */
			neigh = neigh_create(&nd_tbl, &msg->target, dev);
			new_state = NUD_STALE;
		}
	}

	if (neigh && !IS_ERR(neigh)) {
		/* flags before the update, compared afterwards to spot a cleared NTF_ROUTER */
		u8 old_flags = neigh->flags;
		struct net *net = dev_net(dev);

		if (READ_ONCE(neigh->nud_state) & NUD_FAILED)
			goto out;

		/*
		 * Don't update the neighbor cache entry on a proxy NA from
		 * ourselves because either the proxied node is off link or it
		 * has already sent a NA to us.
		 */
		if (lladdr && !memcmp(lladdr, dev->dev_addr, dev->addr_len) &&
		    READ_ONCE(net->ipv6.devconf_all->forwarding) &&
		    READ_ONCE(net->ipv6.devconf_all->proxy_ndp) &&
		    pneigh_lookup(&nd_tbl, net, &msg->target, dev, 0)) {
			/* XXX: idev->cnf.proxy_ndp */
			goto out;
		}

		ndisc_update(dev, neigh, lladdr,
			     new_state,
			     NEIGH_UPDATE_F_WEAK_OVERRIDE|
			     (msg->icmph.icmp6_override ? NEIGH_UPDATE_F_OVERRIDE : 0)|
			     NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
			     (msg->icmph.icmp6_router ?
NEIGH_UPDATE_F_ISROUTER : 0),
			     NDISC_NEIGHBOUR_ADVERTISEMENT, &ndopts);

		if ((old_flags & ~neigh->flags) & NTF_ROUTER) {
			/*
			 * Change: router to host
			 */
			rt6_clean_tohost(dev_net(dev), saddr);
		}
		reason = SKB_CONSUMED;
out:
		neigh_release(neigh);
	}
	return reason;
}

/*
 * Handle a received Router Solicitation.
 *
 * Nothing is done unless forwarding is enabled on the receiving device and
 * the source address is specified.  Otherwise the neighbour entry for the
 * source is obtained with __neigh_lookup(..., 1) and updated to NUD_STALE.
 * Returns SKB_CONSUMED after that update, a drop reason on every other exit.
 */
static enum skb_drop_reason ndisc_recv_rs(struct sk_buff *skb)
{
	struct rs_msg *rs_msg = (struct rs_msg *)skb_transport_header(skb);
	/* only used after the skb->len check below */
	unsigned long ndoptlen = skb->len - sizeof(*rs_msg);
	struct neighbour *neigh;
	struct inet6_dev *idev;
	const struct in6_addr *saddr = &ipv6_hdr(skb)->saddr;
	struct ndisc_options ndopts;
	u8 *lladdr = NULL;
	SKB_DR(reason);

	if (skb->len < sizeof(*rs_msg))
		return SKB_DROP_REASON_PKT_TOO_SMALL;

	idev = __in6_dev_get(skb->dev);
	if (!idev) {
		ND_PRINTK(1, err, "RS: can't find in6 device\n");
		return reason;
	}

	/* Don't accept RS if we're not in router mode */
	if (!READ_ONCE(idev->cnf.forwarding))
		goto out;

	/*
	 * Don't update NCE if src = ::;
	 * this implies that the source node has no ip address assigned yet.
	 */
	if (ipv6_addr_any(saddr))
		goto out;

	/* Parse ND options */
	if (!ndisc_parse_options(skb->dev, rs_msg->opt, ndoptlen, &ndopts))
		return SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS;

	if (ndopts.nd_opts_src_lladdr) {
		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_src_lladdr,
					     skb->dev);
		if (!lladdr)
			goto out;
	}

	neigh = __neigh_lookup(&nd_tbl, saddr, skb->dev, 1);
	if (neigh) {
		ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
			     NEIGH_UPDATE_F_WEAK_OVERRIDE|
			     NEIGH_UPDATE_F_OVERRIDE|
			     NEIGH_UPDATE_F_OVERRIDE_ISROUTER,
			     NDISC_ROUTER_SOLICITATION, &ndopts);
		neigh_release(neigh);
		reason = SKB_CONSUMED;
	}
out:
	return reason;
}

/*
 * Relay one option @opt of the router advertisement @ra to user space as an
 * RTM_NEWNDUSEROPT netlink message on RTNLGRP_ND_USEROPT.  The message body
 * is a struct nduseroptmsg followed by a copy of the option, plus an
 * NDUSEROPT_SRCADDR attribute holding the IPv6 source address of @ra.
 * Failures are reported to the group with rtnl_set_sk_err().
 */
static void ndisc_ra_useropt(struct sk_buff *ra, struct nd_opt_hdr *opt)
{
	struct icmp6hdr *icmp6h = (struct icmp6hdr *)skb_transport_header(ra);
	struct sk_buff *skb;
	struct nlmsghdr *nlh;
	struct nduseroptmsg *ndmsg;
	struct net *net = dev_net(ra->dev);
	int err;
	/* nd_opt_len << 3: the option length field counts 8-byte units */
	int base_size = NLMSG_ALIGN(sizeof(struct nduseroptmsg)
				    + (opt->nd_opt_len << 3));
	size_t msg_size = base_size +
nla_total_size(sizeof(struct in6_addr));

	skb = nlmsg_new(msg_size, GFP_ATOMIC);
	if (!skb) {
		err = -ENOBUFS;
		goto errout;
	}

	nlh = nlmsg_put(skb, 0, 0, RTM_NEWNDUSEROPT, base_size, 0);
	if (!nlh) {
		goto nla_put_failure;
	}

	ndmsg = nlmsg_data(nlh);
	ndmsg->nduseropt_family = AF_INET6;
	ndmsg->nduseropt_ifindex = ra->dev->ifindex;
	ndmsg->nduseropt_icmp_type = icmp6h->icmp6_type;
	ndmsg->nduseropt_icmp_code = icmp6h->icmp6_code;
	ndmsg->nduseropt_opts_len = opt->nd_opt_len << 3;

	/* the raw option is stored directly behind the fixed header */
	memcpy(ndmsg + 1, opt, opt->nd_opt_len << 3);

	if (nla_put_in6_addr(skb, NDUSEROPT_SRCADDR, &ipv6_hdr(ra)->saddr))
		goto nla_put_failure;
	nlmsg_end(skb, nlh);

	rtnl_notify(skb, net, 0, RTNLGRP_ND_USEROPT, NULL, GFP_ATOMIC);
	return;

nla_put_failure:
	nlmsg_free(skb);
	err = -EMSGSIZE;
errout:
	rtnl_set_sk_err(net, RTNLGRP_ND_USEROPT, err);
}

/*
 * Handle a received Router Advertisement.
 *
 * Returns SKB_CONSUMED once the neighbour entry of the advertising router
 * has been passed to ndisc_update(); exits before that point return a drop
 * reason.
 */
static enum skb_drop_reason ndisc_router_discovery(struct sk_buff *skb)
{
	struct ra_msg *ra_msg = (struct ra_msg *)skb_transport_header(skb);
	bool send_ifinfo_notify = false;
	struct neighbour *neigh = NULL;
	struct ndisc_options ndopts;
	struct fib6_info *rt = NULL;
	struct inet6_dev *in6_dev;
	struct fib6_table *table;
	u32 defrtr_usr_metric;
	unsigned int pref = 0;
	__u32 old_if_flags;
	struct net *net;
	SKB_DR(reason);
	int lifetime;
	int optlen;

	__u8 *opt = (__u8 *)(ra_msg + 1);

	/* bytes following the fixed RA header; negative when the packet is short */
	optlen = (skb_tail_pointer(skb) - skb_transport_header(skb)) -
		sizeof(struct ra_msg);

	ND_PRINTK(2, info,
		  "RA: %s, dev: %s\n",
		  __func__, skb->dev->name);
	if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) {
		ND_PRINTK(2, warn, "RA: source address is not link-local\n");
		return reason;
	}
	if (optlen < 0)
		return SKB_DROP_REASON_PKT_TOO_SMALL;

#ifdef CONFIG_IPV6_NDISC_NODETYPE
	if (skb->ndisc_nodetype == NDISC_NODETYPE_HOST) {
		ND_PRINTK(2, warn, "RA: from host or unauthorized router\n");
		return reason;
	}
#endif

	in6_dev = __in6_dev_get(skb->dev);
	if (!in6_dev) {
		ND_PRINTK(0, err, "RA: can't find inet6 device for %s\n",
			  skb->dev->name);
		return reason;
	}

	if (!ndisc_parse_options(skb->dev, opt,
optlen, &ndopts))
		return SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS;

	/* RA not accepted: skip straight to the neighbour update at skip_linkparms */
	if (!ipv6_accept_ra(in6_dev)) {
		ND_PRINTK(2, info,
			  "RA: %s, did not accept ra for dev: %s\n",
			  __func__, skb->dev->name);
		goto skip_linkparms;
	}

#ifdef CONFIG_IPV6_NDISC_NODETYPE
	/* skip link-specific parameters from interior routers */
	if (skb->ndisc_nodetype == NDISC_NODETYPE_NODEFAULT) {
		ND_PRINTK(2, info,
			  "RA: %s, nodetype is NODEFAULT, dev: %s\n",
			  __func__, skb->dev->name);
		goto skip_linkparms;
	}
#endif

	if (in6_dev->if_flags & IF_RS_SENT) {
		/*
		 *	flag that an RA was received after an RS was sent
		 *	out on this interface.
		 */
		in6_dev->if_flags |= IF_RA_RCVD;
	}

	/*
	 * Remember the managed/otherconf flags from most recently
	 * received RA message (RFC 2462) -- yoshfuji
	 */
	old_if_flags = in6_dev->if_flags;
	in6_dev->if_flags = (in6_dev->if_flags & ~(IF_RA_MANAGED |
				IF_RA_OTHERCONF)) |
				(ra_msg->icmph.icmp6_addrconf_managed ?
					IF_RA_MANAGED : 0) |
				(ra_msg->icmph.icmp6_addrconf_other ?
					IF_RA_OTHERCONF : 0);

	if (old_if_flags != in6_dev->if_flags)
		send_ifinfo_notify = true;

	if (!READ_ONCE(in6_dev->cnf.accept_ra_defrtr)) {
		ND_PRINTK(2, info,
			  "RA: %s, defrtr is false for dev: %s\n",
			  __func__, skb->dev->name);
		goto skip_defrtr;
	}

	/* a zero lifetime is exempt from the accept_ra_min_lft check */
	lifetime = ntohs(ra_msg->icmph.icmp6_rt_lifetime);
	if (lifetime != 0 &&
	    lifetime < READ_ONCE(in6_dev->cnf.accept_ra_min_lft)) {
		ND_PRINTK(2, info,
			  "RA: router lifetime (%ds) is too short: %s\n",
			  lifetime, skb->dev->name);
		goto skip_defrtr;
	}

	/* Do not accept RA with source-addr found on local machine unless
	 * accept_ra_from_local is set to true.
*/
	net = dev_net(in6_dev->dev);
	if (!READ_ONCE(in6_dev->cnf.accept_ra_from_local) &&
	    ipv6_chk_addr(net, &ipv6_hdr(skb)->saddr, in6_dev->dev, 0)) {
		ND_PRINTK(2, info,
			  "RA from local address detected on dev: %s: default router ignored\n",
			  skb->dev->name);
		goto skip_defrtr;
	}

#ifdef CONFIG_IPV6_ROUTER_PREF
	pref = ra_msg->icmph.icmp6_router_pref;
	/* 10b is handled as if it were 00b (medium) */
	if (pref == ICMPV6_ROUTER_PREF_INVALID ||
	    !READ_ONCE(in6_dev->cnf.accept_ra_rtr_pref))
		pref = ICMPV6_ROUTER_PREF_MEDIUM;
#endif
	/* routes added from RAs do not use nexthop objects */
	rt = rt6_get_dflt_router(net, &ipv6_hdr(skb)->saddr, skb->dev);
	if (rt) {
		neigh = ip6_neigh_lookup(&rt->fib6_nh->fib_nh_gw6,
					 rt->fib6_nh->fib_nh_dev, NULL,
					  &ipv6_hdr(skb)->saddr);
		if (!neigh) {
			ND_PRINTK(0, err,
				  "RA: %s got default router without neighbour\n",
				  __func__);
			fib6_info_release(rt);
			return reason;
		}
	}
	/* Set default route metric as specified by user */
	defrtr_usr_metric = in6_dev->cnf.ra_defrtr_metric;
	/* delete the route if lifetime is 0 or if metric needs change */
	if (rt && (lifetime == 0 || rt->fib6_metric != defrtr_usr_metric)) {
		ip6_del_rt(net, rt, false);
		rt = NULL;
	}

	ND_PRINTK(3, info, "RA: rt: %p  lifetime: %d, metric: %d, for dev: %s\n",
		  rt, lifetime, defrtr_usr_metric, skb->dev->name);
	if (!rt && lifetime) {
		ND_PRINTK(3, info, "RA: adding default router\n");

		/* drop the neighbour of a route deleted above before looking it up again */
		if (neigh)
			neigh_release(neigh);

		rt = rt6_add_dflt_router(net, &ipv6_hdr(skb)->saddr,
					 skb->dev, pref, defrtr_usr_metric,
					 lifetime);
		if (!rt) {
			ND_PRINTK(0, err,
				  "RA: %s failed to add default route\n",
				  __func__);
			return reason;
		}

		neigh = ip6_neigh_lookup(&rt->fib6_nh->fib_nh_gw6,
					 rt->fib6_nh->fib_nh_dev, NULL,
					  &ipv6_hdr(skb)->saddr);
		if (!neigh) {
			ND_PRINTK(0, err,
				  "RA: %s got default router without neighbour\n",
				  __func__);
			fib6_info_release(rt);
			return reason;
		}
		neigh->flags |= NTF_ROUTER;
	} else if (rt && IPV6_EXTRACT_PREF(rt->fib6_flags) != pref) {
		struct nl_info nlinfo = {
			.nl_net = net,
		};
		rt->fib6_flags = (rt->fib6_flags
& ~RTF_PREF_MASK) | RTF_PREF(pref);
		inet6_rt_notify(RTM_NEWROUTE, rt, &nlinfo, NLM_F_REPLACE);
	}

	if (rt) {
		/* refresh the expiry of the default route under its table lock */
		table = rt->fib6_table;
		spin_lock_bh(&table->tb6_lock);

		fib6_set_expires(rt, jiffies + (HZ * lifetime));
		fib6_add_gc_list(rt);

		spin_unlock_bh(&table->tb6_lock);
	}
	/* a zero hop limit in the RA leaves the current setting untouched */
	if (READ_ONCE(in6_dev->cnf.accept_ra_min_hop_limit) < 256 &&
	    ra_msg->icmph.icmp6_hop_limit) {
		if (READ_ONCE(in6_dev->cnf.accept_ra_min_hop_limit) <=
		    ra_msg->icmph.icmp6_hop_limit) {
			WRITE_ONCE(in6_dev->cnf.hop_limit,
				   ra_msg->icmph.icmp6_hop_limit);
			/* NOTE(review): rt may be NULL here - confirm fib6_metric_set() tolerates that */
			fib6_metric_set(rt, RTAX_HOPLIMIT,
					ra_msg->icmph.icmp6_hop_limit);
		} else {
			ND_PRINTK(2, warn, "RA: Got route advertisement with lower hop_limit than minimum\n");
		}
	}

skip_defrtr:

	/*
	 *	Update Reachable Time and Retrans Timer
	 */

	if (in6_dev->nd_parms) {
		unsigned long rtime = ntohl(ra_msg->retrans_timer);

		/* the value is divided by 1000 and scaled by HZ, with a floor of HZ/100 */
		if (rtime && rtime/1000 < MAX_SCHEDULE_TIMEOUT/HZ) {
			rtime = (rtime*HZ)/1000;
			if (rtime < HZ/100)
				rtime = HZ/100;
			NEIGH_VAR_SET(in6_dev->nd_parms, RETRANS_TIME, rtime);
			in6_dev->tstamp = jiffies;
			send_ifinfo_notify = true;
		}

		rtime = ntohl(ra_msg->reachable_time);
		if (rtime && rtime/1000 < MAX_SCHEDULE_TIMEOUT/(3*HZ)) {
			rtime = (rtime*HZ)/1000;

			if (rtime < HZ/10)
				rtime = HZ/10;

			if (rtime != NEIGH_VAR(in6_dev->nd_parms, BASE_REACHABLE_TIME)) {
				NEIGH_VAR_SET(in6_dev->nd_parms,
					      BASE_REACHABLE_TIME, rtime);
				NEIGH_VAR_SET(in6_dev->nd_parms,
					      GC_STALETIME, 3 * rtime);
				in6_dev->nd_parms->reachable_time = neigh_rand_reach_time(rtime);
				in6_dev->tstamp = jiffies;
				send_ifinfo_notify = true;
			}
		}
	}

skip_linkparms:

	/*
	 *	Process options.
*/

	/* no neighbour taken from a default route above: look the source up directly */
	if (!neigh)
		neigh = __neigh_lookup(&nd_tbl, &ipv6_hdr(skb)->saddr,
				       skb->dev, 1);
	if (neigh) {
		u8 *lladdr = NULL;
		if (ndopts.nd_opts_src_lladdr) {
			lladdr = ndisc_opt_addr_data(ndopts.nd_opts_src_lladdr,
						     skb->dev);
			if (!lladdr) {
				ND_PRINTK(2, warn,
					  "RA: invalid link-layer address length\n");
				goto out;
			}
		}
		ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
			     NEIGH_UPDATE_F_WEAK_OVERRIDE|
			     NEIGH_UPDATE_F_OVERRIDE|
			     NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
			     NEIGH_UPDATE_F_ISROUTER,
			     NDISC_ROUTER_ADVERTISEMENT, &ndopts);
		reason = SKB_CONSUMED;
	}

	if (!ipv6_accept_ra(in6_dev)) {
		ND_PRINTK(2, info,
			  "RA: %s, accept_ra is false for dev: %s\n",
			  __func__, skb->dev->name);
		goto out;
	}

#ifdef CONFIG_IPV6_ROUTE_INFO
	if (!READ_ONCE(in6_dev->cnf.accept_ra_from_local) &&
	    ipv6_chk_addr(dev_net(in6_dev->dev), &ipv6_hdr(skb)->saddr,
			  in6_dev->dev, 0)) {
		ND_PRINTK(2, info,
			  "RA from local address detected on dev: %s: router info ignored.\n",
			  skb->dev->name);
		goto skip_routeinfo;
	}

	if (READ_ONCE(in6_dev->cnf.accept_ra_rtr_pref) && ndopts.nd_opts_ri) {
		struct nd_opt_hdr *p;
		/* walk every route information option and filter it by the device settings */
		for (p = ndopts.nd_opts_ri;
		     p;
		     p = ndisc_next_option(p, ndopts.nd_opts_ri_end)) {
			struct route_info *ri = (struct route_info *)p;
#ifdef CONFIG_IPV6_NDISC_NODETYPE
			if (skb->ndisc_nodetype == NDISC_NODETYPE_NODEFAULT &&
			    ri->prefix_len == 0)
				continue;
#endif
			if (ri->prefix_len == 0 &&
			    !READ_ONCE(in6_dev->cnf.accept_ra_defrtr))
				continue;
			if (ri->lifetime != 0 &&
			    ntohl(ri->lifetime) < READ_ONCE(in6_dev->cnf.accept_ra_min_lft))
				continue;
			if (ri->prefix_len < READ_ONCE(in6_dev->cnf.accept_ra_rt_info_min_plen))
				continue;
			if (ri->prefix_len > READ_ONCE(in6_dev->cnf.accept_ra_rt_info_max_plen))
				continue;
			rt6_route_rcv(skb->dev, (u8 *)p, (p->nd_opt_len) << 3,
				      &ipv6_hdr(skb)->saddr);
		}
	}

skip_routeinfo:
#endif

#ifdef CONFIG_IPV6_NDISC_NODETYPE
	/* skip link-specific ndopts from interior routers */
	if (skb->ndisc_nodetype == NDISC_NODETYPE_NODEFAULT) {
		ND_PRINTK(2, info,
			  "RA: %s, nodetype is NODEFAULT (interior routes), dev: %s\n",
			  __func__,
skb->dev->name);
		goto out;
	}
#endif

	if (READ_ONCE(in6_dev->cnf.accept_ra_pinfo) && ndopts.nd_opts_pi) {
		struct nd_opt_hdr *p;
		for (p = ndopts.nd_opts_pi;
		     p;
		     p = ndisc_next_option(p, ndopts.nd_opts_pi_end)) {
			addrconf_prefix_rcv(skb->dev, (u8 *)p,
					    (p->nd_opt_len) << 3,
					    ndopts.nd_opts_src_lladdr != NULL);
		}
	}

	if (ndopts.nd_opts_mtu && READ_ONCE(in6_dev->cnf.accept_ra_mtu)) {
		__be32 n;
		u32 mtu;

		/* the 32-bit MTU value sits 2 bytes past the option header */
		memcpy(&n, ((u8 *)(ndopts.nd_opts_mtu+1))+2, sizeof(mtu));
		mtu = ntohl(n);

		/* ra_mtu records the advertised value even when it is rejected below */
		if (in6_dev->ra_mtu != mtu) {
			in6_dev->ra_mtu = mtu;
			send_ifinfo_notify = true;
		}

		if (mtu < IPV6_MIN_MTU || mtu > skb->dev->mtu) {
			ND_PRINTK(2, warn, "RA: invalid mtu: %d\n", mtu);
		} else if (READ_ONCE(in6_dev->cnf.mtu6) != mtu) {
			WRITE_ONCE(in6_dev->cnf.mtu6, mtu);
			fib6_metric_set(rt, RTAX_MTU, mtu);
			rt6_mtu_change(skb->dev, mtu);
		}
	}

	if (ndopts.nd_useropts) {
		struct nd_opt_hdr *p;
		for (p = ndopts.nd_useropts;
		     p;
		     p = ndisc_next_useropt(skb->dev, p,
					    ndopts.nd_useropts_end)) {
			ndisc_ra_useropt(skb, p);
		}
	}

	if (ndopts.nd_opts_tgt_lladdr || ndopts.nd_opts_rh) {
		ND_PRINTK(2, warn, "RA: invalid RA options\n");
	}
out:
	/* Send a notify if RA changed managed/otherconf flags or
	 * timer settings or ra_mtu value
	 */
	if (send_ifinfo_notify)
		inet6_ifinfo_notify(RTM_NEWLINK, in6_dev);

	/* NOTE(review): rt may be NULL here - confirm fib6_info_release() tolerates that */
	fib6_info_release(rt);
	if (neigh)
		neigh_release(neigh);
	return reason;
}

/*
 * Handle a received Redirect.
 *
 * The source must be link-local.  Without a redirected-header option the
 * packet goes to ip6_redirect_no_header(); otherwise the skb is pulled up to
 * 8 bytes past the start of that option and handed to icmpv6_notify().
 */
static enum skb_drop_reason ndisc_redirect_rcv(struct sk_buff *skb)
{
	struct rd_msg *msg = (struct rd_msg *)skb_transport_header(skb);
	u32 ndoptlen = skb_tail_pointer(skb) - (skb_transport_header(skb) +
				    offsetof(struct rd_msg, opt));
	struct ndisc_options ndopts;
	SKB_DR(reason);
	u8 *hdr;

#ifdef CONFIG_IPV6_NDISC_NODETYPE
	switch (skb->ndisc_nodetype) {
	case NDISC_NODETYPE_HOST:
	case NDISC_NODETYPE_NODEFAULT:
		ND_PRINTK(2, warn,
			  "Redirect: from host or unauthorized router\n");
		return reason;
	}
#endif

	if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) {
		ND_PRINTK(2, warn,
			  "Redirect: source address is not link-local\n");
		return reason;
	}

	if
(!ndisc_parse_options(skb->dev, msg->opt, ndoptlen, &ndopts))
		return SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS;

	if (!ndopts.nd_opts_rh) {
		ip6_redirect_no_header(skb, dev_net(skb->dev),
					skb->dev->ifindex);
		return reason;
	}

	/* skip the 8-byte header of the redirected-header option */
	hdr = (u8 *)ndopts.nd_opts_rh;
	hdr += 8;
	if (!pskb_pull(skb, hdr - skb_transport_header(skb)))
		return SKB_DROP_REASON_PKT_TOO_SMALL;

	return icmpv6_notify(skb, NDISC_REDIRECT, 0, 0);
}

/*
 * Append a redirected-header option of @rd_len bytes to @skb: an 8-byte
 * header (type ND_OPT_REDIRECT_HDR, length in 8-byte units, six zero bytes)
 * followed by rd_len - 8 bytes copied from @orig_skb, starting at its
 * network header offset.
 */
static void ndisc_fill_redirect_hdr_option(struct sk_buff *skb,
					   struct sk_buff *orig_skb,
					   int rd_len)
{
	u8 *opt = skb_put(skb, rd_len);

	memset(opt, 0, 8);
	*(opt++) = ND_OPT_REDIRECT_HDR;
	*(opt++) = (rd_len >> 3);
	opt += 6;

	skb_copy_bits(orig_skb, skb_network_offset(orig_skb), opt,
		      rd_len - 8);
}

/*
 * Build a Redirect that names @target for the destination of @skb and send
 * it to the source of @skb.
 *
 * Nothing is sent when the device has no usable link-local address, when
 * @target is neither the packet destination nor a link-local unicast
 * address, when the route back to the sender has RTF_GATEWAY set, or when
 * inet_peer_xrlim_allow() refuses the send.
 */
void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target)
{
	struct net_device *dev = skb->dev;
	struct net *net = dev_net_rcu(dev);
	struct sock *sk = net->ipv6.ndisc_sk;
	int optlen = 0;
	struct inet_peer *peer;
	struct sk_buff *buff;
	struct rd_msg *msg;
	struct in6_addr saddr_buf;
	struct rt6_info *rt;
	struct dst_entry *dst;
	struct flowi6 fl6;
	int rd_len;
	u8 ha_buf[MAX_ADDR_LEN], *ha = NULL,
	   ops_data_buf[NDISC_OPS_REDIRECT_DATA_SPACE], *ops_data = NULL;
	bool ret;

	/* for an l3 master device, switch to the device recorded as iif in the skb cb */
	if (netif_is_l3_master(dev)) {
		dev = dev_get_by_index_rcu(net, IPCB(skb)->iif);
		if (!dev)
			return;
	}

	if (ipv6_get_lladdr(dev, &saddr_buf, IFA_F_TENTATIVE)) {
		ND_PRINTK(2, warn, "Redirect: no link-local address on %s\n",
			  dev->name);
		return;
	}

	if (!ipv6_addr_equal(&ipv6_hdr(skb)->daddr, target) &&
	    ipv6_addr_type(target) != (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
		ND_PRINTK(2, warn,
			  "Redirect: target address is not link-local unicast\n");
		return;
	}

	icmpv6_flow_init(sk, &fl6, NDISC_REDIRECT,
			 &saddr_buf, &ipv6_hdr(skb)->saddr, dev->ifindex);

	dst = ip6_route_output(net, NULL, &fl6);
	if (dst->error) {
		dst_release(dst);
		return;
	}
	dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0);
	if (IS_ERR(dst))
		return;

	rt = dst_rt6_info(dst);

	if (rt->rt6i_flags & RTF_GATEWAY) {
		ND_PRINTK(2, warn,
			  "Redirect:
destination is not a neighbour\n");
		goto release;
	}

	/* limit redirects per sender through the inet_peer entry for its address */
	peer = inet_getpeer_v6(net->ipv6.peers, &ipv6_hdr(skb)->saddr);
	ret = inet_peer_xrlim_allow(peer, 1*HZ);

	if (!ret)
		goto release;

	if (dev->addr_len) {
		struct neighbour *neigh = dst_neigh_lookup(skb_dst(skb), target);
		if (!neigh) {
			ND_PRINTK(2, warn,
				  "Redirect: no neigh for target address\n");
			goto release;
		}

		/* copy the hardware address out while holding the neighbour read lock */
		read_lock_bh(&neigh->lock);
		if (neigh->nud_state & NUD_VALID) {
			memcpy(ha_buf, neigh->ha, dev->addr_len);
			read_unlock_bh(&neigh->lock);
			ha = ha_buf;
			optlen += ndisc_redirect_opt_addr_space(dev, neigh,
								ops_data_buf,
								&ops_data);
		} else
			read_unlock_bh(&neigh->lock);

		neigh_release(neigh);
	}

	/* redirected-header size: capped by IPV6_MIN_MTU, rounded down to a multiple of 8 */
	rd_len = min_t(unsigned int,
		       IPV6_MIN_MTU - sizeof(struct ipv6hdr) - sizeof(*msg) - optlen,
		       skb->len + 8);
	rd_len &= ~0x7;
	optlen += rd_len;

	buff = ndisc_alloc_skb(dev, sizeof(*msg) + optlen);
	if (!buff)
		goto release;

	msg = skb_put(buff, sizeof(*msg));
	*msg = (struct rd_msg) {
		.icmph = {
			.icmp6_type = NDISC_REDIRECT,
		},
		.target = *target,
		.dest = ipv6_hdr(skb)->daddr,
	};

	/*
	 *	include target_address option
	 */

	if (ha)
		ndisc_fill_redirect_addr_option(buff, ha, ops_data);

	/*
	 *	build redirect option and copy skb over to the new packet.
	 */

	if (rd_len)
		ndisc_fill_redirect_hdr_option(buff, skb, rd_len);

	/* dst is attached to buff here, so this path does not call dst_release() */
	skb_dst_set(buff, dst);
	ndisc_send_skb(buff, &ipv6_hdr(skb)->saddr, &saddr_buf);
	return;

release:
	dst_release(dst);
}

/*
 * Run ndisc_recv_ns() on @skb and free the buffer with the reason it
 * returned.
 */
static void pndisc_redo(struct sk_buff *skb)
{
	enum skb_drop_reason reason = ndisc_recv_ns(skb);

	kfree_skb_reason(skb, reason);
}

/* Report whether the key @pkey, read as an IPv6 address, is multicast. */
static int ndisc_is_multicast(const void *pkey)
{
	return ipv6_addr_is_multicast((struct in6_addr *)pkey);
}

/*
 * Returns true when @skb must not be processed: either its device has no
 * inet6 device, or the packet is flagged IP6SKB_FRAGMENTED while
 * suppress_frag_ndisc is enabled on the device.
 */
static bool ndisc_suppress_frag_ndisc(struct sk_buff *skb)
{
	struct inet6_dev *idev = __in6_dev_get(skb->dev);

	if (!idev)
		return true;
	if (IP6CB(skb)->flags & IP6SKB_FRAGMENTED &&
	    READ_ONCE(idev->cnf.suppress_frag_ndisc)) {
		net_warn_ratelimited("Received fragmented ndisc packet.
Carefully consider disabling suppress_frag_ndisc.\n");
		return true;
	}
	return false;
}

/*
 * Entry point for received neighbour discovery packets.
 *
 * Rejects fragmented packets (see ndisc_suppress_frag_ndisc()), packets
 * whose hop limit is not 255 and packets with a non-zero ICMPv6 code, then
 * dispatches on the ICMPv6 type.  Returns the reason produced by the
 * per-type handler, or the initial reason for a type not listed here.
 */
enum skb_drop_reason ndisc_rcv(struct sk_buff *skb)
{
	struct nd_msg *msg;
	SKB_DR(reason);

	if (ndisc_suppress_frag_ndisc(skb))
		return SKB_DROP_REASON_IPV6_NDISC_FRAG;

	if (skb_linearize(skb))
		return SKB_DROP_REASON_NOMEM;

	msg = (struct nd_msg *)skb_transport_header(skb);

	/* move skb->data back to the transport header */
	__skb_push(skb, skb->data - skb_transport_header(skb));

	if (ipv6_hdr(skb)->hop_limit != 255) {
		ND_PRINTK(2, warn, "NDISC: invalid hop-limit: %d\n",
			  ipv6_hdr(skb)->hop_limit);
		return SKB_DROP_REASON_IPV6_NDISC_HOP_LIMIT;
	}

	if (msg->icmph.icmp6_code != 0) {
		ND_PRINTK(2, warn, "NDISC: invalid ICMPv6 code: %d\n",
			  msg->icmph.icmp6_code);
		return SKB_DROP_REASON_IPV6_NDISC_BAD_CODE;
	}

	switch (msg->icmph.icmp6_type) {
	case NDISC_NEIGHBOUR_SOLICITATION:
		/* the NS handler reads NEIGH_CB(skb)->flags, so start from a clean cb */
		memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb));
		reason = ndisc_recv_ns(skb);
		break;

	case NDISC_NEIGHBOUR_ADVERTISEMENT:
		reason = ndisc_recv_na(skb);
		break;

	case NDISC_ROUTER_SOLICITATION:
		reason = ndisc_recv_rs(skb);
		break;

	case NDISC_ROUTER_ADVERTISEMENT:
		reason = ndisc_router_discovery(skb);
		break;

	case NDISC_REDIRECT:
		reason = ndisc_redirect_rcv(skb);
		break;
	}

	return reason;
}

/*
 * Netdevice notifier callback: keeps nd_tbl in step with device events and
 * sends unsolicited advertisements where configured.  Always returns
 * NOTIFY_DONE.
 */
static int ndisc_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct netdev_notifier_change_info *change_info;
	struct net *net = dev_net(dev);
	struct inet6_dev *idev;
	bool evict_nocarrier;

	switch (event) {
	case NETDEV_CHANGEADDR:
		neigh_changeaddr(&nd_tbl, dev);
		fib6_run_gc(0, net, false);
		fallthrough;
	case NETDEV_UP:
		idev = in6_dev_get(dev);
		if (!idev)
			break;
		if (READ_ONCE(idev->cnf.ndisc_notify) ||
		    READ_ONCE(net->ipv6.devconf_all->ndisc_notify))
			ndisc_send_unsol_na(dev);
		in6_dev_put(idev);
		break;
	case NETDEV_CHANGE:
		idev = in6_dev_get(dev);
		/* without an inet6 device, eviction on carrier loss defaults to enabled */
		if (!idev)
			evict_nocarrier = true;
		else {
			evict_nocarrier = READ_ONCE(idev->cnf.ndisc_evict_nocarrier) &&
					  READ_ONCE(net->ipv6.devconf_all->ndisc_evict_nocarrier);
in6_dev_put(idev);
		}

		change_info = ptr;
		if (change_info->flags_changed & IFF_NOARP)
			neigh_changeaddr(&nd_tbl, dev);
		if (evict_nocarrier && !netif_carrier_ok(dev))
			neigh_carrier_down(&nd_tbl, dev);
		break;
	case NETDEV_DOWN:
		neigh_ifdown(&nd_tbl, dev);
		fib6_run_gc(0, net, false);
		break;
	case NETDEV_NOTIFY_PEERS:
		ndisc_send_unsol_na(dev);
		break;
	default:
		break;
	}

	return NOTIFY_DONE;
}

/* Notifier block that hooks ndisc_netdev_event() into netdevice events. */
static struct notifier_block ndisc_netdev_notifier = {
	.notifier_call = ndisc_netdev_event,
	.priority = ADDRCONF_NOTIFY_PRIORITY - 5,
};

#ifdef CONFIG_SYSCTL
/*
 * Print a deprecation warning naming the calling task and the sysctl it
 * used.  The warning is skipped when the task name equals the one from the
 * previous warning, and once five warnings have been printed.
 */
static void ndisc_warn_deprecated_sysctl(const struct ctl_table *ctl,
					 const char *func, const char *dev_name)
{
	static char warncomm[TASK_COMM_LEN];
	static int warned;
	if (strcmp(warncomm, current->comm) && warned < 5) {
		strscpy(warncomm, current->comm);
		pr_warn("process `%s' is using deprecated sysctl (%s) net.ipv6.neigh.%s.%s - use net.ipv6.neigh.%s.%s_ms instead\n",
			warncomm, func,
			dev_name, ctl->procname,
			dev_name, ctl->procname);
		warned++;
	}
}

/*
 * sysctl handler for retrans_time, base_reachable_time and their _ms
 * variants.  It picks the neigh_proc_dointvec*() helper by procname (any
 * other name yields -1) and, after a successful write on a table bound to a
 * device, refreshes the device timestamp and sends an RTM_NEWLINK
 * notification.
 */
int ndisc_ifinfo_sysctl_change(const struct ctl_table *ctl, int write,
		void *buffer, size_t *lenp, loff_t *ppos)
{
	/* extra1 holds the device; NULL is reported as "default" in the warning */
	struct net_device *dev = ctl->extra1;
	struct inet6_dev *idev;
	int ret;

	if ((strcmp(ctl->procname, "retrans_time") == 0) ||
	    (strcmp(ctl->procname, "base_reachable_time") == 0))
		ndisc_warn_deprecated_sysctl(ctl, "syscall", dev ?
dev->name : "default");

	if (strcmp(ctl->procname, "retrans_time") == 0)
		ret = neigh_proc_dointvec(ctl, write, buffer, lenp, ppos);

	else if (strcmp(ctl->procname, "base_reachable_time") == 0)
		ret = neigh_proc_dointvec_jiffies(ctl, write,
						  buffer, lenp, ppos);

	else if ((strcmp(ctl->procname, "retrans_time_ms") == 0) ||
		 (strcmp(ctl->procname, "base_reachable_time_ms") == 0))
		ret = neigh_proc_dointvec_ms_jiffies(ctl, write,
						     buffer, lenp, ppos);
	else
		ret = -1;

	if (write && ret == 0 && dev && (idev = in6_dev_get(dev)) != NULL) {
		/* a new base reachable time also re-randomises reachable_time */
		if (ctl->data == &NEIGH_VAR(idev->nd_parms, BASE_REACHABLE_TIME))
			idev->nd_parms->reachable_time =
					neigh_rand_reach_time(NEIGH_VAR(idev->nd_parms, BASE_REACHABLE_TIME));
		WRITE_ONCE(idev->tstamp, jiffies);
		inet6_ifinfo_notify(RTM_NEWLINK, idev);
		in6_dev_put(idev);
	}
	return ret;
}


#endif

/*
 * Per-namespace init: create the raw ICMPv6 control socket, store it in
 * net->ipv6.ndisc_sk, set its hop limit to 255 and clear MC6_LOOP.
 * Returns 0 or the error from inet_ctl_sock_create().
 */
static int __net_init ndisc_net_init(struct net *net)
{
	struct ipv6_pinfo *np;
	struct sock *sk;
	int err;

	err = inet_ctl_sock_create(&sk, PF_INET6,
				   SOCK_RAW, IPPROTO_ICMPV6, net);
	if (err < 0) {
		ND_PRINTK(0, err,
			  "NDISC: Failed to initialize the control socket (err %d)\n",
			  err);
		return err;
	}

	net->ipv6.ndisc_sk = sk;

	np = inet6_sk(sk);
	np->hop_limit = 255;
	/* Do not loopback ndisc messages */
	inet6_clear_bit(MC6_LOOP, sk);

	return 0;
}

/* Per-namespace exit: destroy the control socket created at init. */
static void __net_exit ndisc_net_exit(struct net *net)
{
	inet_ctl_sock_destroy(net->ipv6.ndisc_sk);
}

/* Per-namespace init and exit callbacks. */
static struct pernet_operations ndisc_net_ops = {
	.init = ndisc_net_init,
	.exit = ndisc_net_exit,
};

/*
 * Register the per-namespace operations, initialise nd_tbl and, with
 * CONFIG_SYSCTL, register the default neighbour sysctls.  If the sysctl
 * registration fails, the per-namespace operations are unregistered again
 * and the error is returned.
 */
int __init ndisc_init(void)
{
	int err;

	err = register_pernet_subsys(&ndisc_net_ops);
	if (err)
		return err;
	/*
	 * Initialize the neighbour table
	 */
	neigh_table_init(NEIGH_ND_TABLE, &nd_tbl);

#ifdef CONFIG_SYSCTL
	err = neigh_sysctl_register(NULL, &nd_tbl.parms,
				    ndisc_ifinfo_sysctl_change);
	if (err)
		goto out_unregister_pernet;
out:
#endif
	return err;

#ifdef CONFIG_SYSCTL
out_unregister_pernet:
	unregister_pernet_subsys(&ndisc_net_ops);
	goto out;
#endif
}

/* Register the netdevice notifier; returns the registration result. */
int __init ndisc_late_init(void)
{
	return
register_netdevice_notifier(&ndisc_netdev_notifier);
}

/* Unregister the netdevice notifier. */
void ndisc_late_cleanup(void)
{
	unregister_netdevice_notifier(&ndisc_netdev_notifier);
}

/*
 * Tear down in this order: the default neighbour sysctls (with
 * CONFIG_SYSCTL), the nd_tbl neighbour table, then the per-namespace
 * operations.
 */
void ndisc_cleanup(void)
{
#ifdef CONFIG_SYSCTL
	neigh_sysctl_unregister(&nd_tbl.parms);
#endif
	neigh_table_clear(NEIGH_ND_TABLE, &nd_tbl);
	unregister_pernet_subsys(&ndisc_net_ops);
}
|
11 3 2 6 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2013 Patrick McHardy <kaber@trash.net> */ #include <linux/netfilter_ipv6/ip6_tables.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_SYNPROXY.h> #include <net/netfilter/nf_synproxy.h> static unsigned int synproxy_tg6(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_synproxy_info *info = par->targinfo; struct net *net = xt_net(par); struct synproxy_net *snet = synproxy_pernet(net); struct synproxy_options opts = {}; struct tcphdr *th, _th; if (nf_ip6_checksum(skb, xt_hooknum(par), par->thoff, IPPROTO_TCP)) return NF_DROP; th = skb_header_pointer(skb, par->thoff, sizeof(_th), &_th); if (th == NULL) return NF_DROP; if (!synproxy_parse_options(skb, par->thoff, th, &opts)) return NF_DROP; if (th->syn && !(th->ack || th->fin || th->rst)) { /* Initial SYN from client */ this_cpu_inc(snet->stats->syn_received); if (th->ece && th->cwr) opts.options |= XT_SYNPROXY_OPT_ECN; opts.options &= info->options; opts.mss_encode = opts.mss_option; opts.mss_option = info->mss; if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) synproxy_init_timestamp_cookie(info, &opts); else opts.options &= ~(XT_SYNPROXY_OPT_WSCALE | XT_SYNPROXY_OPT_SACK_PERM | XT_SYNPROXY_OPT_ECN); synproxy_send_client_synack_ipv6(net, skb, th, &opts); consume_skb(skb); return NF_STOLEN; } else if (th->ack && !(th->fin || th->rst || th->syn)) { /* ACK from client */ if (synproxy_recv_client_ack_ipv6(net, skb, th, &opts, ntohl(th->seq))) { consume_skb(skb); return NF_STOLEN; } else { return NF_DROP; } } return XT_CONTINUE; 
} static int synproxy_tg6_check(const struct xt_tgchk_param *par) { struct synproxy_net *snet = synproxy_pernet(par->net); const struct ip6t_entry *e = par->entryinfo; int err; if (!(e->ipv6.flags & IP6T_F_PROTO) || e->ipv6.proto != IPPROTO_TCP || e->ipv6.invflags & XT_INV_PROTO) return -EINVAL; err = nf_ct_netns_get(par->net, par->family); if (err) return err; err = nf_synproxy_ipv6_init(snet, par->net); if (err) { nf_ct_netns_put(par->net, par->family); return err; } return err; } static void synproxy_tg6_destroy(const struct xt_tgdtor_param *par) { struct synproxy_net *snet = synproxy_pernet(par->net); nf_synproxy_ipv6_fini(snet, par->net); nf_ct_netns_put(par->net, par->family); } static struct xt_target synproxy_tg6_reg __read_mostly = { .name = "SYNPROXY", .family = NFPROTO_IPV6, .hooks = (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD), .target = synproxy_tg6, .targetsize = sizeof(struct xt_synproxy_info), .checkentry = synproxy_tg6_check, .destroy = synproxy_tg6_destroy, .me = THIS_MODULE, }; static int __init synproxy_tg6_init(void) { return xt_register_target(&synproxy_tg6_reg); } static void __exit synproxy_tg6_exit(void) { xt_unregister_target(&synproxy_tg6_reg); } module_init(synproxy_tg6_init); module_exit(synproxy_tg6_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); MODULE_DESCRIPTION("Intercept IPv6 TCP connections and establish them using syncookies"); |
2 1 1 40 40 40 23 49 49 2 47 43 18 2 39 23 39 39 36 2 39 23 17 23 17 39 23 17 39 24 17 3 9 9 3 2 3 1 3 3 3 3 1 4 2 2 2 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 
497 498 499 500 501 502 503 504 | // SPDX-License-Identifier: GPL-2.0-or-later /* * IPV4 GSO/GRO offload support * Linux INET implementation * * TCPv4 GSO/GRO support */ #include <linux/indirect_call_wrapper.h> #include <linux/skbuff.h> #include <net/gro.h> #include <net/gso.h> #include <net/tcp.h> #include <net/protocol.h> static void tcp_gso_tstamp(struct sk_buff *skb, struct sk_buff *gso_skb, unsigned int seq, unsigned int mss) { u32 flags = skb_shinfo(gso_skb)->tx_flags & SKBTX_ANY_TSTAMP; u32 ts_seq = skb_shinfo(gso_skb)->tskey; while (skb) { if (before(ts_seq, seq + mss)) { skb_shinfo(skb)->tx_flags |= flags; skb_shinfo(skb)->tskey = ts_seq; return; } skb = skb->next; seq += mss; } } static void __tcpv4_gso_segment_csum(struct sk_buff *seg, __be32 *oldip, __be32 newip, __be16 *oldport, __be16 newport) { struct tcphdr *th; struct iphdr *iph; if (*oldip == newip && *oldport == newport) return; th = tcp_hdr(seg); iph = ip_hdr(seg); inet_proto_csum_replace4(&th->check, seg, *oldip, newip, true); inet_proto_csum_replace2(&th->check, seg, *oldport, newport, false); *oldport = newport; csum_replace4(&iph->check, *oldip, newip); *oldip = newip; } static struct sk_buff *__tcpv4_gso_segment_list_csum(struct sk_buff *segs) { const struct tcphdr *th; const struct iphdr *iph; struct sk_buff *seg; struct tcphdr *th2; struct iphdr *iph2; seg = segs; th = tcp_hdr(seg); iph = ip_hdr(seg); th2 = tcp_hdr(seg->next); iph2 = ip_hdr(seg->next); if (!(*(const u32 *)&th->source ^ *(const u32 *)&th2->source) && iph->daddr == iph2->daddr && iph->saddr == iph2->saddr) return segs; while ((seg = seg->next)) { th2 = tcp_hdr(seg); iph2 = ip_hdr(seg); __tcpv4_gso_segment_csum(seg, &iph2->saddr, iph->saddr, &th2->source, th->source); __tcpv4_gso_segment_csum(seg, &iph2->daddr, iph->daddr, &th2->dest, th->dest); } return segs; } static struct sk_buff *__tcp4_gso_segment_list(struct sk_buff *skb, netdev_features_t features) { skb = skb_segment_list(skb, features, skb_mac_header_len(skb)); if 
(IS_ERR(skb)) return skb; return __tcpv4_gso_segment_list_csum(skb); } static struct sk_buff *tcp4_gso_segment(struct sk_buff *skb, netdev_features_t features) { if (!(skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4)) return ERR_PTR(-EINVAL); if (!pskb_may_pull(skb, sizeof(struct tcphdr))) return ERR_PTR(-EINVAL); if (skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST) { struct tcphdr *th = tcp_hdr(skb); if (skb_pagelen(skb) - th->doff * 4 == skb_shinfo(skb)->gso_size) return __tcp4_gso_segment_list(skb, features); skb->ip_summed = CHECKSUM_NONE; } if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) { const struct iphdr *iph = ip_hdr(skb); struct tcphdr *th = tcp_hdr(skb); /* Set up checksum pseudo header, usually expect stack to * have done this already. */ th->check = 0; skb->ip_summed = CHECKSUM_PARTIAL; __tcp_v4_send_check(skb, iph->saddr, iph->daddr); } return tcp_gso_segment(skb, features); } struct sk_buff *tcp_gso_segment(struct sk_buff *skb, netdev_features_t features) { struct sk_buff *segs = ERR_PTR(-EINVAL); unsigned int sum_truesize = 0; struct tcphdr *th; unsigned int thlen; unsigned int seq; unsigned int oldlen; unsigned int mss; struct sk_buff *gso_skb = skb; __sum16 newcheck; bool ooo_okay, copy_destructor; bool ecn_cwr_mask; __wsum delta; th = tcp_hdr(skb); thlen = th->doff * 4; if (thlen < sizeof(*th)) goto out; if (unlikely(skb_checksum_start(skb) != skb_transport_header(skb))) goto out; if (!pskb_may_pull(skb, thlen)) goto out; oldlen = ~skb->len; __skb_pull(skb, thlen); mss = skb_shinfo(skb)->gso_size; if (unlikely(skb->len <= mss)) goto out; if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) { /* Packet is from an untrusted source, reset gso_segs. 
*/ skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss); segs = NULL; goto out; } copy_destructor = gso_skb->destructor == tcp_wfree; ooo_okay = gso_skb->ooo_okay; /* All segments but the first should have ooo_okay cleared */ skb->ooo_okay = 0; segs = skb_segment(skb, features); if (IS_ERR(segs)) goto out; /* Only first segment might have ooo_okay set */ segs->ooo_okay = ooo_okay; /* GSO partial and frag_list segmentation only requires splitting * the frame into an MSS multiple and possibly a remainder, both * cases return a GSO skb. So update the mss now. */ if (skb_is_gso(segs)) mss *= skb_shinfo(segs)->gso_segs; delta = (__force __wsum)htonl(oldlen + thlen + mss); skb = segs; th = tcp_hdr(skb); seq = ntohl(th->seq); if (unlikely(skb_shinfo(gso_skb)->tx_flags & SKBTX_ANY_TSTAMP)) tcp_gso_tstamp(segs, gso_skb, seq, mss); newcheck = ~csum_fold(csum_add(csum_unfold(th->check), delta)); ecn_cwr_mask = !!(skb_shinfo(gso_skb)->gso_type & SKB_GSO_TCP_ACCECN); while (skb->next) { th->fin = th->psh = 0; th->check = newcheck; if (skb->ip_summed == CHECKSUM_PARTIAL) gso_reset_checksum(skb, ~th->check); else th->check = gso_make_checksum(skb, ~th->check); seq += mss; if (copy_destructor) { skb->destructor = gso_skb->destructor; skb->sk = gso_skb->sk; sum_truesize += skb->truesize; } skb = skb->next; th = tcp_hdr(skb); th->seq = htonl(seq); th->cwr &= ecn_cwr_mask; } /* Following permits TCP Small Queues to work well with GSO : * The callback to TCP stack will be called at the time last frag * is freed at TX completion, and not right now when gso_skb * is freed by GSO engine */ if (copy_destructor) { int delta; swap(gso_skb->sk, skb->sk); swap(gso_skb->destructor, skb->destructor); sum_truesize += skb->truesize; delta = sum_truesize - gso_skb->truesize; /* In some pathological cases, delta can be negative. 
* We need to either use refcount_add() or refcount_sub_and_test() */ if (likely(delta >= 0)) refcount_add(delta, &skb->sk->sk_wmem_alloc); else WARN_ON_ONCE(refcount_sub_and_test(-delta, &skb->sk->sk_wmem_alloc)); } delta = (__force __wsum)htonl(oldlen + (skb_tail_pointer(skb) - skb_transport_header(skb)) + skb->data_len); th->check = ~csum_fold(csum_add(csum_unfold(th->check), delta)); if (skb->ip_summed == CHECKSUM_PARTIAL) gso_reset_checksum(skb, ~th->check); else th->check = gso_make_checksum(skb, ~th->check); out: return segs; } struct sk_buff *tcp_gro_lookup(struct list_head *head, struct tcphdr *th) { struct tcphdr *th2; struct sk_buff *p; list_for_each_entry(p, head, list) { if (!NAPI_GRO_CB(p)->same_flow) continue; th2 = tcp_hdr(p); if (*(u32 *)&th->source ^ *(u32 *)&th2->source) { NAPI_GRO_CB(p)->same_flow = 0; continue; } return p; } return NULL; } struct tcphdr *tcp_gro_pull_header(struct sk_buff *skb) { unsigned int thlen, hlen, off; struct tcphdr *th; off = skb_gro_offset(skb); hlen = off + sizeof(*th); th = skb_gro_header(skb, hlen, off); if (unlikely(!th)) return NULL; thlen = th->doff * 4; if (thlen < sizeof(*th)) return NULL; hlen = off + thlen; if (!skb_gro_may_pull(skb, hlen)) { th = skb_gro_header_slow(skb, hlen, off); if (unlikely(!th)) return NULL; } skb_gro_pull(skb, thlen); return th; } struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb, struct tcphdr *th) { unsigned int thlen = th->doff * 4; struct sk_buff *pp = NULL; struct sk_buff *p; struct tcphdr *th2; unsigned int len; __be32 flags; unsigned int mss = 1; int flush = 1; int i; len = skb_gro_len(skb); flags = tcp_flag_word(th); p = tcp_gro_lookup(head, th); if (!p) goto out_check_final; th2 = tcp_hdr(p); flush = (__force int)(flags & TCP_FLAG_CWR); flush |= (__force int)((flags ^ tcp_flag_word(th2)) & ~(TCP_FLAG_FIN | TCP_FLAG_PSH)); flush |= (__force int)(th->ack_seq ^ th2->ack_seq); for (i = sizeof(*th); i < thlen; i += 4) flush |= *(u32 *)((u8 *)th + i) ^ 
*(u32 *)((u8 *)th2 + i); flush |= gro_receive_network_flush(th, th2, p); mss = skb_shinfo(p)->gso_size; /* If skb is a GRO packet, make sure its gso_size matches prior packet mss. * If it is a single frame, do not aggregate it if its length * is bigger than our mss. */ if (unlikely(skb_is_gso(skb))) flush |= (mss != skb_shinfo(skb)->gso_size); else flush |= (len - 1) >= mss; flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq); flush |= skb_cmp_decrypted(p, skb); if (unlikely(NAPI_GRO_CB(p)->is_flist)) { flush |= (__force int)(flags ^ tcp_flag_word(th2)); flush |= skb->ip_summed != p->ip_summed; flush |= skb->csum_level != p->csum_level; flush |= NAPI_GRO_CB(p)->count >= 64; if (flush || skb_gro_receive_list(p, skb)) mss = 1; goto out_check_final; } if (flush || skb_gro_receive(p, skb)) { mss = 1; goto out_check_final; } tcp_flag_word(th2) |= flags & (TCP_FLAG_FIN | TCP_FLAG_PSH); out_check_final: /* Force a flush if last segment is smaller than mss. */ if (unlikely(skb_is_gso(skb))) flush = len != NAPI_GRO_CB(skb)->count * skb_shinfo(skb)->gso_size; else flush = len < mss; flush |= (__force int)(flags & (TCP_FLAG_URG | TCP_FLAG_PSH | TCP_FLAG_RST | TCP_FLAG_SYN | TCP_FLAG_FIN)); if (p && (!NAPI_GRO_CB(skb)->same_flow || flush)) pp = p; NAPI_GRO_CB(skb)->flush |= (flush != 0); return pp; } void tcp_gro_complete(struct sk_buff *skb) { struct tcphdr *th = tcp_hdr(skb); struct skb_shared_info *shinfo; if (skb->encapsulation) skb->inner_transport_header = skb->transport_header; skb->csum_start = (unsigned char *)th - skb->head; skb->csum_offset = offsetof(struct tcphdr, check); skb->ip_summed = CHECKSUM_PARTIAL; shinfo = skb_shinfo(skb); shinfo->gso_segs = NAPI_GRO_CB(skb)->count; if (th->cwr) shinfo->gso_type |= SKB_GSO_TCP_ACCECN; } EXPORT_SYMBOL(tcp_gro_complete); static void tcp4_check_fraglist_gro(struct list_head *head, struct sk_buff *skb, struct tcphdr *th) { const struct iphdr *iph; struct sk_buff *p; struct sock *sk; struct net *net; int iif, sdif; if 
(likely(!(skb->dev->features & NETIF_F_GRO_FRAGLIST))) return; p = tcp_gro_lookup(head, th); if (p) { NAPI_GRO_CB(skb)->is_flist = NAPI_GRO_CB(p)->is_flist; return; } inet_get_iif_sdif(skb, &iif, &sdif); iph = skb_gro_network_header(skb); net = dev_net_rcu(skb->dev); sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, iph->saddr, th->source, iph->daddr, ntohs(th->dest), iif, sdif); NAPI_GRO_CB(skb)->is_flist = !sk; if (sk) sock_gen_put(sk); } INDIRECT_CALLABLE_SCOPE struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb) { struct tcphdr *th; /* Don't bother verifying checksum if we're going to flush anyway. */ if (!NAPI_GRO_CB(skb)->flush && skb_gro_checksum_validate(skb, IPPROTO_TCP, inet_gro_compute_pseudo)) goto flush; th = tcp_gro_pull_header(skb); if (!th) goto flush; tcp4_check_fraglist_gro(head, skb, th); return tcp_gro_receive(head, skb, th); flush: NAPI_GRO_CB(skb)->flush = 1; return NULL; } INDIRECT_CALLABLE_SCOPE int tcp4_gro_complete(struct sk_buff *skb, int thoff) { const u16 offset = NAPI_GRO_CB(skb)->network_offsets[skb->encapsulation]; const struct iphdr *iph = (struct iphdr *)(skb->data + offset); struct tcphdr *th = tcp_hdr(skb); if (unlikely(NAPI_GRO_CB(skb)->is_flist)) { skb_shinfo(skb)->gso_type |= SKB_GSO_FRAGLIST | SKB_GSO_TCPV4; skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count; __skb_incr_checksum_unnecessary(skb); return 0; } th->check = ~tcp_v4_check(skb->len - thoff, iph->saddr, iph->daddr, 0); skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4 | (NAPI_GRO_CB(skb)->ip_fixedid * SKB_GSO_TCP_FIXEDID); tcp_gro_complete(skb); return 0; } int __init tcpv4_offload_init(void) { net_hotdata.tcpv4_offload = (struct net_offload) { .callbacks = { .gso_segment = tcp4_gso_segment, .gro_receive = tcp4_gro_receive, .gro_complete = tcp4_gro_complete, }, }; return inet_add_offload(&net_hotdata.tcpv4_offload, IPPROTO_TCP); } |
26987 27480 7976 4146 4150 2159 26600 9484 463 538 385 463 121 121 6 52 36 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 
508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_FIND_H_ #define __LINUX_FIND_H_ #ifndef __LINUX_BITMAP_H #error only <linux/bitmap.h> can be included directly #endif #include <linux/bitops.h> unsigned long _find_next_bit(const unsigned long *addr1, unsigned long nbits, unsigned long start); unsigned long _find_next_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long nbits, unsigned long start); unsigned long _find_next_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long nbits, unsigned long start); unsigned long _find_next_or_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long nbits, unsigned long start); unsigned long _find_next_zero_bit(const unsigned long *addr, unsigned long nbits, unsigned long start); extern unsigned long _find_first_bit(const unsigned long *addr, unsigned long size); unsigned long __find_nth_bit(const unsigned long *addr, unsigned long size, unsigned long n); unsigned long __find_nth_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long n); unsigned long __find_nth_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long n); 
unsigned long __find_nth_and_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, const unsigned long *addr3, unsigned long size, unsigned long n); extern unsigned long _find_first_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size); unsigned long _find_first_and_and_bit(const unsigned long *addr1, const unsigned long *addr2, const unsigned long *addr3, unsigned long size); extern unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned long size); extern unsigned long _find_last_bit(const unsigned long *addr, unsigned long size); #ifdef __BIG_ENDIAN unsigned long _find_first_zero_bit_le(const unsigned long *addr, unsigned long size); unsigned long _find_next_zero_bit_le(const unsigned long *addr, unsigned long size, unsigned long offset); unsigned long _find_next_bit_le(const unsigned long *addr, unsigned long size, unsigned long offset); #endif #ifndef find_next_bit /** * find_next_bit - find the next set bit in a memory region * @addr: The address to base the search on * @size: The bitmap size in bits * @offset: The bitnumber to start searching at * * Returns the bit number for the next set bit * If no bits are set, returns @size. */ static __always_inline unsigned long find_next_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { if (small_const_nbits(size)) { unsigned long val; if (unlikely(offset >= size)) return size; val = *addr & GENMASK(size - 1, offset); return val ? __ffs(val) : size; } return _find_next_bit(addr, size, offset); } #endif #ifndef find_next_and_bit /** * find_next_and_bit - find the next set bit in both memory regions * @addr1: The first address to base the search on * @addr2: The second address to base the search on * @size: The bitmap size in bits * @offset: The bitnumber to start searching at * * Returns the bit number for the next set bit * If no bits are set, returns @size. 
*/ static __always_inline unsigned long find_next_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long offset) { if (small_const_nbits(size)) { unsigned long val; if (unlikely(offset >= size)) return size; val = *addr1 & *addr2 & GENMASK(size - 1, offset); return val ? __ffs(val) : size; } return _find_next_and_bit(addr1, addr2, size, offset); } #endif #ifndef find_next_andnot_bit /** * find_next_andnot_bit - find the next set bit in *addr1 excluding all the bits * in *addr2 * @addr1: The first address to base the search on * @addr2: The second address to base the search on * @size: The bitmap size in bits * @offset: The bitnumber to start searching at * * Returns the bit number for the next set bit * If no bits are set, returns @size. */ static __always_inline unsigned long find_next_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long offset) { if (small_const_nbits(size)) { unsigned long val; if (unlikely(offset >= size)) return size; val = *addr1 & ~*addr2 & GENMASK(size - 1, offset); return val ? __ffs(val) : size; } return _find_next_andnot_bit(addr1, addr2, size, offset); } #endif #ifndef find_next_or_bit /** * find_next_or_bit - find the next set bit in either memory regions * @addr1: The first address to base the search on * @addr2: The second address to base the search on * @size: The bitmap size in bits * @offset: The bitnumber to start searching at * * Returns the bit number for the next set bit * If no bits are set, returns @size. */ static __always_inline unsigned long find_next_or_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long offset) { if (small_const_nbits(size)) { unsigned long val; if (unlikely(offset >= size)) return size; val = (*addr1 | *addr2) & GENMASK(size - 1, offset); return val ? 
__ffs(val) : size; } return _find_next_or_bit(addr1, addr2, size, offset); } #endif #ifndef find_next_zero_bit /** * find_next_zero_bit - find the next cleared bit in a memory region * @addr: The address to base the search on * @size: The bitmap size in bits * @offset: The bitnumber to start searching at * * Returns the bit number of the next zero bit * If no bits are zero, returns @size. */ static __always_inline unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { if (small_const_nbits(size)) { unsigned long val; if (unlikely(offset >= size)) return size; val = *addr | ~GENMASK(size - 1, offset); return val == ~0UL ? size : ffz(val); } return _find_next_zero_bit(addr, size, offset); } #endif #ifndef find_first_bit /** * find_first_bit - find the first set bit in a memory region * @addr: The address to start the search at * @size: The maximum number of bits to search * * Returns the bit number of the first set bit. * If no bits are set, returns @size. */ static __always_inline unsigned long find_first_bit(const unsigned long *addr, unsigned long size) { if (small_const_nbits(size)) { unsigned long val = *addr & GENMASK(size - 1, 0); return val ? __ffs(val) : size; } return _find_first_bit(addr, size); } #endif /** * find_nth_bit - find N'th set bit in a memory region * @addr: The address to start the search at * @size: The maximum number of bits to search * @n: The number of set bit, which position is needed, counting from 0 * * The following is semantically equivalent: * idx = find_nth_bit(addr, size, 0); * idx = find_first_bit(addr, size); * * Returns the bit number of the N'th set bit. * If no such, returns >= @size. */ static __always_inline unsigned long find_nth_bit(const unsigned long *addr, unsigned long size, unsigned long n) { if (n >= size) return size; if (small_const_nbits(size)) { unsigned long val = *addr & GENMASK(size - 1, 0); return val ? 
fns(val, n) : size; } return __find_nth_bit(addr, size, n); } /** * find_nth_and_bit - find N'th set bit in 2 memory regions * @addr1: The 1st address to start the search at * @addr2: The 2nd address to start the search at * @size: The maximum number of bits to search * @n: The number of set bit, which position is needed, counting from 0 * * Returns the bit number of the N'th set bit. * If no such, returns @size. */ static __always_inline unsigned long find_nth_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long n) { if (n >= size) return size; if (small_const_nbits(size)) { unsigned long val = *addr1 & *addr2 & GENMASK(size - 1, 0); return val ? fns(val, n) : size; } return __find_nth_and_bit(addr1, addr2, size, n); } /** * find_nth_andnot_bit - find N'th set bit in 2 memory regions, * flipping bits in 2nd region * @addr1: The 1st address to start the search at * @addr2: The 2nd address to start the search at * @size: The maximum number of bits to search * @n: The number of set bit, which position is needed, counting from 0 * * Returns the bit number of the N'th set bit. * If no such, returns @size. */ static __always_inline unsigned long find_nth_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long n) { if (n >= size) return size; if (small_const_nbits(size)) { unsigned long val = *addr1 & (~*addr2) & GENMASK(size - 1, 0); return val ? fns(val, n) : size; } return __find_nth_andnot_bit(addr1, addr2, size, n); } /** * find_nth_and_andnot_bit - find N'th set bit in 2 memory regions, * excluding those set in 3rd region * @addr1: The 1st address to start the search at * @addr2: The 2nd address to start the search at * @addr3: The 3rd address to start the search at * @size: The maximum number of bits to search * @n: The number of set bit, which position is needed, counting from 0 * * Returns the bit number of the N'th set bit. * If no such, returns @size. 
*/ static __always_inline unsigned long find_nth_and_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, const unsigned long *addr3, unsigned long size, unsigned long n) { if (n >= size) return size; if (small_const_nbits(size)) { unsigned long val = *addr1 & *addr2 & (~*addr3) & GENMASK(size - 1, 0); return val ? fns(val, n) : size; } return __find_nth_and_andnot_bit(addr1, addr2, addr3, size, n); } #ifndef find_first_and_bit /** * find_first_and_bit - find the first set bit in both memory regions * @addr1: The first address to base the search on * @addr2: The second address to base the search on * @size: The bitmap size in bits * * Returns the bit number for the next set bit * If no bits are set, returns @size. */ static __always_inline unsigned long find_first_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size) { if (small_const_nbits(size)) { unsigned long val = *addr1 & *addr2 & GENMASK(size - 1, 0); return val ? __ffs(val) : size; } return _find_first_and_bit(addr1, addr2, size); } #endif /** * find_first_and_and_bit - find the first set bit in 3 memory regions * @addr1: The first address to base the search on * @addr2: The second address to base the search on * @addr3: The third address to base the search on * @size: The bitmap size in bits * * Returns the bit number for the first set bit * If no bits are set, returns @size. */ static __always_inline unsigned long find_first_and_and_bit(const unsigned long *addr1, const unsigned long *addr2, const unsigned long *addr3, unsigned long size) { if (small_const_nbits(size)) { unsigned long val = *addr1 & *addr2 & *addr3 & GENMASK(size - 1, 0); return val ? 
__ffs(val) : size; } return _find_first_and_and_bit(addr1, addr2, addr3, size); } #ifndef find_first_zero_bit /** * find_first_zero_bit - find the first cleared bit in a memory region * @addr: The address to start the search at * @size: The maximum number of bits to search * * Returns the bit number of the first cleared bit. * If no bits are zero, returns @size. */ static __always_inline unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size) { if (small_const_nbits(size)) { unsigned long val = *addr | ~GENMASK(size - 1, 0); return val == ~0UL ? size : ffz(val); } return _find_first_zero_bit(addr, size); } #endif #ifndef find_last_bit /** * find_last_bit - find the last set bit in a memory region * @addr: The address to start the search at * @size: The number of bits to search * * Returns the bit number of the last set bit, or size. */ static __always_inline unsigned long find_last_bit(const unsigned long *addr, unsigned long size) { if (small_const_nbits(size)) { unsigned long val = *addr & GENMASK(size - 1, 0); return val ? __fls(val) : size; } return _find_last_bit(addr, size); } #endif /** * find_next_and_bit_wrap - find the next set bit in both memory regions * @addr1: The first address to base the search on * @addr2: The second address to base the search on * @size: The bitmap size in bits * @offset: The bitnumber to start searching at * * Returns the bit number for the next set bit, or first set bit up to @offset * If no bits are set, returns @size. */ static __always_inline unsigned long find_next_and_bit_wrap(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long offset) { unsigned long bit = find_next_and_bit(addr1, addr2, size, offset); if (bit < size || offset == 0) return bit; bit = find_first_and_bit(addr1, addr2, offset); return bit < offset ? 
bit : size; } /** * find_next_bit_wrap - find the next set bit in a memory region * @addr: The address to base the search on * @size: The bitmap size in bits * @offset: The bitnumber to start searching at * * Returns the bit number for the next set bit, or first set bit up to @offset * If no bits are set, returns @size. */ static __always_inline unsigned long find_next_bit_wrap(const unsigned long *addr, unsigned long size, unsigned long offset) { unsigned long bit = find_next_bit(addr, size, offset); if (bit < size || offset == 0) return bit; bit = find_first_bit(addr, offset); return bit < offset ? bit : size; } /* * Helper for for_each_set_bit_wrap(). Make sure you're doing right thing * before using it alone. */ static __always_inline unsigned long __for_each_wrap(const unsigned long *bitmap, unsigned long size, unsigned long start, unsigned long n) { unsigned long bit; /* If not wrapped around */ if (n > start) { /* and have a bit, just return it. */ bit = find_next_bit(bitmap, size, n); if (bit < size) return bit; /* Otherwise, wrap around and ... */ n = 0; } /* Search the other part. */ bit = find_next_bit(bitmap, start, n); return bit < start ? bit : size; } /** * find_next_clump8 - find next 8-bit clump with set bits in a memory region * @clump: location to store copy of found clump * @addr: address to base the search on * @size: bitmap size in number of bits * @offset: bit offset at which to start searching * * Returns the bit offset for the next set clump; the found clump value is * copied to the location pointed by @clump. If no bits are set, returns @size. 
*/ extern unsigned long find_next_clump8(unsigned long *clump, const unsigned long *addr, unsigned long size, unsigned long offset); #define find_first_clump8(clump, bits, size) \ find_next_clump8((clump), (bits), (size), 0) #if defined(__LITTLE_ENDIAN) static __always_inline unsigned long find_next_zero_bit_le(const void *addr, unsigned long size, unsigned long offset) { return find_next_zero_bit(addr, size, offset); } static __always_inline unsigned long find_next_bit_le(const void *addr, unsigned long size, unsigned long offset) { return find_next_bit(addr, size, offset); } static __always_inline unsigned long find_first_zero_bit_le(const void *addr, unsigned long size) { return find_first_zero_bit(addr, size); } #elif defined(__BIG_ENDIAN) #ifndef find_next_zero_bit_le static __always_inline unsigned long find_next_zero_bit_le(const void *addr, unsigned long size, unsigned long offset) { if (small_const_nbits(size)) { unsigned long val = *(const unsigned long *)addr; if (unlikely(offset >= size)) return size; val = swab(val) | ~GENMASK(size - 1, offset); return val == ~0UL ? size : ffz(val); } return _find_next_zero_bit_le(addr, size, offset); } #endif #ifndef find_first_zero_bit_le static __always_inline unsigned long find_first_zero_bit_le(const void *addr, unsigned long size) { if (small_const_nbits(size)) { unsigned long val = swab(*(const unsigned long *)addr) | ~GENMASK(size - 1, 0); return val == ~0UL ? size : ffz(val); } return _find_first_zero_bit_le(addr, size); } #endif #ifndef find_next_bit_le static __always_inline unsigned long find_next_bit_le(const void *addr, unsigned long size, unsigned long offset) { if (small_const_nbits(size)) { unsigned long val = *(const unsigned long *)addr; if (unlikely(offset >= size)) return size; val = swab(val) & GENMASK(size - 1, offset); return val ? 
__ffs(val) : size; } return _find_next_bit_le(addr, size, offset); } #endif #else #error "Please fix <asm/byteorder.h>" #endif #define for_each_set_bit(bit, addr, size) \ for ((bit) = 0; (bit) = find_next_bit((addr), (size), (bit)), (bit) < (size); (bit)++) #define for_each_and_bit(bit, addr1, addr2, size) \ for ((bit) = 0; \ (bit) = find_next_and_bit((addr1), (addr2), (size), (bit)), (bit) < (size);\ (bit)++) #define for_each_andnot_bit(bit, addr1, addr2, size) \ for ((bit) = 0; \ (bit) = find_next_andnot_bit((addr1), (addr2), (size), (bit)), (bit) < (size);\ (bit)++) #define for_each_or_bit(bit, addr1, addr2, size) \ for ((bit) = 0; \ (bit) = find_next_or_bit((addr1), (addr2), (size), (bit)), (bit) < (size);\ (bit)++) /* same as for_each_set_bit() but use bit as value to start with */ #define for_each_set_bit_from(bit, addr, size) \ for (; (bit) = find_next_bit((addr), (size), (bit)), (bit) < (size); (bit)++) #define for_each_clear_bit(bit, addr, size) \ for ((bit) = 0; \ (bit) = find_next_zero_bit((addr), (size), (bit)), (bit) < (size); \ (bit)++) /* same as for_each_clear_bit() but use bit as value to start with */ #define for_each_clear_bit_from(bit, addr, size) \ for (; (bit) = find_next_zero_bit((addr), (size), (bit)), (bit) < (size); (bit)++) /** * for_each_set_bitrange - iterate over all set bit ranges [b; e) * @b: bit offset of start of current bitrange (first set bit) * @e: bit offset of end of current bitrange (first unset bit) * @addr: bitmap address to base the search on * @size: bitmap size in number of bits */ #define for_each_set_bitrange(b, e, addr, size) \ for ((b) = 0; \ (b) = find_next_bit((addr), (size), b), \ (e) = find_next_zero_bit((addr), (size), (b) + 1), \ (b) < (size); \ (b) = (e) + 1) /** * for_each_set_bitrange_from - iterate over all set bit ranges [b; e) * @b: bit offset of start of current bitrange (first set bit); must be initialized * @e: bit offset of end of current bitrange (first unset bit) * @addr: bitmap address to base the 
search on
 * @size: bitmap size in number of bits
 *
 * Unlike for_each_set_bitrange(), @b is not reset to 0 first: the walk starts
 * at whatever value @b holds on entry.
 */
#define for_each_set_bitrange_from(b, e, addr, size) \
	for (; \
	     (b) = find_next_bit((addr), (size), (b)), \
	     (e) = find_next_zero_bit((addr), (size), (b) + 1), \
	     (b) < (size); \
	     (b) = (e) + 1)

/**
 * for_each_clear_bitrange - iterate over all unset bit ranges [b; e)
 * @b: bit offset of start of current bitrange (first unset bit)
 * @e: bit offset of end of current bitrange (first set bit)
 * @addr: bitmap address to base the search on
 * @size: bitmap size in number of bits
 *
 * Both @b and @e are assigned on every step and must be modifiable lvalues.
 * The next search resumes at @e + 1.
 */
#define for_each_clear_bitrange(b, e, addr, size) \
	for ((b) = 0; \
	     (b) = find_next_zero_bit((addr), (size), (b)), \
	     (e) = find_next_bit((addr), (size), (b) + 1), \
	     (b) < (size); \
	     (b) = (e) + 1)

/**
 * for_each_clear_bitrange_from - iterate over all unset bit ranges [b; e)
 * @b: bit offset of start of current bitrange (first unset bit); must be initialized
 * @e: bit offset of end of current bitrange (first set bit)
 * @addr: bitmap address to base the search on
 * @size: bitmap size in number of bits
 *
 * Unlike for_each_clear_bitrange(), @b is not reset to 0 first: the walk
 * starts at whatever value @b holds on entry.
 */
#define for_each_clear_bitrange_from(b, e, addr, size) \
	for (; \
	     (b) = find_next_zero_bit((addr), (size), (b)), \
	     (e) = find_next_bit((addr), (size), (b) + 1), \
	     (b) < (size); \
	     (b) = (e) + 1)

/**
 * for_each_set_bit_wrap - iterate over all set bits starting from @start, and
 * wrapping around the end of bitmap.
* @bit: offset for current iteration
 * @addr: bitmap address to base the search on
 * @size: bitmap size in number of bits
 * @start: Starting bit for bitmap traversing, wrapping around the bitmap end
 *
 * The first bit comes from find_next_bit_wrap(); every following one from
 * __for_each_wrap(), which is handed @start and the position after the
 * current bit. @bit is assigned on every step and must be a modifiable lvalue.
 */
#define for_each_set_bit_wrap(bit, addr, size, start) \
	for ((bit) = find_next_bit_wrap((addr), (size), (start)); \
	     (bit) < (size); \
	     (bit) = __for_each_wrap((addr), (size), (start), (bit) + 1))

/**
 * for_each_set_clump8 - iterate over bitmap for each 8-bit clump with set bits
 * @start: bit offset to start search and to store the current iteration offset
 * @clump: location to store copy of current 8-bit clump
 * @bits: bitmap address to base the search on
 * @size: bitmap size in number of bits
 *
 * @clump is passed by address (&(clump)), so it must name an object rather
 * than a pointer to one. Each step resumes the search 8 bits after @start.
 */
#define for_each_set_clump8(start, clump, bits, size) \
	for ((start) = find_first_clump8(&(clump), (bits), (size)); \
	     (start) < (size); \
	     (start) = find_next_clump8(&(clump), (bits), (size), (start) + 8))

#endif /*__LINUX_FIND_H_ */ |
50 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Descending-priority-sorted double-linked list * * (C) 2002-2003 Intel Corp * Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com>. * * 2001-2005 (c) MontaVista Software, Inc. * Daniel Walker <dwalker@mvista.com> * * (C) 2005 Thomas Gleixner <tglx@linutronix.de> * * Simplifications of the original code by * Oleg Nesterov <oleg@tv-sign.ru> * * Based on simple lists (include/linux/list.h). * * This is a priority-sorted list of nodes; each node has a * priority from INT_MIN (highest) to INT_MAX (lowest). * * Addition is O(K), removal is O(1), change of priority of a node is * O(K) and K is the number of RT priority levels used in the system. * (1 <= K <= 99) * * This list is really a list of lists: * * - The tier 1 list is the prio_list, different priority nodes. * * - The tier 2 list is the node_list, serialized nodes. 
*
 * Simple ASCII art explanation:
 *
 * pl:prio_list (only for plist_node)
 * nl:node_list
 *   HEAD|             NODE(S)
 *       |
 *       ||------------------------------------|
 *       ||->|pl|<->|pl|<--------------->|pl|<-|
 *       |   |10|   |21|   |21|   |21|   |40|   (prio)
 *       |   |  |   |  |   |  |   |  |   |  |
 *       |   |  |   |  |   |  |   |  |   |  |
 * |->|nl|<->|nl|<->|nl|<->|nl|<->|nl|<->|nl|<-|
 * |-------------------------------------------|
 *
 * The nodes on the prio_list list are sorted by priority to simplify
 * the insertion of new nodes. There are no nodes with duplicate
 * priorities on the list.
 *
 * The nodes on the node_list are ordered by priority and can contain
 * entries which have the same priority. Those entries are ordered
 * FIFO.
 *
 * Addition means: look for the prio_list node in the prio_list
 * for the priority of the node and insert it before the node_list
 * entry of the next prio_list node. If it is the first node of
 * that priority, add it to the prio_list in the right position and
 * insert it into the serialized node_list list.
 *
 * Removal means remove it from the node_list and remove it from
 * the prio_list if the node_list list_head is non empty. In case
 * of removal from the prio_list it must be checked whether other
 * entries of the same priority are on the list or not. If there
 * is another entry of the same priority then this entry has to
 * replace the removed entry on the prio_list. If the entry which
 * is removed is the only entry of this priority then a simple
 * remove from both lists is sufficient.
 *
 * INT_MIN is the highest priority, 0 is the medium highest, INT_MAX
 * is lowest priority.
 *
 * No locking is done, up to the caller.
*/
#ifndef _LINUX_PLIST_H_
#define _LINUX_PLIST_H_

#include <linux/container_of.h>
#include <linux/list.h>
#include <linux/plist_types.h>

#include <asm/bug.h>

/**
 * PLIST_HEAD_INIT - static struct plist_head initializer
 * @head: struct plist_head variable name
 *
 * Expands to a brace initializer that sets up @head's node_list with
 * LIST_HEAD_INIT().
 */
#define PLIST_HEAD_INIT(head) \
{ \
	.node_list = LIST_HEAD_INIT((head).node_list) \
}

/**
 * PLIST_HEAD - declare and init plist_head
 * @head: name for struct plist_head variable
 */
#define PLIST_HEAD(head) \
	struct plist_head head = PLIST_HEAD_INIT(head)

/**
 * PLIST_NODE_INIT - static struct plist_node initializer
 * @node: struct plist_node variable name
 * @__prio: initial node priority
 *
 * Expands to a brace initializer that stores @__prio and sets up both the
 * prio_list and the node_list of @node with LIST_HEAD_INIT().
 */
#define PLIST_NODE_INIT(node, __prio) \
{ \
	.prio = (__prio), \
	.prio_list = LIST_HEAD_INIT((node).prio_list), \
	.node_list = LIST_HEAD_INIT((node).node_list), \
}

/**
 * plist_head_init - dynamic struct plist_head initializer
 * @head: &struct plist_head pointer
 *
 * Runtime counterpart of PLIST_HEAD_INIT(): initialises @head's node_list
 * with INIT_LIST_HEAD().
 */
static inline void
plist_head_init(struct plist_head *head)
{
	INIT_LIST_HEAD(&head->node_list);
}

/**
 * plist_node_init - Dynamic struct plist_node initializer
 * @node: &struct plist_node pointer
 * @prio: initial node priority
 *
 * Runtime counterpart of PLIST_NODE_INIT(): stores @prio and initialises
 * both list heads of @node with INIT_LIST_HEAD().
 */
static inline void plist_node_init(struct plist_node *node, int prio)
{
	node->prio = prio;
	INIT_LIST_HEAD(&node->prio_list);
	INIT_LIST_HEAD(&node->node_list);
}

/* Out-of-line list operations; only declared in this header. */
extern void plist_add(struct plist_node *node, struct plist_head *head);
extern void plist_del(struct plist_node *node, struct plist_head *head);

extern void plist_requeue(struct plist_node *node, struct plist_head *head);

/**
 * plist_for_each - iterate over the plist
 * @pos: the type * to use as a loop counter
 * @head: the head for your list
 *
 * Walks @head's node_list, linking entries through their node_list member.
 */
#define plist_for_each(pos, head) \
	list_for_each_entry(pos, &(head)->node_list, node_list)

/**
 * plist_for_each_continue - continue iteration over the plist
 * @pos: the type * to use as a loop cursor
 * @head: the head for your list
 *
 * Continue to iterate over plist, continuing after the current position.
*/
#define plist_for_each_continue(pos, head) \
	list_for_each_entry_continue(pos, &(head)->node_list, node_list)

/**
 * plist_for_each_safe - iterate safely over a plist of given type
 * @pos: the type * to use as a loop counter
 * @n: another type * to use as temporary storage
 * @head: the head for your list
 *
 * Iterate over a plist of given type, safe against removal of list entry.
 */
#define plist_for_each_safe(pos, n, head) \
	list_for_each_entry_safe(pos, n, &(head)->node_list, node_list)

/**
 * plist_for_each_entry - iterate over list of given type
 * @pos: the type * to use as a loop counter
 * @head: the head for your list
 * @mem: the name of the plist_node member within the struct (the macro
 *       appends .node_list to it itself)
 */
#define plist_for_each_entry(pos, head, mem) \
	list_for_each_entry(pos, &(head)->node_list, mem.node_list)

/**
 * plist_for_each_entry_continue - continue iteration over list of given type
 * @pos: the type * to use as a loop cursor
 * @head: the head for your list
 * @m: the name of the plist_node member within the struct (the macro
 *     appends .node_list to it itself)
 *
 * Continue to iterate over list of given type, continuing after
 * the current position.
 */
#define plist_for_each_entry_continue(pos, head, m) \
	list_for_each_entry_continue(pos, &(head)->node_list, m.node_list)

/**
 * plist_for_each_entry_safe - iterate safely over list of given type
 * @pos: the type * to use as a loop counter
 * @n: another type * to use as temporary storage
 * @head: the head for your list
 * @m: the name of the plist_node member within the struct
 *
 * Iterate over list of given type, safe against removal of list entry.
*/
#define plist_for_each_entry_safe(pos, n, head, m) \
	list_for_each_entry_safe(pos, n, &(head)->node_list, m.node_list)

/**
 * plist_head_empty - return !0 if a plist_head is empty
 * @head: &struct plist_head pointer
 *
 * The result is that of list_empty() on @head's node_list.
 */
static inline int plist_head_empty(const struct plist_head *head)
{
	return list_empty(&head->node_list);
}

/**
 * plist_node_empty - return !0 if plist_node is not on a list
 * @node: &struct plist_node pointer
 *
 * The result is that of list_empty() on @node's node_list.
 */
static inline int plist_node_empty(const struct plist_node *node)
{
	return list_empty(&node->node_list);
}

/* All functions below assume the plist_head is not empty. */

/**
 * plist_first_entry - get the struct for the first entry
 * @head: the &struct plist_head pointer
 * @type: the type of the struct this is embedded in
 * @member: the name of the plist_node member within the struct
 *
 * With CONFIG_DEBUG_PLIST the macro also does WARN_ON() for an empty @head
 * before looking up the entry; the lookup itself is the same either way.
 */
#ifdef CONFIG_DEBUG_PLIST
# define plist_first_entry(head, type, member) \
({ \
	WARN_ON(plist_head_empty(head)); \
	container_of(plist_first(head), type, member); \
})
#else
# define plist_first_entry(head, type, member) \
	container_of(plist_first(head), type, member)
#endif

/**
 * plist_last_entry - get the struct for the last entry
 * @head: the &struct plist_head pointer
 * @type: the type of the struct this is embedded in
 * @member: the name of the plist_node member within the struct
 *
 * With CONFIG_DEBUG_PLIST the macro also does WARN_ON() for an empty @head
 * before looking up the entry; the lookup itself is the same either way.
 */
#ifdef CONFIG_DEBUG_PLIST
# define plist_last_entry(head, type, member) \
({ \
	WARN_ON(plist_head_empty(head)); \
	container_of(plist_last(head), type, member); \
})
#else
# define plist_last_entry(head, type, member) \
	container_of(plist_last(head), type, member)
#endif

/**
 * plist_next - get the next entry in list
 * @pos: the type * to cursor
 *
 * Steps along the node_list member, so @pos must point to a type that has one.
 */
#define plist_next(pos) \
	list_next_entry(pos, node_list)

/**
 * plist_prev - get the prev entry in list
 * @pos: the type * to cursor
 *
 * Steps along the node_list member, so @pos must point to a type that has one.
 */
#define plist_prev(pos) \
	list_prev_entry(pos, node_list)

/**
 * plist_first - return the first node (and thus, highest priority)
 * @head: the &struct plist_head pointer
 *
 * Assumes the plist is _not_ empty.
*/ static inline struct plist_node *plist_first(const struct plist_head *head) { return list_entry(head->node_list.next, struct plist_node, node_list); } /** * plist_last - return the last node (and thus, lowest priority) * @head: the &struct plist_head pointer * * Assumes the plist is _not_ empty. */ static inline struct plist_node *plist_last(const struct plist_head *head) { return list_entry(head->node_list.prev, struct plist_node, node_list); } #endif |
1 1 10 12 12 12 12 12 45 45 2 43 12 9 24 3 14 7 12 12 7 5 12 12 1 11 12 2 10 12 12 5 7 12 10 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 
504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 | // SPDX-License-Identifier: GPL-2.0-or-later /* RxRPC recvmsg() implementation * * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/net.h> #include <linux/skbuff.h> #include <linux/export.h> #include <linux/sched/signal.h> #include <net/sock.h> #include <net/af_rxrpc.h> #include "ar-internal.h" /* * Post a call for attention by the socket or kernel service. Further * notifications are suppressed by putting recvmsg_link on a dummy queue. */ void rxrpc_notify_socket(struct rxrpc_call *call) { struct rxrpc_sock *rx; struct sock *sk; _enter("%d", call->debug_id); if (!list_empty(&call->recvmsg_link)) return; rcu_read_lock(); rx = rcu_dereference(call->socket); sk = &rx->sk; if (rx && sk->sk_state < RXRPC_CLOSE) { if (call->notify_rx) { spin_lock_irq(&call->notify_lock); call->notify_rx(sk, call, call->user_call_ID); spin_unlock_irq(&call->notify_lock); } else { spin_lock_irq(&rx->recvmsg_lock); if (list_empty(&call->recvmsg_link)) { rxrpc_get_call(call, rxrpc_call_get_notify_socket); list_add_tail(&call->recvmsg_link, &rx->recvmsg_q); } spin_unlock_irq(&rx->recvmsg_lock); if (!sock_flag(sk, SOCK_DEAD)) { _debug("call %ps", sk->sk_data_ready); sk->sk_data_ready(sk); } } } rcu_read_unlock(); _leave(""); } /* * Pass a call terminating message to userspace. 
*/
static int rxrpc_recvmsg_term(struct rxrpc_call *call, struct msghdr *msg)
{
	/* 32-bit payload of the control message, where one is attached. */
	u32 tmp = 0;
	int ret;

	switch (call->completion) {
	case RXRPC_CALL_SUCCEEDED:
		/* Only a service call gets a (zero-length) RXRPC_ACK message. */
		ret = 0;
		if (rxrpc_is_service_call(call))
			ret = put_cmsg(msg, SOL_RXRPC, RXRPC_ACK, 0, &tmp);
		break;
	case RXRPC_CALL_REMOTELY_ABORTED:
		tmp = call->abort_code;
		ret = put_cmsg(msg, SOL_RXRPC, RXRPC_ABORT, 4, &tmp);
		break;
	case RXRPC_CALL_LOCALLY_ABORTED:
		tmp = call->abort_code;
		ret = put_cmsg(msg, SOL_RXRPC, RXRPC_ABORT, 4, &tmp);
		break;
	case RXRPC_CALL_NETWORK_ERROR:
		/* call->error is negated before being handed over. */
		tmp = -call->error;
		ret = put_cmsg(msg, SOL_RXRPC, RXRPC_NET_ERROR, 4, &tmp);
		break;
	case RXRPC_CALL_LOCAL_ERROR:
		tmp = -call->error;
		ret = put_cmsg(msg, SOL_RXRPC, RXRPC_LOCAL_ERROR, 4, &tmp);
		break;
	default:
		pr_err("Invalid terminal call state %u\n", call->completion);
		BUG();
		break;
	}

	trace_rxrpc_recvdata(call, rxrpc_recvmsg_terminal,
			     call->ackr_window - 1,
			     call->rx_pkt_offset, call->rx_pkt_len, ret);
	return ret;
}

/*
 * Discard a packet we've used up and advance the Rx window by one.
 *
 * The packet is taken from the front of call->recvmsg_queue.  If it carried
 * RXRPC_LAST_PACKET, RXRPC_CALL_RECVMSG_READ_ALL is set on the call.
 */
static void rxrpc_rotate_rx_window(struct rxrpc_call *call)
{
	struct rxrpc_skb_priv *sp;
	struct sk_buff *skb;
	rxrpc_serial_t serial;
	rxrpc_seq_t old_consumed = call->rx_consumed, tseq;
	bool last;
	int acked;

	_enter("%d", call->debug_id);

	skb = skb_dequeue(&call->recvmsg_queue);
	rxrpc_see_skb(skb, rxrpc_skb_see_rotate);

	/* Copy what is needed out of the header before the skb is freed. */
	sp = rxrpc_skb(skb);
	tseq   = sp->hdr.seq;
	serial = sp->hdr.serial;
	last   = sp->hdr.flags & RXRPC_LAST_PACKET;

	/* Barrier against rxrpc_input_data(). */
	if (after(tseq, call->rx_consumed))
		smp_store_release(&call->rx_consumed, tseq);

	rxrpc_free_skb(skb, rxrpc_skb_put_rotate);

	trace_rxrpc_receive(call, last ? rxrpc_receive_rotate_last : rxrpc_receive_rotate,
			    serial, call->rx_consumed);

	if (last)
		set_bit(RXRPC_CALL_RECVMSG_READ_ALL, &call->flags);

	/* Check to see if there's an ACK that needs sending.  The call is
	 * poked once more than 8 consumed packets have been accumulated in
	 * ackr_nr_consumed, and only if RXRPC_CALL_RX_IS_IDLE was not already
	 * set.
	 */
	acked = atomic_add_return(call->rx_consumed - old_consumed,
				  &call->ackr_nr_consumed);
	if (acked > 8 &&
	    !test_and_set_bit(RXRPC_CALL_RX_IS_IDLE, &call->flags))
		rxrpc_poke_call(call, rxrpc_call_poke_idle);
}

/*
 * Decrypt and verify a DATA packet.
 *
 * A packet already flagged RXRPC_RX_VERIFIED is accepted as is (returns 0);
 * anything else is handed to the verify_packet op of the call's security
 * class and that op's result is returned.
 */
static int rxrpc_verify_data(struct rxrpc_call *call, struct sk_buff *skb)
{
	struct rxrpc_skb_priv *sp = rxrpc_skb(skb);

	if (sp->flags & RXRPC_RX_VERIFIED)
		return 0;
	return call->security->verify_packet(call, skb);
}

/*
 * Deliver messages to a call.  This keeps processing packets until the buffer
 * is filled and we find either more DATA (returns 0) or the end of the DATA
 * (returns 1).  If more packets are required, it returns -EAGAIN and if the
 * call has failed it returns -EIO.
 *
 * @msg may be NULL, in which case no receive timestamp is recorded.  @len is
 * the total size of the buffer behind @iter and *@_offset the amount already
 * filled; *@_offset is advanced by whatever gets copied here.  With MSG_PEEK
 * in @flags nothing is rotated out of the queue and the call's saved packet
 * offset/length are left untouched.  A negative result from packet
 * verification or from the copy is returned as is.
 */
static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call,
			      struct msghdr *msg, struct iov_iter *iter,
			      size_t len, int flags, size_t *_offset)
{
	struct rxrpc_skb_priv *sp;
	struct sk_buff *skb;
	rxrpc_seq_t seq = 0;
	size_t remain;
	unsigned int rx_pkt_offset, rx_pkt_len;
	int copy, ret = -EAGAIN, ret2;

	/* Resume from where a previous call left off within a packet. */
	rx_pkt_offset = call->rx_pkt_offset;
	rx_pkt_len = call->rx_pkt_len;

	if (rxrpc_call_has_failed(call)) {
		seq = call->ackr_window - 1;
		ret = -EIO;
		goto done;
	}

	if (test_bit(RXRPC_CALL_RECVMSG_READ_ALL, &call->flags)) {
		seq = call->ackr_window - 1;
		ret = 1;
		goto done;
	}

	/* No one else can be removing stuff from the queue, so we shouldn't
	 * need the Rx lock to walk it.
	 */
	skb = skb_peek(&call->recvmsg_queue);
	while (skb) {
		rxrpc_see_skb(skb, rxrpc_skb_see_recvmsg);
		sp = rxrpc_skb(skb);
		seq = sp->hdr.seq;

		if (!(flags & MSG_PEEK))
			trace_rxrpc_receive(call, rxrpc_receive_front,
					    sp->hdr.serial, seq);

		if (msg)
			sock_recv_timestamp(msg, sock->sk, skb);

		/* A zero offset means this packet has not been started yet:
		 * verify it and pick up its data offset and length.
		 */
		if (rx_pkt_offset == 0) {
			ret2 = rxrpc_verify_data(call, skb);
			trace_rxrpc_recvdata(call, rxrpc_recvmsg_next, seq,
					     sp->offset, sp->len, ret2);
			if (ret2 < 0) {
				kdebug("verify = %d", ret2);
				ret = ret2;
				goto out;
			}
			rx_pkt_offset = sp->offset;
			rx_pkt_len = sp->len;
		} else {
			trace_rxrpc_recvdata(call, rxrpc_recvmsg_cont, seq,
					     rx_pkt_offset, rx_pkt_len, 0);
		}

		/* We have to handle short, empty and used-up DATA packets. */
		remain = len - *_offset;
		copy = rx_pkt_len;
		if (copy > remain)
			copy = remain;
		if (copy > 0) {
			ret2 = skb_copy_datagram_iter(skb, rx_pkt_offset, iter,
						      copy);
			if (ret2 < 0) {
				ret = ret2;
				goto out;
			}

			/* handle piecemeal consumption of data packets */
			rx_pkt_offset += copy;
			rx_pkt_len -= copy;
			*_offset += copy;
		}

		/* Data left in this packet means the buffer is full. */
		if (rx_pkt_len > 0) {
			trace_rxrpc_recvdata(call, rxrpc_recvmsg_full, seq,
					     rx_pkt_offset, rx_pkt_len, 0);
			ASSERTCMP(*_offset, ==, len);
			ret = 0;
			break;
		}

		/* The whole packet has been transferred. */
		if (sp->hdr.flags & RXRPC_LAST_PACKET)
			ret = 1;
		rx_pkt_offset = 0;
		rx_pkt_len = 0;

		/* Pick the successor before the current skb may be rotated
		 * out of the queue and freed.
		 */
		skb = skb_peek_next(skb, &call->recvmsg_queue);

		if (!(flags & MSG_PEEK))
			rxrpc_rotate_rx_window(call);
	}

out:
	if (!(flags & MSG_PEEK)) {
		call->rx_pkt_offset = rx_pkt_offset;
		call->rx_pkt_len = rx_pkt_len;
	}
done:
	trace_rxrpc_recvdata(call, rxrpc_recvmsg_data_return, seq,
			     rx_pkt_offset, rx_pkt_len, ret);
	if (ret == -EAGAIN)
		set_bit(RXRPC_CALL_RX_IS_IDLE, &call->flags);
	return ret;
}

/*
 * Receive a message from an RxRPC socket
 * - we need to be careful about two or more threads calling recvmsg
 *   simultaneously
 *
 * On success the number of bytes copied is returned and msg->msg_flags has
 * MSG_EOR set once the call's terminal message has been delivered, or
 * MSG_MORE set when the data stopped because the buffer was full.  MSG_OOB and
 * MSG_TRUNC are rejected with -EOPNOTSUPP.
 */
int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
		  int flags)
{
	struct rxrpc_call *call;
	struct rxrpc_sock *rx = rxrpc_sk(sock->sk);
	struct list_head *l;
	unsigned int call_debug_id = 0;
	size_t copied = 0;
	long timeo;
	int ret;

	DEFINE_WAIT(wait);

	trace_rxrpc_recvmsg(0, rxrpc_recvmsg_enter, 0);

	if (flags & (MSG_OOB | MSG_TRUNC))
		return -EOPNOTSUPP;

	timeo = sock_rcvtimeo(&rx->sk, flags & MSG_DONTWAIT);

try_again:
	lock_sock(&rx->sk);

	/* Return immediately if a client socket has no outstanding calls */
	if (RB_EMPTY_ROOT(&rx->calls) &&
	    list_empty(&rx->recvmsg_q) &&
	    rx->sk.sk_state != RXRPC_SERVER_LISTENING) {
		release_sock(&rx->sk);
		return -EAGAIN;
	}

	if (list_empty(&rx->recvmsg_q)) {
		ret = -EWOULDBLOCK;
		if (timeo == 0) {
			call = NULL;
			goto error_no_call;
		}

		/* The socket lock is not held whilst sleeping. */
		release_sock(&rx->sk);

		/* Wait for something to happen */
		prepare_to_wait_exclusive(sk_sleep(&rx->sk), &wait,
					  TASK_INTERRUPTIBLE);
		ret = sock_error(&rx->sk);
		if (ret)
			goto wait_error;

		if (list_empty(&rx->recvmsg_q)) {
			if (signal_pending(current))
				goto wait_interrupted;
			trace_rxrpc_recvmsg(0, rxrpc_recvmsg_wait, 0);
			timeo = schedule_timeout(timeo);
		}
		finish_wait(sk_sleep(&rx->sk), &wait);
		goto try_again;
	}

	/* Find the next call and dequeue it if we're not just peeking.  If we
	 * do dequeue it, that comes with a ref that we will need to release.
	 * We also want to weed out calls that got requeued whilst we were
	 * shovelling data out.
	 */
	spin_lock_irq(&rx->recvmsg_lock);
	l = rx->recvmsg_q.next;
	call = list_entry(l, struct rxrpc_call, recvmsg_link);

	/* An incomplete call with nothing queued has nothing to deliver:
	 * unqueue it, drop the queue's ref and look again.
	 */
	if (!rxrpc_call_is_complete(call) &&
	    skb_queue_empty(&call->recvmsg_queue)) {
		list_del_init(&call->recvmsg_link);
		spin_unlock_irq(&rx->recvmsg_lock);
		release_sock(&rx->sk);
		trace_rxrpc_recvmsg(call->debug_id, rxrpc_recvmsg_unqueue, 0);
		rxrpc_put_call(call, rxrpc_call_put_recvmsg);
		goto try_again;
	}

	/* When peeking the call stays queued, so take a ref of our own to
	 * balance the rxrpc_put_call() done on the way out.
	 */
	if (!(flags & MSG_PEEK))
		list_del_init(&call->recvmsg_link);
	else
		rxrpc_get_call(call, rxrpc_call_get_recvmsg);
	spin_unlock_irq(&rx->recvmsg_lock);

	call_debug_id = call->debug_id;
	trace_rxrpc_recvmsg(call_debug_id, rxrpc_recvmsg_dequeue, 0);

	/* We're going to drop the socket lock, so we need to lock the call
	 * against interference by sendmsg.
	 */
	if (!mutex_trylock(&call->user_mutex)) {
		ret = -EWOULDBLOCK;
		if (flags & MSG_DONTWAIT)
			goto error_requeue_call;
		ret = -ERESTARTSYS;
		if (mutex_lock_interruptible(&call->user_mutex) < 0)
			goto error_requeue_call;
	}

	release_sock(&rx->sk);

	if (test_bit(RXRPC_CALL_RELEASED, &call->flags))
		BUG();

	/* The user call ID goes out as an unsigned int for MSG_CMSG_COMPAT
	 * callers and as an unsigned long otherwise.
	 */
	if (test_bit(RXRPC_CALL_HAS_USERID, &call->flags)) {
		if (flags & MSG_CMSG_COMPAT) {
			unsigned int id32 = call->user_call_ID;

			ret = put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID,
				       sizeof(unsigned int), &id32);
		} else {
			unsigned long idl = call->user_call_ID;

			ret = put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID,
				       sizeof(unsigned long), &idl);
		}
		if (ret < 0)
			goto error_unlock_call;
	}

	if (msg->msg_name && call->peer) {
		/* NOTE: this local shadows the len parameter inside this
		 * block only; the receive below uses the parameter.
		 */
		size_t len = sizeof(call->dest_srx);

		memcpy(msg->msg_name, &call->dest_srx, len);
		msg->msg_namelen = len;
	}

	ret = rxrpc_recvmsg_data(sock, call, msg, &msg->msg_iter, len,
				 flags, &copied);
	/* Needing more packets is not an error at this level. */
	if (ret == -EAGAIN)
		ret = 0;
	if (ret == -EIO)
		goto call_failed;
	if (ret < 0)
		goto error_unlock_call;

	if (rxrpc_call_is_complete(call) &&
	    skb_queue_empty(&call->recvmsg_queue))
		goto call_complete;
	if (rxrpc_call_has_failed(call))
		goto call_failed;

	/* More is already queued, so get the call posted for attention again. */
	if (!skb_queue_empty(&call->recvmsg_queue))
		rxrpc_notify_socket(call);
	goto not_yet_complete;

call_failed:
	rxrpc_purge_queue(&call->recvmsg_queue);
call_complete:
	ret = rxrpc_recvmsg_term(call, msg);
	if (ret < 0)
		goto error_unlock_call;
	if (!(flags & MSG_PEEK))
		rxrpc_release_call(rx, call);
	msg->msg_flags |= MSG_EOR;
	ret = 1;

not_yet_complete:
	/* ret is 0 here only if the data stopped on a full buffer. */
	if (ret == 0)
		msg->msg_flags |= MSG_MORE;
	else
		msg->msg_flags &= ~MSG_MORE;
	ret = copied;

error_unlock_call:
	mutex_unlock(&call->user_mutex);
	rxrpc_put_call(call, rxrpc_call_put_recvmsg);
	trace_rxrpc_recvmsg(call_debug_id, rxrpc_recvmsg_return, ret);
	return ret;

error_requeue_call:
	/* The call mutex was not obtained.  A dequeued call goes back on
	 * rx->recvmsg_q together with its ref; a peeked call was never
	 * removed, so only the extra ref is dropped.
	 */
	if (!(flags & MSG_PEEK)) {
		spin_lock_irq(&rx->recvmsg_lock);
		list_add(&call->recvmsg_link, &rx->recvmsg_q);
		spin_unlock_irq(&rx->recvmsg_lock);
		trace_rxrpc_recvmsg(call_debug_id, rxrpc_recvmsg_requeue, 0);
	} else {
		rxrpc_put_call(call, rxrpc_call_put_recvmsg);
	}
error_no_call:
	release_sock(&rx->sk);
error_trace:
	trace_rxrpc_recvmsg(call_debug_id, rxrpc_recvmsg_return, ret);
	return ret;

wait_interrupted:
	ret = sock_intr_errno(timeo);
wait_error:
	/* The socket lock was already released before waiting. */
	finish_wait(sk_sleep(&rx->sk), &wait);
	call = NULL;
	goto error_trace;
}

/**
 * rxrpc_kernel_recv_data - Allow a kernel service to receive data/info
 * @sock: The socket that the call exists on
 * @call: The call to receive data on
 * @iter: The buffer to receive into
 * @_len: The amount of data we want to receive (decreased on return)
 * @want_more: True if more data is expected to be read
 * @_abort: Where the abort code is stored if -ECONNABORTED is returned
 * @_service: Where to store the actual service ID (may be upgraded)
 *
 * Allow a kernel service to receive data and pick up information about the
 * state of a call.  Returns 0 if got what was asked for and there's more
 * available, 1 if we got what was asked for and we're at the end of the data
 * and -EAGAIN if we need more data.
 *
 * -EBADMSG is returned if the data ended before the buffer was filled and
 * -EMSGSIZE if the buffer was filled with more data still to come although
 * @want_more is false.  If the call has failed, *@_abort is set to the call's
 * abort code and the call's error is returned.
 *
 * Note that we may return -EAGAIN to drain empty packets at the end of the
 * data, even if we've already copied over the requested data.
 *
 * *_abort should also be initialised to 0.
 */
int rxrpc_kernel_recv_data(struct socket *sock, struct rxrpc_call *call,
			   struct iov_iter *iter, size_t *_len,
			   bool want_more, u32 *_abort, u16 *_service)
{
	size_t offset = 0;
	int ret;

	_enter("{%d},%zu,%d", call->debug_id, *_len, want_more);

	mutex_lock(&call->user_mutex);

	ret = rxrpc_recvmsg_data(sock, call, NULL, iter, *_len, 0, &offset);
	/* Report back how much is still wanted. */
	*_len -= offset;
	if (ret == -EIO)
		goto call_failed;
	if (ret < 0)
		goto out;

	/* We can only reach here with a partially full buffer if we have
	 * reached the end of the data.  We must otherwise have a full buffer
	 * or have been given -EAGAIN.
	 */
	if (ret == 1) {
		if (iov_iter_count(iter) > 0)
			goto short_data;
		if (!want_more)
			goto read_phase_complete;
		ret = 0;
		goto out;
	}

	if (!want_more)
		goto excess_data;
	goto out;

read_phase_complete:
	ret = 1;
out:
	if (_service)
		*_service = call->dest_srx.srx_service;
	mutex_unlock(&call->user_mutex);
	_leave(" = %d [%zu,%d]", ret, iov_iter_count(iter), *_abort);
	return ret;

short_data:
	trace_rxrpc_abort(call->debug_id, rxrpc_recvmsg_short_data,
			  call->cid, call->call_id, call->rx_consumed,
			  0, -EBADMSG);
	ret = -EBADMSG;
	goto out;
excess_data:
	trace_rxrpc_abort(call->debug_id, rxrpc_recvmsg_excess_data,
			  call->cid, call->call_id, call->rx_consumed,
			  0, -EMSGSIZE);
	ret = -EMSGSIZE;
	goto out;
call_failed:
	*_abort = call->abort_code;
	ret = call->error;
	/* A call that completed successfully counts as end of data, unless
	 * the caller's buffer still has room left to fill.
	 */
	if (call->completion == RXRPC_CALL_SUCCEEDED) {
		ret = 1;
		if (iov_iter_count(iter) > 0)
			ret = -ECONNRESET;
	}
	goto out;
}
EXPORT_SYMBOL(rxrpc_kernel_recv_data); |
4070 4066 1333 469 469 469 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 
520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 
1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 
1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 
1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 | // SPDX-License-Identifier: GPL-2.0-or-later /* * vrf.c: device driver to encapsulate a VRF space * * Copyright (c) 2015 Cumulus Networks. All rights reserved. 
* Copyright (c) 2015 Shrijeet Mukherjee <shm@cumulusnetworks.com> * Copyright (c) 2015 David Ahern <dsa@cumulusnetworks.com> * * Based on dummy, team and ipvlan drivers */ #include <linux/ethtool.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/ip.h> #include <linux/init.h> #include <linux/moduleparam.h> #include <linux/netfilter.h> #include <linux/rtnetlink.h> #include <net/rtnetlink.h> #include <linux/u64_stats_sync.h> #include <linux/hashtable.h> #include <linux/spinlock_types.h> #include <linux/inetdevice.h> #include <net/arp.h> #include <net/ip.h> #include <net/ip_fib.h> #include <net/ip6_fib.h> #include <net/ip6_route.h> #include <net/route.h> #include <net/addrconf.h> #include <net/l3mdev.h> #include <net/fib_rules.h> #include <net/netdev_lock.h> #include <net/sch_generic.h> #include <net/netns/generic.h> #include <net/netfilter/nf_conntrack.h> #include <net/inet_dscp.h> #define DRV_NAME "vrf" #define DRV_VERSION "1.1" #define FIB_RULE_PREF 1000 /* default preference for FIB rules */ #define HT_MAP_BITS 4 #define HASH_INITVAL ((u32)0xcafef00d) struct vrf_map { DECLARE_HASHTABLE(ht, HT_MAP_BITS); spinlock_t vmap_lock; /* shared_tables: * count how many distinct tables do not comply with the strict mode * requirement. * shared_tables value must be 0 in order to enable the strict mode. * * example of the evolution of shared_tables: * | time * add vrf0 --> table 100 shared_tables = 0 | t0 * add vrf1 --> table 101 shared_tables = 0 | t1 * add vrf2 --> table 100 shared_tables = 1 | t2 * add vrf3 --> table 100 shared_tables = 1 | t3 * add vrf4 --> table 101 shared_tables = 2 v t4 * * shared_tables is a "step function" (or "staircase function") * and it is increased by one when the second vrf is associated to a * table. * * at t2, vrf0 and vrf2 are bound to table 100: shared_tables = 1. 
* * at t3, another dev (vrf3) is bound to the same table 100 but the * value of shared_tables is still 1. * This means that no matter how many new vrfs will register on the * table 100, the shared_tables will not increase (considering only * table 100). * * at t4, vrf4 is bound to table 101, and shared_tables = 2. * * Looking at the value of shared_tables we can immediately know if * the strict_mode can or cannot be enforced. Indeed, strict_mode * can be enforced iff shared_tables = 0. * * Conversely, shared_tables is decreased when a vrf is de-associated * from a table with exactly two associated vrfs. */ u32 shared_tables; bool strict_mode; }; struct vrf_map_elem { struct hlist_node hnode; struct list_head vrf_list; /* VRFs registered to this table */ u32 table_id; int users; int ifindex; }; static unsigned int vrf_net_id; /* per netns vrf data */ struct netns_vrf { /* protected by rtnl lock */ bool add_fib_rules; struct vrf_map vmap; struct ctl_table_header *ctl_hdr; }; struct net_vrf { struct rtable __rcu *rth; struct rt6_info __rcu *rt6; #if IS_ENABLED(CONFIG_IPV6) struct fib6_table *fib6_table; #endif u32 tb_id; struct list_head me_list; /* entry in vrf_map_elem */ int ifindex; }; static void vrf_tx_error(struct net_device *vrf_dev, struct sk_buff *skb) { vrf_dev->stats.tx_errors++; kfree_skb(skb); } static struct vrf_map *netns_vrf_map(struct net *net) { struct netns_vrf *nn_vrf = net_generic(net, vrf_net_id); return &nn_vrf->vmap; } static struct vrf_map *netns_vrf_map_by_dev(struct net_device *dev) { return netns_vrf_map(dev_net(dev)); } static int vrf_map_elem_get_vrf_ifindex(struct vrf_map_elem *me) { struct list_head *me_head = &me->vrf_list; struct net_vrf *vrf; if (list_empty(me_head)) return -ENODEV; vrf = list_first_entry(me_head, struct net_vrf, me_list); return vrf->ifindex; } static struct vrf_map_elem *vrf_map_elem_alloc(gfp_t flags) { struct vrf_map_elem *me; me = kmalloc(sizeof(*me), flags); if (!me) return NULL; return me; } static void 
vrf_map_elem_free(struct vrf_map_elem *me) { kfree(me); } static void vrf_map_elem_init(struct vrf_map_elem *me, int table_id, int ifindex, int users) { me->table_id = table_id; me->ifindex = ifindex; me->users = users; INIT_LIST_HEAD(&me->vrf_list); } static struct vrf_map_elem *vrf_map_lookup_elem(struct vrf_map *vmap, u32 table_id) { struct vrf_map_elem *me; u32 key; key = jhash_1word(table_id, HASH_INITVAL); hash_for_each_possible(vmap->ht, me, hnode, key) { if (me->table_id == table_id) return me; } return NULL; } static void vrf_map_add_elem(struct vrf_map *vmap, struct vrf_map_elem *me) { u32 table_id = me->table_id; u32 key; key = jhash_1word(table_id, HASH_INITVAL); hash_add(vmap->ht, &me->hnode, key); } static void vrf_map_del_elem(struct vrf_map_elem *me) { hash_del(&me->hnode); } static void vrf_map_lock(struct vrf_map *vmap) __acquires(&vmap->vmap_lock) { spin_lock(&vmap->vmap_lock); } static void vrf_map_unlock(struct vrf_map *vmap) __releases(&vmap->vmap_lock) { spin_unlock(&vmap->vmap_lock); } /* called with rtnl lock held */ static int vrf_map_register_dev(struct net_device *dev, struct netlink_ext_ack *extack) { struct vrf_map *vmap = netns_vrf_map_by_dev(dev); struct net_vrf *vrf = netdev_priv(dev); struct vrf_map_elem *new_me, *me; u32 table_id = vrf->tb_id; bool free_new_me = false; int users; int res; /* we pre-allocate elements used in the spin-locked section (so that we * keep the spinlock as short as possible). */ new_me = vrf_map_elem_alloc(GFP_KERNEL); if (!new_me) return -ENOMEM; vrf_map_elem_init(new_me, table_id, dev->ifindex, 0); vrf_map_lock(vmap); me = vrf_map_lookup_elem(vmap, table_id); if (!me) { me = new_me; vrf_map_add_elem(vmap, me); goto link_vrf; } /* we already have an entry in the vrf_map, so it means there is (at * least) a vrf registered on the specific table. 
*/ free_new_me = true; if (vmap->strict_mode) { /* vrfs cannot share the same table */ NL_SET_ERR_MSG(extack, "Table is used by another VRF"); res = -EBUSY; goto unlock; } link_vrf: users = ++me->users; if (users == 2) ++vmap->shared_tables; list_add(&vrf->me_list, &me->vrf_list); res = 0; unlock: vrf_map_unlock(vmap); /* clean-up, if needed */ if (free_new_me) vrf_map_elem_free(new_me); return res; } /* called with rtnl lock held */ static void vrf_map_unregister_dev(struct net_device *dev) { struct vrf_map *vmap = netns_vrf_map_by_dev(dev); struct net_vrf *vrf = netdev_priv(dev); u32 table_id = vrf->tb_id; struct vrf_map_elem *me; int users; vrf_map_lock(vmap); me = vrf_map_lookup_elem(vmap, table_id); if (!me) goto unlock; list_del(&vrf->me_list); users = --me->users; if (users == 1) { --vmap->shared_tables; } else if (users == 0) { vrf_map_del_elem(me); /* no one will refer to this element anymore */ vrf_map_elem_free(me); } unlock: vrf_map_unlock(vmap); } /* return the vrf device index associated with the table_id */ static int vrf_ifindex_lookup_by_table_id(struct net *net, u32 table_id) { struct vrf_map *vmap = netns_vrf_map(net); struct vrf_map_elem *me; int ifindex; vrf_map_lock(vmap); if (!vmap->strict_mode) { ifindex = -EPERM; goto unlock; } me = vrf_map_lookup_elem(vmap, table_id); if (!me) { ifindex = -ENODEV; goto unlock; } ifindex = vrf_map_elem_get_vrf_ifindex(me); unlock: vrf_map_unlock(vmap); return ifindex; } /* by default VRF devices do not have a qdisc and are expected * to be created with only a single queue. */ static bool qdisc_tx_is_default(const struct net_device *dev) { struct netdev_queue *txq; struct Qdisc *qdisc; if (dev->num_tx_queues > 1) return false; txq = netdev_get_tx_queue(dev, 0); qdisc = rcu_access_pointer(txq->qdisc); return !qdisc->enqueue; } /* Local traffic destined to local address. Reinsert the packet to rx * path, similar to loopback handling. 
*/ static int vrf_local_xmit(struct sk_buff *skb, struct net_device *dev, struct dst_entry *dst) { unsigned int len = skb->len; skb_orphan(skb); skb_dst_set(skb, dst); /* set pkt_type to avoid skb hitting packet taps twice - * once on Tx and again in Rx processing */ skb->pkt_type = PACKET_LOOPBACK; skb->protocol = eth_type_trans(skb, dev); if (likely(__netif_rx(skb) == NET_RX_SUCCESS)) dev_dstats_rx_add(dev, len); else dev_dstats_rx_dropped(dev); return NETDEV_TX_OK; } static void vrf_nf_set_untracked(struct sk_buff *skb) { if (skb_get_nfct(skb) == 0) nf_ct_set(skb, NULL, IP_CT_UNTRACKED); } static void vrf_nf_reset_ct(struct sk_buff *skb) { if (skb_get_nfct(skb) == IP_CT_UNTRACKED) nf_reset_ct(skb); } #if IS_ENABLED(CONFIG_IPV6) static int vrf_ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) { int err; vrf_nf_reset_ct(skb); err = nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb, NULL, skb_dst(skb)->dev, dst_output); if (likely(err == 1)) err = dst_output(net, sk, skb); return err; } static netdev_tx_t vrf_process_v6_outbound(struct sk_buff *skb, struct net_device *dev) { const struct ipv6hdr *iph; struct net *net = dev_net(skb->dev); struct flowi6 fl6; int ret = NET_XMIT_DROP; struct dst_entry *dst; struct dst_entry *dst_null = &net->ipv6.ip6_null_entry->dst; if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct ipv6hdr))) goto err; iph = ipv6_hdr(skb); memset(&fl6, 0, sizeof(fl6)); /* needed to match OIF rule */ fl6.flowi6_l3mdev = dev->ifindex; fl6.flowi6_iif = LOOPBACK_IFINDEX; fl6.daddr = iph->daddr; fl6.saddr = iph->saddr; fl6.flowlabel = ip6_flowinfo(iph); fl6.flowi6_mark = skb->mark; fl6.flowi6_proto = iph->nexthdr; dst = ip6_dst_lookup_flow(net, NULL, &fl6, NULL); if (IS_ERR(dst) || dst == dst_null) goto err; skb_dst_drop(skb); /* if dst.dev is the VRF device again this is locally originated traffic * destined to a local address. Short circuit to Rx path. 
*/ if (dst->dev == dev) return vrf_local_xmit(skb, dev, dst); skb_dst_set(skb, dst); /* strip the ethernet header added for pass through VRF device */ __skb_pull(skb, skb_network_offset(skb)); memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); ret = vrf_ip6_local_out(net, skb->sk, skb); if (unlikely(net_xmit_eval(ret))) dev->stats.tx_errors++; else ret = NET_XMIT_SUCCESS; return ret; err: vrf_tx_error(dev, skb); return NET_XMIT_DROP; } #else static netdev_tx_t vrf_process_v6_outbound(struct sk_buff *skb, struct net_device *dev) { vrf_tx_error(dev, skb); return NET_XMIT_DROP; } #endif /* based on ip_local_out; can't use it b/c the dst is switched pointing to us */ static int vrf_ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) { int err; vrf_nf_reset_ct(skb); err = nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, net, sk, skb, NULL, skb_dst(skb)->dev, dst_output); if (likely(err == 1)) err = dst_output(net, sk, skb); return err; } static netdev_tx_t vrf_process_v4_outbound(struct sk_buff *skb, struct net_device *vrf_dev) { struct iphdr *ip4h; int ret = NET_XMIT_DROP; struct flowi4 fl4; struct net *net = dev_net(vrf_dev); struct rtable *rt; if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct iphdr))) goto err; ip4h = ip_hdr(skb); memset(&fl4, 0, sizeof(fl4)); /* needed to match OIF rule */ fl4.flowi4_l3mdev = vrf_dev->ifindex; fl4.flowi4_iif = LOOPBACK_IFINDEX; fl4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(ip4h)); fl4.flowi4_flags = FLOWI_FLAG_ANYSRC; fl4.flowi4_proto = ip4h->protocol; fl4.daddr = ip4h->daddr; fl4.saddr = ip4h->saddr; rt = ip_route_output_flow(net, &fl4, NULL); if (IS_ERR(rt)) goto err; skb_dst_drop(skb); /* if dst.dev is the VRF device again this is locally originated traffic * destined to a local address. Short circuit to Rx path. 
*/ if (rt->dst.dev == vrf_dev) return vrf_local_xmit(skb, vrf_dev, &rt->dst); skb_dst_set(skb, &rt->dst); /* strip the ethernet header added for pass through VRF device */ __skb_pull(skb, skb_network_offset(skb)); if (!ip4h->saddr) { ip4h->saddr = inet_select_addr(skb_dst(skb)->dev, 0, RT_SCOPE_LINK); } memset(IPCB(skb), 0, sizeof(*IPCB(skb))); ret = vrf_ip_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb); if (unlikely(net_xmit_eval(ret))) vrf_dev->stats.tx_errors++; else ret = NET_XMIT_SUCCESS; out: return ret; err: vrf_tx_error(vrf_dev, skb); goto out; } static netdev_tx_t is_ip_tx_frame(struct sk_buff *skb, struct net_device *dev) { switch (skb->protocol) { case htons(ETH_P_IP): return vrf_process_v4_outbound(skb, dev); case htons(ETH_P_IPV6): return vrf_process_v6_outbound(skb, dev); default: vrf_tx_error(dev, skb); return NET_XMIT_DROP; } } static netdev_tx_t vrf_xmit(struct sk_buff *skb, struct net_device *dev) { unsigned int len = skb->len; netdev_tx_t ret; ret = is_ip_tx_frame(skb, dev); if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) dev_dstats_tx_add(dev, len); else dev_dstats_tx_dropped(dev); return ret; } static void vrf_finish_direct(struct sk_buff *skb) { struct net_device *vrf_dev = skb->dev; if (!list_empty(&vrf_dev->ptype_all) && likely(skb_headroom(skb) >= ETH_HLEN)) { struct ethhdr *eth = skb_push(skb, ETH_HLEN); ether_addr_copy(eth->h_source, vrf_dev->dev_addr); eth_zero_addr(eth->h_dest); eth->h_proto = skb->protocol; rcu_read_lock_bh(); dev_queue_xmit_nit(skb, vrf_dev); rcu_read_unlock_bh(); skb_pull(skb, ETH_HLEN); } vrf_nf_reset_ct(skb); } #if IS_ENABLED(CONFIG_IPV6) /* modelled after ip6_finish_output2 */ static int vrf_finish_output6(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct net_device *dev = dst->dev; const struct in6_addr *nexthop; struct neighbour *neigh; int ret; vrf_nf_reset_ct(skb); skb->protocol = htons(ETH_P_IPV6); skb->dev = dev; rcu_read_lock(); nexthop = 
rt6_nexthop(dst_rt6_info(dst), &ipv6_hdr(skb)->daddr); neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop); if (unlikely(!neigh)) neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false); if (!IS_ERR(neigh)) { sock_confirm_neigh(skb, neigh); ret = neigh_output(neigh, skb, false); rcu_read_unlock(); return ret; } rcu_read_unlock(); IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); kfree_skb(skb); return -EINVAL; } /* modelled after ip6_output */ static int vrf_output6(struct net *net, struct sock *sk, struct sk_buff *skb) { return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, net, sk, skb, NULL, skb_dst(skb)->dev, vrf_finish_output6, !(IP6CB(skb)->flags & IP6SKB_REROUTED)); } /* set dst on skb to send packet to us via dev_xmit path. Allows * packet to go through device based features such as qdisc, netfilter * hooks and packet sockets with skb->dev set to vrf device. */ static struct sk_buff *vrf_ip6_out_redirect(struct net_device *vrf_dev, struct sk_buff *skb) { struct net_vrf *vrf = netdev_priv(vrf_dev); struct dst_entry *dst = NULL; struct rt6_info *rt6; rcu_read_lock(); rt6 = rcu_dereference(vrf->rt6); if (likely(rt6)) { dst = &rt6->dst; dst_hold(dst); } rcu_read_unlock(); if (unlikely(!dst)) { vrf_tx_error(vrf_dev, skb); return NULL; } skb_dst_drop(skb); skb_dst_set(skb, dst); return skb; } static int vrf_output6_direct_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { vrf_finish_direct(skb); return vrf_ip6_local_out(net, sk, skb); } static int vrf_output6_direct(struct net *net, struct sock *sk, struct sk_buff *skb) { int err = 1; skb->protocol = htons(ETH_P_IPV6); if (!(IPCB(skb)->flags & IPSKB_REROUTED)) err = nf_hook(NFPROTO_IPV6, NF_INET_POST_ROUTING, net, sk, skb, NULL, skb->dev, vrf_output6_direct_finish); if (likely(err == 1)) vrf_finish_direct(skb); return err; } static int vrf_ip6_out_direct_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { int err; err = vrf_output6_direct(net, sk, skb); if 
(likely(err == 1)) err = vrf_ip6_local_out(net, sk, skb); return err; } static struct sk_buff *vrf_ip6_out_direct(struct net_device *vrf_dev, struct sock *sk, struct sk_buff *skb) { struct net *net = dev_net(vrf_dev); int err; skb->dev = vrf_dev; err = nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb, NULL, vrf_dev, vrf_ip6_out_direct_finish); if (likely(err == 1)) err = vrf_output6_direct(net, sk, skb); if (likely(err == 1)) return skb; return NULL; } static struct sk_buff *vrf_ip6_out(struct net_device *vrf_dev, struct sock *sk, struct sk_buff *skb) { /* don't divert link scope packets */ if (rt6_need_strict(&ipv6_hdr(skb)->daddr)) return skb; vrf_nf_set_untracked(skb); if (qdisc_tx_is_default(vrf_dev) || IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) return vrf_ip6_out_direct(vrf_dev, sk, skb); return vrf_ip6_out_redirect(vrf_dev, skb); } /* holding rtnl */ static void vrf_rt6_release(struct net_device *dev, struct net_vrf *vrf) { struct rt6_info *rt6 = rtnl_dereference(vrf->rt6); struct net *net = dev_net(dev); struct dst_entry *dst; RCU_INIT_POINTER(vrf->rt6, NULL); synchronize_rcu(); /* move dev in dst's to loopback so this VRF device can be deleted * - based on dst_ifdown */ if (rt6) { dst = &rt6->dst; netdev_ref_replace(dst->dev, net->loopback_dev, &dst->dev_tracker, GFP_KERNEL); dst->dev = net->loopback_dev; dst_release(dst); } } static int vrf_rt6_create(struct net_device *dev) { int flags = DST_NOPOLICY | DST_NOXFRM; struct net_vrf *vrf = netdev_priv(dev); struct net *net = dev_net(dev); struct rt6_info *rt6; int rc = -ENOMEM; /* IPv6 can be CONFIG enabled and then disabled runtime */ if (!ipv6_mod_enabled()) return 0; vrf->fib6_table = fib6_new_table(net, vrf->tb_id); if (!vrf->fib6_table) goto out; /* create a dst for routing packets out a VRF device */ rt6 = ip6_dst_alloc(net, dev, flags); if (!rt6) goto out; rt6->dst.output = vrf_output6; rcu_assign_pointer(vrf->rt6, rt6); rc = 0; out: return rc; } #else static struct sk_buff *vrf_ip6_out(struct 
net_device *vrf_dev, struct sock *sk, struct sk_buff *skb) { return skb; } static void vrf_rt6_release(struct net_device *dev, struct net_vrf *vrf) { } static int vrf_rt6_create(struct net_device *dev) { return 0; } #endif /* modelled after ip_finish_output2 */ static int vrf_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct rtable *rt = dst_rtable(dst); struct net_device *dev = dst->dev; unsigned int hh_len = LL_RESERVED_SPACE(dev); struct neighbour *neigh; bool is_v6gw = false; vrf_nf_reset_ct(skb); /* Be paranoid, rather than too clever. */ if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { skb = skb_expand_head(skb, hh_len); if (!skb) { dev->stats.tx_errors++; return -ENOMEM; } } rcu_read_lock(); neigh = ip_neigh_for_gw(rt, skb, &is_v6gw); if (!IS_ERR(neigh)) { int ret; sock_confirm_neigh(skb, neigh); /* if crossing protocols, can not use the cached header */ ret = neigh_output(neigh, skb, is_v6gw); rcu_read_unlock(); return ret; } rcu_read_unlock(); vrf_tx_error(skb->dev, skb); return -EINVAL; } static int vrf_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct net_device *dev = skb_dst(skb)->dev; IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len); skb->dev = dev; skb->protocol = htons(ETH_P_IP); return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, net, sk, skb, NULL, dev, vrf_finish_output, !(IPCB(skb)->flags & IPSKB_REROUTED)); } /* set dst on skb to send packet to us via dev_xmit path. Allows * packet to go through device based features such as qdisc, netfilter * hooks and packet sockets with skb->dev set to vrf device. 
*/ static struct sk_buff *vrf_ip_out_redirect(struct net_device *vrf_dev, struct sk_buff *skb) { struct net_vrf *vrf = netdev_priv(vrf_dev); struct dst_entry *dst = NULL; struct rtable *rth; rcu_read_lock(); rth = rcu_dereference(vrf->rth); if (likely(rth)) { dst = &rth->dst; dst_hold(dst); } rcu_read_unlock(); if (unlikely(!dst)) { vrf_tx_error(vrf_dev, skb); return NULL; } skb_dst_drop(skb); skb_dst_set(skb, dst); return skb; } static int vrf_output_direct_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { vrf_finish_direct(skb); return vrf_ip_local_out(net, sk, skb); } static int vrf_output_direct(struct net *net, struct sock *sk, struct sk_buff *skb) { int err = 1; skb->protocol = htons(ETH_P_IP); if (!(IPCB(skb)->flags & IPSKB_REROUTED)) err = nf_hook(NFPROTO_IPV4, NF_INET_POST_ROUTING, net, sk, skb, NULL, skb->dev, vrf_output_direct_finish); if (likely(err == 1)) vrf_finish_direct(skb); return err; } static int vrf_ip_out_direct_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { int err; err = vrf_output_direct(net, sk, skb); if (likely(err == 1)) err = vrf_ip_local_out(net, sk, skb); return err; } static struct sk_buff *vrf_ip_out_direct(struct net_device *vrf_dev, struct sock *sk, struct sk_buff *skb) { struct net *net = dev_net(vrf_dev); int err; skb->dev = vrf_dev; err = nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, net, sk, skb, NULL, vrf_dev, vrf_ip_out_direct_finish); if (likely(err == 1)) err = vrf_output_direct(net, sk, skb); if (likely(err == 1)) return skb; return NULL; } static struct sk_buff *vrf_ip_out(struct net_device *vrf_dev, struct sock *sk, struct sk_buff *skb) { /* don't divert multicast or local broadcast */ if (ipv4_is_multicast(ip_hdr(skb)->daddr) || ipv4_is_lbcast(ip_hdr(skb)->daddr)) return skb; vrf_nf_set_untracked(skb); if (qdisc_tx_is_default(vrf_dev) || IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) return vrf_ip_out_direct(vrf_dev, sk, skb); return vrf_ip_out_redirect(vrf_dev, skb); } /* called with rcu lock 
held */ static struct sk_buff *vrf_l3_out(struct net_device *vrf_dev, struct sock *sk, struct sk_buff *skb, u16 proto) { switch (proto) { case AF_INET: return vrf_ip_out(vrf_dev, sk, skb); case AF_INET6: return vrf_ip6_out(vrf_dev, sk, skb); } return skb; } /* holding rtnl */ static void vrf_rtable_release(struct net_device *dev, struct net_vrf *vrf) { struct rtable *rth = rtnl_dereference(vrf->rth); struct net *net = dev_net(dev); struct dst_entry *dst; RCU_INIT_POINTER(vrf->rth, NULL); synchronize_rcu(); /* move dev in dst's to loopback so this VRF device can be deleted * - based on dst_ifdown */ if (rth) { dst = &rth->dst; netdev_ref_replace(dst->dev, net->loopback_dev, &dst->dev_tracker, GFP_KERNEL); dst->dev = net->loopback_dev; dst_release(dst); } } static int vrf_rtable_create(struct net_device *dev) { struct net_vrf *vrf = netdev_priv(dev); struct rtable *rth; if (!fib_new_table(dev_net(dev), vrf->tb_id)) return -ENOMEM; /* create a dst for routing packets out through a VRF device */ rth = rt_dst_alloc(dev, 0, RTN_UNICAST, 1); if (!rth) return -ENOMEM; rth->dst.output = vrf_output; rcu_assign_pointer(vrf->rth, rth); return 0; } /**************************** device handling ********************/ /* cycle interface to flush neighbor cache and move routes across tables */ static void cycle_netdev(struct net_device *dev, struct netlink_ext_ack *extack) { unsigned int flags = dev->flags; int ret; if (!netif_running(dev)) return; ret = dev_change_flags(dev, flags & ~IFF_UP, extack); if (ret >= 0) ret = dev_change_flags(dev, flags, extack); if (ret < 0) { netdev_err(dev, "Failed to cycle device %s; route tables might be wrong!\n", dev->name); } } static int do_vrf_add_slave(struct net_device *dev, struct net_device *port_dev, struct netlink_ext_ack *extack) { int ret; /* do not allow loopback device to be enslaved to a VRF. * The vrf device acts as the loopback for the vrf. 
*/ if (port_dev == dev_net(dev)->loopback_dev) { NL_SET_ERR_MSG(extack, "Can not enslave loopback device to a VRF"); return -EOPNOTSUPP; } port_dev->priv_flags |= IFF_L3MDEV_SLAVE; ret = netdev_master_upper_dev_link(port_dev, dev, NULL, NULL, extack); if (ret < 0) goto err; cycle_netdev(port_dev, extack); return 0; err: port_dev->priv_flags &= ~IFF_L3MDEV_SLAVE; return ret; } static int vrf_add_slave(struct net_device *dev, struct net_device *port_dev, struct netlink_ext_ack *extack) { if (netif_is_l3_master(port_dev)) { NL_SET_ERR_MSG(extack, "Can not enslave an L3 master device to a VRF"); return -EINVAL; } if (netif_is_l3_slave(port_dev)) return -EINVAL; return do_vrf_add_slave(dev, port_dev, extack); } /* inverse of do_vrf_add_slave */ static int do_vrf_del_slave(struct net_device *dev, struct net_device *port_dev) { netdev_upper_dev_unlink(port_dev, dev); port_dev->priv_flags &= ~IFF_L3MDEV_SLAVE; cycle_netdev(port_dev, NULL); return 0; } static int vrf_del_slave(struct net_device *dev, struct net_device *port_dev) { return do_vrf_del_slave(dev, port_dev); } static void vrf_dev_uninit(struct net_device *dev) { struct net_vrf *vrf = netdev_priv(dev); vrf_rtable_release(dev, vrf); vrf_rt6_release(dev, vrf); } static int vrf_dev_init(struct net_device *dev) { struct net_vrf *vrf = netdev_priv(dev); /* create the default dst which points back to us */ if (vrf_rtable_create(dev) != 0) goto out_nomem; if (vrf_rt6_create(dev) != 0) goto out_rth; dev->flags = IFF_MASTER | IFF_NOARP; /* similarly, oper state is irrelevant; set to up to avoid confusion */ dev->operstate = IF_OPER_UP; netdev_lockdep_set_classes(dev); return 0; out_rth: vrf_rtable_release(dev, vrf); out_nomem: return -ENOMEM; } static const struct net_device_ops vrf_netdev_ops = { .ndo_init = vrf_dev_init, .ndo_uninit = vrf_dev_uninit, .ndo_start_xmit = vrf_xmit, .ndo_set_mac_address = eth_mac_addr, .ndo_add_slave = vrf_add_slave, .ndo_del_slave = vrf_del_slave, }; static u32 vrf_fib_table(const struct 
net_device *dev) { struct net_vrf *vrf = netdev_priv(dev); return vrf->tb_id; } static int vrf_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { kfree_skb(skb); return 0; } static struct sk_buff *vrf_rcv_nfhook(u8 pf, unsigned int hook, struct sk_buff *skb, struct net_device *dev) { struct net *net = dev_net(dev); if (nf_hook(pf, hook, net, NULL, skb, dev, NULL, vrf_rcv_finish) != 1) skb = NULL; /* kfree_skb(skb) handled by nf code */ return skb; } static int vrf_prepare_mac_header(struct sk_buff *skb, struct net_device *vrf_dev, u16 proto) { struct ethhdr *eth; int err; /* in general, we do not know if there is enough space in the head of * the packet for hosting the mac header. */ err = skb_cow_head(skb, LL_RESERVED_SPACE(vrf_dev)); if (unlikely(err)) /* no space in the skb head */ return -ENOBUFS; __skb_push(skb, ETH_HLEN); eth = (struct ethhdr *)skb->data; skb_reset_mac_header(skb); skb_reset_mac_len(skb); /* we set the ethernet destination and the source addresses to the * address of the VRF device. */ ether_addr_copy(eth->h_dest, vrf_dev->dev_addr); ether_addr_copy(eth->h_source, vrf_dev->dev_addr); eth->h_proto = htons(proto); /* the destination address of the Ethernet frame corresponds to the * address set on the VRF interface; therefore, the packet is intended * to be processed locally. */ skb->protocol = eth->h_proto; skb->pkt_type = PACKET_HOST; skb_postpush_rcsum(skb, skb->data, ETH_HLEN); skb_pull_inline(skb, ETH_HLEN); return 0; } /* prepare and add the mac header to the packet if it was not set previously. * In this way, packet sniffers such as tcpdump can parse the packet correctly. * If the mac header was already set, the original mac header is left * untouched and the function returns immediately. 
*/ static int vrf_add_mac_header_if_unset(struct sk_buff *skb, struct net_device *vrf_dev, u16 proto, struct net_device *orig_dev) { if (skb_mac_header_was_set(skb) && dev_has_header(orig_dev)) return 0; return vrf_prepare_mac_header(skb, vrf_dev, proto); } #if IS_ENABLED(CONFIG_IPV6) /* neighbor handling is done with actual device; do not want * to flip skb->dev for those ndisc packets. This really fails * for multiple next protocols (e.g., NEXTHDR_HOP). But it is * a start. */ static bool ipv6_ndisc_frame(const struct sk_buff *skb) { const struct ipv6hdr *iph = ipv6_hdr(skb); bool rc = false; if (iph->nexthdr == NEXTHDR_ICMP) { const struct icmp6hdr *icmph; struct icmp6hdr _icmph; icmph = skb_header_pointer(skb, sizeof(*iph), sizeof(_icmph), &_icmph); if (!icmph) goto out; switch (icmph->icmp6_type) { case NDISC_ROUTER_SOLICITATION: case NDISC_ROUTER_ADVERTISEMENT: case NDISC_NEIGHBOUR_SOLICITATION: case NDISC_NEIGHBOUR_ADVERTISEMENT: case NDISC_REDIRECT: rc = true; break; } } out: return rc; } static struct rt6_info *vrf_ip6_route_lookup(struct net *net, const struct net_device *dev, struct flowi6 *fl6, int ifindex, const struct sk_buff *skb, int flags) { struct net_vrf *vrf = netdev_priv(dev); return ip6_pol_route(net, vrf->fib6_table, ifindex, fl6, skb, flags); } static void vrf_ip6_input_dst(struct sk_buff *skb, struct net_device *vrf_dev, int ifindex) { const struct ipv6hdr *iph = ipv6_hdr(skb); struct flowi6 fl6 = { .flowi6_iif = ifindex, .flowi6_mark = skb->mark, .flowi6_proto = iph->nexthdr, .daddr = iph->daddr, .saddr = iph->saddr, .flowlabel = ip6_flowinfo(iph), }; struct net *net = dev_net(vrf_dev); struct rt6_info *rt6; rt6 = vrf_ip6_route_lookup(net, vrf_dev, &fl6, ifindex, skb, RT6_LOOKUP_F_HAS_SADDR | RT6_LOOKUP_F_IFACE); if (unlikely(!rt6)) return; if (unlikely(&rt6->dst == &net->ipv6.ip6_null_entry->dst)) return; skb_dst_set(skb, &rt6->dst); } static struct sk_buff *vrf_ip6_rcv(struct net_device *vrf_dev, struct sk_buff *skb) { int orig_iif = 
skb->skb_iif;
	bool need_strict = rt6_need_strict(&ipv6_hdr(skb)->daddr);
	bool is_ndisc = ipv6_ndisc_frame(skb);

	/* loopback, multicast & non-ND link-local traffic; do not push through
	 * packet taps again. Reset pkt_type for upper layers to process skb.
	 * For non-loopback strict packets, determine the dst using the original
	 * ifindex.
	 */
	if (skb->pkt_type == PACKET_LOOPBACK || (need_strict && !is_ndisc)) {
		skb->dev = vrf_dev;
		skb->skb_iif = vrf_dev->ifindex;
		IP6CB(skb)->flags |= IP6SKB_L3SLAVE;

		if (skb->pkt_type == PACKET_LOOPBACK)
			skb->pkt_type = PACKET_HOST;
		else
			vrf_ip6_input_dst(skb, vrf_dev, orig_iif);

		/* this path skips the taps and the netfilter hook below */
		goto out;
	}

	/* if packet is NDISC then keep the ingress interface */
	if (!is_ndisc) {
		struct net_device *orig_dev = skb->dev;

		dev_dstats_rx_add(vrf_dev, skb->len);
		skb->dev = vrf_dev;
		skb->skb_iif = vrf_dev->ifindex;

		/* only when the VRF device's ptype_all list is non-empty:
		 * expose the mac header, hand the skb to
		 * dev_queue_xmit_nit(), then hide the header again
		 */
		if (!list_empty(&vrf_dev->ptype_all)) {
			int err;

			err = vrf_add_mac_header_if_unset(skb, vrf_dev,
							  ETH_P_IPV6,
							  orig_dev);
			if (likely(!err)) {
				skb_push(skb, skb->mac_len);
				dev_queue_xmit_nit(skb, vrf_dev);
				skb_pull(skb, skb->mac_len);
			}
		}

		IP6CB(skb)->flags |= IP6SKB_L3SLAVE;
	}

	/* only NDISC frames with a strict destination get here with
	 * need_strict set; the other strict packets left via the first branch
	 */
	if (need_strict)
		vrf_ip6_input_dst(skb, vrf_dev, orig_iif);

	/* may return NULL, see vrf_rcv_nfhook() */
	skb = vrf_rcv_nfhook(NFPROTO_IPV6, NF_INET_PRE_ROUTING, skb, vrf_dev);
out:
	return skb;
}

#else
/* IPv6 support not built: hand the skb back unchanged */
static struct sk_buff *vrf_ip6_rcv(struct net_device *vrf_dev,
				   struct sk_buff *skb)
{
	return skb;
}
#endif

/* IPv4 receive handler of the VRF device, called from vrf_l3_rcv().
 *
 * The skb is always re-homed to @vrf_dev and flagged IPSKB_L3SLAVE before
 * any of the early exits below.
 */
static struct sk_buff *vrf_ip_rcv(struct net_device *vrf_dev,
				  struct sk_buff *skb)
{
	struct net_device *orig_dev = skb->dev;

	skb->dev = vrf_dev;
	skb->skb_iif = vrf_dev->ifindex;
	IPCB(skb)->flags |= IPSKB_L3SLAVE;

	/* multicast destinations skip stats, taps and the netfilter hook */
	if (ipv4_is_multicast(ip_hdr(skb)->daddr))
		goto out;

	/* loopback traffic; do not push through packet taps again.
* Reset pkt_type for upper layers to process skb */
	if (skb->pkt_type == PACKET_LOOPBACK) {
		skb->pkt_type = PACKET_HOST;
		goto out;
	}

	dev_dstats_rx_add(vrf_dev, skb->len);

	/* only when the VRF device's ptype_all list is non-empty: expose the
	 * mac header, hand the skb to dev_queue_xmit_nit(), then hide the
	 * header again
	 */
	if (!list_empty(&vrf_dev->ptype_all)) {
		int err;

		err = vrf_add_mac_header_if_unset(skb, vrf_dev, ETH_P_IP,
						  orig_dev);
		if (likely(!err)) {
			skb_push(skb, skb->mac_len);
			dev_queue_xmit_nit(skb, vrf_dev);
			skb_pull(skb, skb->mac_len);
		}
	}

	/* may return NULL, see vrf_rcv_nfhook() */
	skb = vrf_rcv_nfhook(NFPROTO_IPV4, NF_INET_PRE_ROUTING, skb, vrf_dev);
out:
	return skb;
}

/* called with rcu lock held
 *
 * Dispatches on @proto: AF_INET goes to vrf_ip_rcv(), AF_INET6 to
 * vrf_ip6_rcv(); any other value returns @skb untouched.
 */
static struct sk_buff *vrf_l3_rcv(struct net_device *vrf_dev,
				  struct sk_buff *skb,
				  u16 proto)
{
	switch (proto) {
	case AF_INET:
		return vrf_ip_rcv(vrf_dev, skb);
	case AF_INET6:
		return vrf_ip6_rcv(vrf_dev, skb);
	}

	return skb;
}

#if IS_ENABLED(CONFIG_IPV6)
/* send to link-local or multicast address via interface enslaved to
 * VRF device. Force lookup to VRF table without changing flow struct
 * Note: Caller to this function must hold rcu_read_lock() and no refcnt
 * is taken on the dst by this function.
*/
static struct dst_entry *vrf_link_scope_lookup(const struct net_device *dev,
					      struct flowi6 *fl6)
{
	struct net *net = dev_net(dev);
	int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_DST_NOREF;
	struct dst_entry *dst = NULL;
	struct rt6_info *rt;

	/* VRF device does not have a link-local address and
	 * sending packets to link-local or mcast addresses over
	 * a VRF device does not make sense
	 */
	if (fl6->flowi6_oif == dev->ifindex) {
		dst = &net->ipv6.ip6_null_entry->dst;
		return dst;
	}

	/* tell the lookup that a source address is present in the flow */
	if (!ipv6_addr_any(&fl6->saddr))
		flags |= RT6_LOOKUP_F_HAS_SADDR;

	rt = vrf_ip6_route_lookup(net, dev, fl6, fl6->flowi6_oif, NULL, flags);
	if (rt)
		dst = &rt->dst;

	/* NULL when the lookup returned no route */
	return dst;
}
#endif

/* l3mdev callbacks implemented by the VRF driver */
static const struct l3mdev_ops vrf_l3mdev_ops = {
	.l3mdev_fib_table	= vrf_fib_table,
	.l3mdev_l3_rcv		= vrf_l3_rcv,
	.l3mdev_l3_out		= vrf_l3_out,
#if IS_ENABLED(CONFIG_IPV6)
	.l3mdev_link_scope_lookup = vrf_link_scope_lookup,
#endif
};

/* ethtool get_drvinfo callback: report DRV_NAME and DRV_VERSION */
static void vrf_get_drvinfo(struct net_device *dev,
			    struct ethtool_drvinfo *info)
{
	strscpy(info->driver, DRV_NAME, sizeof(info->driver));
	strscpy(info->version, DRV_VERSION, sizeof(info->version));
}

static const struct ethtool_ops vrf_ethtool_ops = {
	.get_drvinfo	= vrf_get_drvinfo,
};

/* Size of the netlink message built by vrf_fib_rule(): the aligned
 * fib_rule_hdr plus the three attributes that function adds.
 */
static inline size_t vrf_fib_rule_nl_size(void)
{
	size_t sz;

	sz  = NLMSG_ALIGN(sizeof(struct fib_rule_hdr));
	sz += nla_total_size(sizeof(u8));	/* FRA_L3MDEV */
	sz += nla_total_size(sizeof(u32));	/* FRA_PRIORITY */
	sz += nla_total_size(sizeof(u8));       /* FRA_PROTOCOL */

	return sz;
}

/* Add (@add_it true) or delete (@add_it false) the l3mdev FIB rule for
 * address family @family in the network namespace of @dev.
 *
 * Returns 0 without doing anything for AF_INET6 / RTNL_FAMILY_IP6MR when
 * ipv6_mod_enabled() is false, -ENOMEM when the message cannot be allocated,
 * -EMSGSIZE when building the message fails, otherwise the result of
 * fib_newrule() / fib_delrule() with -EEXIST (add) and -ENOENT (delete)
 * mapped to 0.
 */
static int vrf_fib_rule(const struct net_device *dev, __u8 family, bool add_it)
{
	struct fib_rule_hdr *frh;
	struct nlmsghdr *nlh;
	struct sk_buff *skb;
	int err;

	if ((family == AF_INET6 || family == RTNL_FAMILY_IP6MR) &&
	    !ipv6_mod_enabled())
		return 0;

	skb = nlmsg_new(vrf_fib_rule_nl_size(), GFP_KERNEL);
	if (!skb)
		return -ENOMEM;

	nlh = nlmsg_put(skb, 0, 0, 0, sizeof(*frh), 0);
	if (!nlh)
		goto nla_put_failure;

	/* rule only needs to appear once */
	nlh->nlmsg_flags |= NLM_F_EXCL;

	frh = nlmsg_data(nlh);
	memset(frh, 0, sizeof(*frh));
frh->family = family; frh->action = FR_ACT_TO_TBL; if (nla_put_u8(skb, FRA_PROTOCOL, RTPROT_KERNEL)) goto nla_put_failure; if (nla_put_u8(skb, FRA_L3MDEV, 1)) goto nla_put_failure; if (nla_put_u32(skb, FRA_PRIORITY, FIB_RULE_PREF)) goto nla_put_failure; nlmsg_end(skb, nlh); if (add_it) { err = fib_newrule(dev_net(dev), skb, nlh, NULL, true); if (err == -EEXIST) err = 0; } else { err = fib_delrule(dev_net(dev), skb, nlh, NULL, true); if (err == -ENOENT) err = 0; } nlmsg_free(skb); return err; nla_put_failure: nlmsg_free(skb); return -EMSGSIZE; } static int vrf_add_fib_rules(const struct net_device *dev) { int err; err = vrf_fib_rule(dev, AF_INET, true); if (err < 0) goto out_err; err = vrf_fib_rule(dev, AF_INET6, true); if (err < 0) goto ipv6_err; #if IS_ENABLED(CONFIG_IP_MROUTE_MULTIPLE_TABLES) err = vrf_fib_rule(dev, RTNL_FAMILY_IPMR, true); if (err < 0) goto ipmr_err; #endif #if IS_ENABLED(CONFIG_IPV6_MROUTE_MULTIPLE_TABLES) err = vrf_fib_rule(dev, RTNL_FAMILY_IP6MR, true); if (err < 0) goto ip6mr_err; #endif return 0; #if IS_ENABLED(CONFIG_IPV6_MROUTE_MULTIPLE_TABLES) ip6mr_err: vrf_fib_rule(dev, RTNL_FAMILY_IPMR, false); #endif #if IS_ENABLED(CONFIG_IP_MROUTE_MULTIPLE_TABLES) ipmr_err: vrf_fib_rule(dev, AF_INET6, false); #endif ipv6_err: vrf_fib_rule(dev, AF_INET, false); out_err: netdev_err(dev, "Failed to add FIB rules.\n"); return err; } static void vrf_setup(struct net_device *dev) { ether_setup(dev); /* Initialize the device structure. */ dev->netdev_ops = &vrf_netdev_ops; dev->l3mdev_ops = &vrf_l3mdev_ops; dev->ethtool_ops = &vrf_ethtool_ops; dev->needs_free_netdev = true; /* Fill in device structure with ethernet-generic values. */ eth_hw_addr_random(dev); /* don't acquire vrf device's netif_tx_lock when transmitting */ dev->lltx = true; /* don't allow vrf devices to change network namespaces. 
*/
	dev->netns_immutable = true;

	/* does not make sense for a VLAN to be added to a vrf device */
	dev->features   |= NETIF_F_VLAN_CHALLENGED;

	/* enable offload features */
	dev->features   |= NETIF_F_GSO_SOFTWARE;
	dev->features   |= NETIF_F_RXCSUM | NETIF_F_HW_CSUM | NETIF_F_SCTP_CRC;
	dev->features   |= NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA;

	/* hw_features and hw_enc_features mirror the set built above */
	dev->hw_features = dev->features;
	dev->hw_enc_features = dev->features;

	/* default to no qdisc; user can add if desired */
	dev->priv_flags |= IFF_NO_QUEUE;
	dev->priv_flags |= IFF_NO_RX_HANDLER;
	dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;

	/* VRF devices do not care about MTU, but if the MTU is set
	 * too low then the ipv4 and ipv6 protocols are disabled
	 * which breaks networking.
	 */
	dev->min_mtu = IPV6_MIN_MTU;
	dev->max_mtu = IP6_MAX_MTU;
	dev->mtu = dev->max_mtu;

	dev->pcpu_stat_type = NETDEV_PCPU_STAT_DSTATS;
}

/* rtnl_link_ops validate callback.
 *
 * Only IFLA_ADDRESS is checked, and only when present: a length other than
 * ETH_ALEN yields -EINVAL, an address rejected by is_valid_ether_addr()
 * yields -EADDRNOTAVAIL. Returns 0 otherwise.
 */
static int vrf_validate(struct nlattr *tb[], struct nlattr *data[],
			struct netlink_ext_ack *extack)
{
	if (tb[IFLA_ADDRESS]) {
		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) {
			NL_SET_ERR_MSG(extack, "Invalid hardware address");
			return -EINVAL;
		}
		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) {
			NL_SET_ERR_MSG(extack, "Invalid hardware address");
			return -EADDRNOTAVAIL;
		}
	}
	return 0;
}

/* rtnl_link_ops dellink callback: detach every lower device from the VRF,
 * drop the VRF from the table map and queue it on @head for unregistration.
 */
static void vrf_dellink(struct net_device *dev, struct list_head *head)
{
	struct net_device *port_dev;
	struct list_head *iter;

	netdev_for_each_lower_dev(dev, port_dev, iter)
		vrf_del_slave(dev, port_dev);

	vrf_map_unregister_dev(dev);

	unregister_netdevice_queue(dev, head);
}

/* rtnl_link_ops newlink callback: create a VRF device bound to the table
 * given in IFLA_VRF_TABLE.
 *
 * Returns -EINVAL when the attribute is missing or equals RT_TABLE_UNSPEC,
 * otherwise 0 or the error of the failing registration step.
 */
static int vrf_newlink(struct net_device *dev,
		       struct rtnl_newlink_params *params,
		       struct netlink_ext_ack *extack)
{
	struct net_vrf *vrf = netdev_priv(dev);
	struct nlattr **data = params->data;
	struct netns_vrf *nn_vrf;
	bool *add_fib_rules;
	struct net *net;
	int err;

	if (!data || !data[IFLA_VRF_TABLE]) {
		NL_SET_ERR_MSG(extack, "VRF table id is missing");
		return -EINVAL;
	}

	vrf->tb_id = nla_get_u32(data[IFLA_VRF_TABLE]);
	if (vrf->tb_id == RT_TABLE_UNSPEC) {
NL_SET_ERR_MSG_ATTR(extack, data[IFLA_VRF_TABLE],
				    "Invalid VRF table id");
		return -EINVAL;
	}

	dev->priv_flags |= IFF_L3MDEV_MASTER;

	err = register_netdevice(dev);
	if (err)
		goto out;

	/* mapping between table_id and vrf;
	 * note: such binding could not be done in the dev init function
	 * because dev->ifindex id is not available yet.
	 */
	vrf->ifindex = dev->ifindex;

	err = vrf_map_register_dev(dev, extack);
	if (err) {
		unregister_netdevice(dev);
		goto out;
	}

	net = dev_net(dev);
	nn_vrf = net_generic(net, vrf_net_id);

	/* the per-netns flag is cleared after the first successful call, so
	 * the FIB rules are installed once per namespace
	 */
	add_fib_rules = &nn_vrf->add_fib_rules;
	if (*add_fib_rules) {
		err = vrf_add_fib_rules(dev);
		if (err) {
			vrf_map_unregister_dev(dev);
			unregister_netdevice(dev);
			goto out;
		}

		*add_fib_rules = false;
	}

out:
	return err;
}

/* rtnl_link_ops get_size callback: room for one u32 attribute */
static size_t vrf_nl_getsize(const struct net_device *dev)
{
	return nla_total_size(sizeof(u32));  /* IFLA_VRF_TABLE */
}

/* rtnl_link_ops fill_info callback: emit the table id as IFLA_VRF_TABLE */
static int vrf_fillinfo(struct sk_buff *skb,
			const struct net_device *dev)
{
	struct net_vrf *vrf = netdev_priv(dev);

	return nla_put_u32(skb, IFLA_VRF_TABLE, vrf->tb_id);
}

/* rtnl_link_ops get_slave_size callback: room for one u32 attribute */
static size_t vrf_get_slave_size(const struct net_device *bond_dev,
				 const struct net_device *slave_dev)
{
	return nla_total_size(sizeof(u32));  /* IFLA_VRF_PORT_TABLE */
}

/* rtnl_link_ops fill_slave_info callback: emit the table id of the master
 * @vrf_dev as IFLA_VRF_PORT_TABLE. Returns -EMSGSIZE when the attribute
 * cannot be added, 0 otherwise.
 */
static int vrf_fill_slave_info(struct sk_buff *skb,
			       const struct net_device *vrf_dev,
			       const struct net_device *slave_dev)
{
	struct net_vrf *vrf = netdev_priv(vrf_dev);

	if (nla_put_u32(skb, IFLA_VRF_PORT_TABLE, vrf->tb_id))
		return -EMSGSIZE;

	return 0;
}

/* netlink policy: IFLA_VRF_TABLE is a u32 */
static const struct nla_policy vrf_nl_policy[IFLA_VRF_MAX + 1] = {
	[IFLA_VRF_TABLE] = { .type = NLA_U32 },
};

/* rtnl link type description of the VRF driver */
static struct rtnl_link_ops vrf_link_ops __read_mostly = {
	.kind		= DRV_NAME,
	.priv_size	= sizeof(struct net_vrf),

	.get_size	= vrf_nl_getsize,
	.policy		= vrf_nl_policy,
	.validate	= vrf_validate,
	.fill_info	= vrf_fillinfo,

	.get_slave_size  = vrf_get_slave_size,
	.fill_slave_info = vrf_fill_slave_info,

	.newlink	= vrf_newlink,
	.dellink	= vrf_dellink,
	.setup		= vrf_setup,
	.maxtype	= IFLA_VRF_MAX,
};

/* netdevice notifier callback, see vrf_notifier_block */
static int vrf_device_event(struct notifier_block
*unused,
			    unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);

	/* only care about unregister events to drop slave references */
	if (event == NETDEV_UNREGISTER) {
		struct net_device *vrf_dev;

		if (!netif_is_l3_slave(dev))
			goto out;

		vrf_dev = netdev_master_upper_dev_get(dev);
		vrf_del_slave(vrf_dev, dev);
	}
out:
	/* always NOTIFY_DONE, whatever the event */
	return NOTIFY_DONE;
}

static struct notifier_block vrf_notifier_block __read_mostly = {
	.notifier_call = vrf_device_event,
};

/* Initialise a vrf_map: lock, hash table and strict_mode = false.
 * Always returns 0.
 */
static int vrf_map_init(struct vrf_map *vmap)
{
	spin_lock_init(&vmap->vmap_lock);
	hash_init(vmap->ht);

	vmap->strict_mode = false;

	return 0;
}

#ifdef CONFIG_SYSCTL
/* Read vmap->strict_mode between vrf_map_lock() and vrf_map_unlock() */
static bool vrf_strict_mode(struct vrf_map *vmap)
{
	bool strict_mode;

	vrf_map_lock(vmap);
	strict_mode = vmap->strict_mode;
	vrf_map_unlock(vmap);

	return strict_mode;
}

/* Switch strict mode to @new_mode between vrf_map_lock() and
 * vrf_map_unlock().
 *
 * Returns 0 when the mode already matches or was changed, -EBUSY when strict
 * mode is requested while vmap->shared_tables is non-zero.
 */
static int vrf_strict_mode_change(struct vrf_map *vmap, bool new_mode)
{
	bool *cur_mode;
	int res = 0;

	vrf_map_lock(vmap);

	cur_mode = &vmap->strict_mode;
	if (*cur_mode == new_mode)
		goto unlock;

	if (*cur_mode) {
		/* disable strict mode */
		*cur_mode = false;
	} else {
		if (vmap->shared_tables) {
			/* we cannot allow strict_mode because there are some
			 * vrfs that share one or more tables.
			 */
			res = -EBUSY;
			goto unlock;
		}

		/* no tables are shared among vrfs, so we can go back
		 * to 1:1 association between a vrf with its table.
*/
		*cur_mode = true;
	}

unlock:
	vrf_map_unlock(vmap);

	return res;
}

/* proc handler of the "strict_mode" sysctl.
 *
 * A temporary ctl_table pointing at a local int, limited to
 * SYSCTL_ZERO..SYSCTL_ONE, is handed to proc_dointvec_minmax(). On read the
 * int is preloaded with the current mode; on a successful write the parsed
 * value is applied through vrf_strict_mode_change().
 */
static int vrf_shared_table_handler(const struct ctl_table *table, int write,
				    void *buffer, size_t *lenp, loff_t *ppos)
{
	/* extra1 carries the owning netns, see vrf_netns_init_sysctl() */
	struct net *net = (struct net *)table->extra1;
	struct vrf_map *vmap = netns_vrf_map(net);
	int proc_strict_mode = 0;
	struct ctl_table tmp = {
		.procname	= table->procname,
		.data		= &proc_strict_mode,
		.maxlen		= sizeof(int),
		.mode		= table->mode,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE,
	};
	int ret;

	if (!write)
		proc_strict_mode = vrf_strict_mode(vmap);

	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);

	if (write && ret == 0)
		ret = vrf_strict_mode_change(vmap, (bool)proc_strict_mode);

	return ret;
}

/* template that vrf_netns_init_sysctl() duplicates for each namespace */
static const struct ctl_table vrf_table[] = {
	{
		.procname	= "strict_mode",
		.data		= NULL,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= vrf_shared_table_handler,
		/* set by the vrf_netns_init */
		.extra1		= NULL,
	},
};

/* Register a per-netns copy of vrf_table under "net/vrf".
 *
 * Returns 0 on success, -ENOMEM when the copy or the registration fails (the
 * copy is freed in the latter case).
 */
static int vrf_netns_init_sysctl(struct net *net, struct netns_vrf *nn_vrf)
{
	struct ctl_table *table;

	table = kmemdup(vrf_table, sizeof(vrf_table), GFP_KERNEL);
	if (!table)
		return -ENOMEM;

	/* init the extra1 parameter with the reference to current netns */
	table[0].extra1 = net;

	nn_vrf->ctl_hdr = register_net_sysctl_sz(net, "net/vrf", table,
						 ARRAY_SIZE(vrf_table));
	if (!nn_vrf->ctl_hdr) {
		kfree(table);
		return -ENOMEM;
	}

	return 0;
}

/* Unregister the per-netns sysctl table and free the duplicated array */
static void vrf_netns_exit_sysctl(struct net *net)
{
	struct netns_vrf *nn_vrf = net_generic(net, vrf_net_id);
	const struct ctl_table *table;

	/* fetch the table pointer before the header is unregistered */
	table = nn_vrf->ctl_hdr->ctl_table_arg;
	unregister_net_sysctl_table(nn_vrf->ctl_hdr);
	kfree(table);
}
#else
/* CONFIG_SYSCTL disabled: nothing to register */
static int vrf_netns_init_sysctl(struct net *net, struct netns_vrf *nn_vrf)
{
	return 0;
}

/* CONFIG_SYSCTL disabled: nothing to unregister */
static void vrf_netns_exit_sysctl(struct net *net)
{
}
#endif

/* Initialize per network namespace state */
static int __net_init vrf_netns_init(struct net *net)
{
	struct netns_vrf *nn_vrf = net_generic(net, vrf_net_id);

	nn_vrf->add_fib_rules = true;
	vrf_map_init(&nn_vrf->vmap);

	return
vrf_netns_init_sysctl(net, nn_vrf); } static void __net_exit vrf_netns_exit(struct net *net) { vrf_netns_exit_sysctl(net); } static struct pernet_operations vrf_net_ops __net_initdata = { .init = vrf_netns_init, .exit = vrf_netns_exit, .id = &vrf_net_id, .size = sizeof(struct netns_vrf), }; static int __init vrf_init_module(void) { int rc; register_netdevice_notifier(&vrf_notifier_block); rc = register_pernet_subsys(&vrf_net_ops); if (rc < 0) goto error; rc = l3mdev_table_lookup_register(L3MDEV_TYPE_VRF, vrf_ifindex_lookup_by_table_id); if (rc < 0) goto unreg_pernet; rc = rtnl_link_register(&vrf_link_ops); if (rc < 0) goto table_lookup_unreg; return 0; table_lookup_unreg: l3mdev_table_lookup_unregister(L3MDEV_TYPE_VRF, vrf_ifindex_lookup_by_table_id); unreg_pernet: unregister_pernet_subsys(&vrf_net_ops); error: unregister_netdevice_notifier(&vrf_notifier_block); return rc; } module_init(vrf_init_module); MODULE_AUTHOR("Shrijeet Mukherjee, David Ahern"); MODULE_DESCRIPTION("Device driver to instantiate VRF domains"); MODULE_LICENSE("GPL"); MODULE_ALIAS_RTNL_LINK(DRV_NAME); MODULE_VERSION(DRV_VERSION); |
23 23 23 20 22 21 19 3 7 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 | /* * Cryptographic API. * * MD5 Message Digest Algorithm (RFC1321). * * Derived from cryptoapi implementation, originally based on the * public domain implementation written by Colin Plumb in 1993. * * Copyright (c) Cryptoapi developers. * Copyright (c) 2002 James Morris <jmorris@intercode.com.au> * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free * Software Foundation; either version 2 of the License, or (at your option) * any later version. 
* */
#include <crypto/internal/hash.h>
#include <crypto/md5.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/string.h>
#include <linux/types.h>
#include <asm/byteorder.h>

/* MD5 digest of the zero-length message (d41d8cd9...ecf8427e) */
const u8 md5_zero_message_hash[MD5_DIGEST_SIZE] = {
	0xd4, 0x1d, 0x8c, 0xd9, 0x8f, 0x00, 0xb2, 0x04,
	0xe9, 0x80, 0x09, 0x98, 0xec, 0xf8, 0x42, 0x7e,
};
EXPORT_SYMBOL_GPL(md5_zero_message_hash);

/* The four mixing functions used by md5_transform(); F2 is F1 with its
 * arguments rotated.
 */
#define F1(x, y, z)	(z ^ (x & (y ^ z)))
#define F2(x, y, z)	F1(z, x, y)
#define F3(x, y, z)	(x ^ y ^ z)
#define F4(x, y, z)	(y ^ (x | ~z))

/* One step: w += f(x, y, z) + in, then w is rotated left by s bits and x is
 * added. Note that w is evaluated several times, so it must be a plain
 * variable.
 */
#define MD5STEP(f, w, x, y, z, in, s) \
	(w += f(x, y, z) + in, w = (w<<s | w>>(32-s)) + x)

/* Mix the sixteen 32-bit words at @in into the four 32-bit state words at
 * @hash: 64 MD5STEP invocations, sixteen with each of F1..F4, after which the
 * working values are added back onto @hash.
 */
static void md5_transform(__u32 *hash, __u32 const *in)
{
	u32 a, b, c, d;

	a = hash[0];
	b = hash[1];
	c = hash[2];
	d = hash[3];

	/* sixteen steps with F1 */
	MD5STEP(F1, a, b, c, d, in[0] + 0xd76aa478, 7);
	MD5STEP(F1, d, a, b, c, in[1] + 0xe8c7b756, 12);
	MD5STEP(F1, c, d, a, b, in[2] + 0x242070db, 17);
	MD5STEP(F1, b, c, d, a, in[3] + 0xc1bdceee, 22);
	MD5STEP(F1, a, b, c, d, in[4] + 0xf57c0faf, 7);
	MD5STEP(F1, d, a, b, c, in[5] + 0x4787c62a, 12);
	MD5STEP(F1, c, d, a, b, in[6] + 0xa8304613, 17);
	MD5STEP(F1, b, c, d, a, in[7] + 0xfd469501, 22);
	MD5STEP(F1, a, b, c, d, in[8] + 0x698098d8, 7);
	MD5STEP(F1, d, a, b, c, in[9] + 0x8b44f7af, 12);
	MD5STEP(F1, c, d, a, b, in[10] + 0xffff5bb1, 17);
	MD5STEP(F1, b, c, d, a, in[11] + 0x895cd7be, 22);
	MD5STEP(F1, a, b, c, d, in[12] + 0x6b901122, 7);
	MD5STEP(F1, d, a, b, c, in[13] + 0xfd987193, 12);
	MD5STEP(F1, c, d, a, b, in[14] + 0xa679438e, 17);
	MD5STEP(F1, b, c, d, a, in[15] + 0x49b40821, 22);

	/* sixteen steps with F2 */
	MD5STEP(F2, a, b, c, d, in[1] + 0xf61e2562, 5);
	MD5STEP(F2, d, a, b, c, in[6] + 0xc040b340, 9);
	MD5STEP(F2, c, d, a, b, in[11] + 0x265e5a51, 14);
	MD5STEP(F2, b, c, d, a, in[0] + 0xe9b6c7aa, 20);
	MD5STEP(F2, a, b, c, d, in[5] + 0xd62f105d, 5);
	MD5STEP(F2, d, a, b, c, in[10] + 0x02441453, 9);
	MD5STEP(F2, c, d, a, b, in[15] + 0xd8a1e681, 14);
	MD5STEP(F2, b, c, d, a, in[4] + 0xe7d3fbc8, 20);
	MD5STEP(F2, a, b, c, d, in[9] + 0x21e1cde6, 5);
	MD5STEP(F2, d, a, b, c, in[14] +
0xc33707d6, 9);
	MD5STEP(F2, c, d, a, b, in[3] + 0xf4d50d87, 14);
	MD5STEP(F2, b, c, d, a, in[8] + 0x455a14ed, 20);
	MD5STEP(F2, a, b, c, d, in[13] + 0xa9e3e905, 5);
	MD5STEP(F2, d, a, b, c, in[2] + 0xfcefa3f8, 9);
	MD5STEP(F2, c, d, a, b, in[7] + 0x676f02d9, 14);
	MD5STEP(F2, b, c, d, a, in[12] + 0x8d2a4c8a, 20);

	/* sixteen steps with F3 */
	MD5STEP(F3, a, b, c, d, in[5] + 0xfffa3942, 4);
	MD5STEP(F3, d, a, b, c, in[8] + 0x8771f681, 11);
	MD5STEP(F3, c, d, a, b, in[11] + 0x6d9d6122, 16);
	MD5STEP(F3, b, c, d, a, in[14] + 0xfde5380c, 23);
	MD5STEP(F3, a, b, c, d, in[1] + 0xa4beea44, 4);
	MD5STEP(F3, d, a, b, c, in[4] + 0x4bdecfa9, 11);
	MD5STEP(F3, c, d, a, b, in[7] + 0xf6bb4b60, 16);
	MD5STEP(F3, b, c, d, a, in[10] + 0xbebfbc70, 23);
	MD5STEP(F3, a, b, c, d, in[13] + 0x289b7ec6, 4);
	MD5STEP(F3, d, a, b, c, in[0] + 0xeaa127fa, 11);
	MD5STEP(F3, c, d, a, b, in[3] + 0xd4ef3085, 16);
	MD5STEP(F3, b, c, d, a, in[6] + 0x04881d05, 23);
	MD5STEP(F3, a, b, c, d, in[9] + 0xd9d4d039, 4);
	MD5STEP(F3, d, a, b, c, in[12] + 0xe6db99e5, 11);
	MD5STEP(F3, c, d, a, b, in[15] + 0x1fa27cf8, 16);
	MD5STEP(F3, b, c, d, a, in[2] + 0xc4ac5665, 23);

	/* sixteen steps with F4 */
	MD5STEP(F4, a, b, c, d, in[0] + 0xf4292244, 6);
	MD5STEP(F4, d, a, b, c, in[7] + 0x432aff97, 10);
	MD5STEP(F4, c, d, a, b, in[14] + 0xab9423a7, 15);
	MD5STEP(F4, b, c, d, a, in[5] + 0xfc93a039, 21);
	MD5STEP(F4, a, b, c, d, in[12] + 0x655b59c3, 6);
	MD5STEP(F4, d, a, b, c, in[3] + 0x8f0ccc92, 10);
	MD5STEP(F4, c, d, a, b, in[10] + 0xffeff47d, 15);
	MD5STEP(F4, b, c, d, a, in[1] + 0x85845dd1, 21);
	MD5STEP(F4, a, b, c, d, in[8] + 0x6fa87e4f, 6);
	MD5STEP(F4, d, a, b, c, in[15] + 0xfe2ce6e0, 10);
	MD5STEP(F4, c, d, a, b, in[6] + 0xa3014314, 15);
	MD5STEP(F4, b, c, d, a, in[13] + 0x4e0811a1, 21);
	MD5STEP(F4, a, b, c, d, in[4] + 0xf7537e82, 6);
	MD5STEP(F4, d, a, b, c, in[11] + 0xbd3af235, 10);
	MD5STEP(F4, c, d, a, b, in[2] + 0x2ad7d2bb, 15);
	MD5STEP(F4, b, c, d, a, in[9] + 0xeb86d391, 21);

	/* fold the working values back into the running state */
	hash[0] += a;
	hash[1] += b;
	hash[2] += c;
	hash[3] += d;
}

/* Convert the buffered block from little endian to CPU order in place and
 * run md5_transform() on it.
 */
static inline void md5_transform_helper(struct md5_state
*ctx)
{
	le32_to_cpu_array(ctx->block, sizeof(ctx->block) / sizeof(u32));
	md5_transform(ctx->hash, ctx->block);
}

/* shash init callback: load the initial state MD5_H0..MD5_H3 and reset the
 * byte counter. Always returns 0.
 */
static int md5_init(struct shash_desc *desc)
{
	struct md5_state *mctx = shash_desc_ctx(desc);

	mctx->hash[0] = MD5_H0;
	mctx->hash[1] = MD5_H1;
	mctx->hash[2] = MD5_H2;
	mctx->hash[3] = MD5_H3;
	mctx->byte_count = 0;

	return 0;
}

/* shash update callback: absorb @len bytes from @data.
 *
 * Input is collected in mctx->block; each time the block is full it is
 * transformed. Bytes that do not fill a block stay buffered for the next
 * call. Always returns 0.
 */
static int md5_update(struct shash_desc *desc, const u8 *data, unsigned int len)
{
	struct md5_state *mctx = shash_desc_ctx(desc);
	/* free space left in the block buffer (byte_count & 0x3f is the fill
	 * level)
	 */
	const u32 avail = sizeof(mctx->block) - (mctx->byte_count & 0x3f);

	mctx->byte_count += len;

	/* not enough input to complete the block: just buffer it */
	if (avail > len) {
		memcpy((char *)mctx->block + (sizeof(mctx->block) - avail),
		       data, len);
		return 0;
	}

	/* top up the partially filled block and process it */
	memcpy((char *)mctx->block + (sizeof(mctx->block) - avail),
	       data, avail);

	md5_transform_helper(mctx);
	data += avail;
	len -= avail;

	/* process the remaining full blocks */
	while (len >= sizeof(mctx->block)) {
		memcpy(mctx->block, data, sizeof(mctx->block));
		md5_transform_helper(mctx);
		data += sizeof(mctx->block);
		len -= sizeof(mctx->block);
	}

	/* keep the tail for the next call */
	memcpy(mctx->block, data, len);

	return 0;
}

/* shash final callback: pad the buffered data, append the message length in
 * bits, run the last transform(s), write the digest to @out and clear the
 * context. Always returns 0.
 */
static int md5_final(struct shash_desc *desc, u8 *out)
{
	struct md5_state *mctx = shash_desc_ctx(desc);
	const unsigned int offset = mctx->byte_count & 0x3f;
	char *p = (char *)mctx->block + offset;
	/* zero bytes needed after the 0x80 marker to reach byte 56, where the
	 * 64-bit length starts; negative when the marker lands past that
	 */
	int padding = 56 - (offset + 1);

	*p++ = 0x80;
	if (padding < 0) {
		/* no room for the length: zero the rest of this block,
		 * process it and continue padding in a fresh block
		 */
		memset(p, 0x00, padding + sizeof (u64));
		md5_transform_helper(mctx);
		p = (char *)mctx->block;
		padding = 56;
	}

	memset(p, 0, padding);
	/* byte count converted to a bit count, split into two 32-bit words */
	mctx->block[14] = mctx->byte_count << 3;
	mctx->block[15] = mctx->byte_count >> 29;
	/* convert everything except the two length words just stored */
	le32_to_cpu_array(mctx->block, (sizeof(mctx->block) -
	                  sizeof(u64)) / sizeof(u32));
	md5_transform(mctx->hash, mctx->block);
	cpu_to_le32_array(mctx->hash, sizeof(mctx->hash) / sizeof(u32));
	memcpy(out, mctx->hash, sizeof(mctx->hash));
	/* wipe the state once the digest has been copied out */
	memset(mctx, 0, sizeof(*mctx));

	return 0;
}

/* shash export callback: copy the whole md5_state to @out */
static int md5_export(struct shash_desc *desc, void *out)
{
	struct md5_state *ctx = shash_desc_ctx(desc);

	memcpy(out, ctx, sizeof(*ctx));
	return 0;
}

/* shash import callback: overwrite the md5_state with the copy at @in */
static int md5_import(struct shash_desc *desc, const void *in)
{
	struct md5_state
*ctx = shash_desc_ctx(desc);

	memcpy(ctx, in, sizeof(*ctx));
	return 0;
}

/* shash algorithm description: "md5" implemented by "md5-generic"; the
 * descriptor context and the exported state are both a struct md5_state.
 */
static struct shash_alg alg = {
	.digestsize	=	MD5_DIGEST_SIZE,
	.init		=	md5_init,
	.update		=	md5_update,
	.final		=	md5_final,
	.export		=	md5_export,
	.import		=	md5_import,
	.descsize	=	sizeof(struct md5_state),
	.statesize	=	sizeof(struct md5_state),
	.base		=	{
		.cra_name	 =	"md5",
		.cra_driver_name =	"md5-generic",
		.cra_blocksize	 =	MD5_HMAC_BLOCK_SIZE,
		.cra_module	 =	THIS_MODULE,
	}
};

/* Module init: register the algorithm; returns crypto_register_shash()'s
 * result.
 */
static int __init md5_mod_init(void)
{
	return crypto_register_shash(&alg);
}

/* Module exit: unregister the algorithm */
static void __exit md5_mod_fini(void)
{
	crypto_unregister_shash(&alg);
}

subsys_initcall(md5_mod_init);
module_exit(md5_mod_fini);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("MD5 Message Digest Algorithm");
MODULE_ALIAS_CRYPTO("md5");
|
2 35 36 35 35 2 16 3 13 9 1 8 5 3 5 3 6 2 6 2 6 2 8 8 10 1 7 1 15 15 14 15 16 16 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 |