Total coverage: 144239 of 1820974 (8%)
/*
 * include/linux/topology.h
 *
 * Written by: Matthew Dobson, IBM Corporation
 *
 * Copyright (C) 2002, IBM Corp.
 *
 * All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
 * NON INFRINGEMENT.  See the GNU General Public License for more
 * details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 * Send feedback to <colpatch@us.ibm.com>
 */
#ifndef _LINUX_TOPOLOGY_H
#define _LINUX_TOPOLOGY_H

#include <linux/arch_topology.h>
#include <linux/cpumask.h>
#include <linux/bitops.h>
#include <linux/mmzone.h>
#include <linux/smp.h>
#include <linux/percpu.h>
#include <asm/topology.h>

#ifndef nr_cpus_node
#define nr_cpus_node(node) cpumask_weight(cpumask_of_node(node))
#endif

#define for_each_node_with_cpus(node)			\
	for_each_online_node(node)			\
		if (nr_cpus_node(node))

int arch_update_cpu_topology(void);

/* Conform to ACPI 2.0 SLIT distance definitions */
#define LOCAL_DISTANCE		10
#define REMOTE_DISTANCE		20
#define DISTANCE_BITS		8
#ifndef node_distance
#define node_distance(from,to)	((from) == (to) ? LOCAL_DISTANCE : REMOTE_DISTANCE)
#endif
#ifndef RECLAIM_DISTANCE
/*
 * If the distance between nodes in a system is larger than RECLAIM_DISTANCE
 * (in whatever arch specific measurement units returned by node_distance())
 * and node_reclaim_mode is enabled then the VM will only call node_reclaim()
 * on nodes within this distance.
 */
#define RECLAIM_DISTANCE 30
#endif

/*
 * The following tunable allows platforms to override the default node
 * reclaim distance (RECLAIM_DISTANCE) if remote memory accesses are
 * sufficiently fast that the default value actually hurts
 * performance.
 *
 * AMD EPYC machines use this because even though the 2-hop distance
 * is 32 (3.2x slower than a local memory access) performance actually
 * *improves* if allowed to reclaim memory and load balance tasks
 * between NUMA nodes 2-hops apart.
 */
extern int __read_mostly node_reclaim_distance;

#ifndef PENALTY_FOR_NODE_WITH_CPUS
#define PENALTY_FOR_NODE_WITH_CPUS	(1)
#endif

#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
DECLARE_PER_CPU(int, numa_node);

#ifndef numa_node_id
/* Returns the number of the current Node. */
static inline int numa_node_id(void)
{
	return raw_cpu_read(numa_node);
}
#endif

#ifndef cpu_to_node
static inline int cpu_to_node(int cpu)
{
	return per_cpu(numa_node, cpu);
}
#endif

#ifndef set_numa_node
static inline void set_numa_node(int node)
{
	this_cpu_write(numa_node, node);
}
#endif

#ifndef set_cpu_numa_node
static inline void set_cpu_numa_node(int cpu, int node)
{
	per_cpu(numa_node, cpu) = node;
}
#endif

#else	/* !CONFIG_USE_PERCPU_NUMA_NODE_ID */

/* Returns the number of the current Node. */
#ifndef numa_node_id
static inline int numa_node_id(void)
{
	return cpu_to_node(raw_smp_processor_id());
}
#endif

#endif	/* [!]CONFIG_USE_PERCPU_NUMA_NODE_ID */

#ifdef CONFIG_HAVE_MEMORYLESS_NODES

/*
 * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
 * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined.
 * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem().
 */
DECLARE_PER_CPU(int, _numa_mem_);

#ifndef set_numa_mem
static inline void set_numa_mem(int node)
{
	this_cpu_write(_numa_mem_, node);
}
#endif

#ifndef numa_mem_id
/* Returns the number of the nearest Node with memory */
static inline int numa_mem_id(void)
{
	return raw_cpu_read(_numa_mem_);
}
#endif

#ifndef cpu_to_mem
static inline int cpu_to_mem(int cpu)
{
	return per_cpu(_numa_mem_, cpu);
}
#endif

#ifndef set_cpu_numa_mem
static inline void set_cpu_numa_mem(int cpu, int node)
{
	per_cpu(_numa_mem_, cpu) = node;
}
#endif

#else	/* !CONFIG_HAVE_MEMORYLESS_NODES */

#ifndef numa_mem_id
/* Returns the number of the nearest Node with memory */
static inline int numa_mem_id(void)
{
	return numa_node_id();
}
#endif

#ifndef cpu_to_mem
static inline int cpu_to_mem(int cpu)
{
	return cpu_to_node(cpu);
}
#endif

#endif	/* [!]CONFIG_HAVE_MEMORYLESS_NODES */

#if defined(topology_die_id) && defined(topology_die_cpumask)
#define TOPOLOGY_DIE_SYSFS
#endif
#if defined(topology_cluster_id) && defined(topology_cluster_cpumask)
#define TOPOLOGY_CLUSTER_SYSFS
#endif
#if defined(topology_book_id) && defined(topology_book_cpumask)
#define TOPOLOGY_BOOK_SYSFS
#endif
#if defined(topology_drawer_id) && defined(topology_drawer_cpumask)
#define TOPOLOGY_DRAWER_SYSFS
#endif

#ifndef topology_physical_package_id
#define topology_physical_package_id(cpu)	((void)(cpu), -1)
#endif
#ifndef topology_die_id
#define topology_die_id(cpu)			((void)(cpu), -1)
#endif
#ifndef topology_cluster_id
#define topology_cluster_id(cpu)		((void)(cpu), -1)
#endif
#ifndef topology_core_id
#define topology_core_id(cpu)			((void)(cpu), 0)
#endif
#ifndef topology_book_id
#define topology_book_id(cpu)			((void)(cpu), -1)
#endif
#ifndef topology_drawer_id
#define topology_drawer_id(cpu)			((void)(cpu), -1)
#endif
#ifndef topology_ppin
#define topology_ppin(cpu)			((void)(cpu), 0ull)
#endif
#ifndef topology_sibling_cpumask
#define topology_sibling_cpumask(cpu)		cpumask_of(cpu)
#endif
#ifndef topology_core_cpumask
#define topology_core_cpumask(cpu)		cpumask_of(cpu)
#endif
#ifndef topology_cluster_cpumask
#define topology_cluster_cpumask(cpu)		cpumask_of(cpu)
#endif
#ifndef topology_die_cpumask
#define topology_die_cpumask(cpu)		cpumask_of(cpu)
#endif
#ifndef topology_book_cpumask
#define topology_book_cpumask(cpu)		cpumask_of(cpu)
#endif
#ifndef topology_drawer_cpumask
#define topology_drawer_cpumask(cpu)		cpumask_of(cpu)
#endif

#if defined(CONFIG_SCHED_SMT) && !defined(cpu_smt_mask)
static inline const struct cpumask *cpu_smt_mask(int cpu)
{
	return topology_sibling_cpumask(cpu);
}
#endif

static inline const struct cpumask *cpu_cpu_mask(int cpu)
{
	return cpumask_of_node(cpu_to_node(cpu));
}

#ifdef CONFIG_NUMA
int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node);
extern const struct cpumask *sched_numa_hop_mask(unsigned int node, unsigned int hops);
#else
static __always_inline int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node)
{
	return cpumask_nth_and(cpu, cpus, cpu_online_mask);
}

static inline const struct cpumask *
sched_numa_hop_mask(unsigned int node, unsigned int hops)
{
	return ERR_PTR(-EOPNOTSUPP);
}
#endif	/* CONFIG_NUMA */

/**
 * for_each_numa_hop_mask - iterate over cpumasks of increasing NUMA distance
 *                          from a given node.
 * @mask: the iteration variable.
 * @node: the NUMA node to start the search from.
 *
 * Requires rcu_lock to be held.
 *
 * Yields cpu_online_mask for @node == NUMA_NO_NODE.
 */
#define for_each_numa_hop_mask(mask, node)				       \
	for (unsigned int __hops = 0;					       \
	     mask = (node != NUMA_NO_NODE || __hops) ?			       \
		     sched_numa_hop_mask(node, __hops) :		       \
		     cpu_online_mask,					       \
	     !IS_ERR_OR_NULL(mask);					       \
	     __hops++)

#endif /* _LINUX_TOPOLOGY_H */
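The for_each_numa_hop_mask() iterator above must be run under the RCU read lock and stops once sched_numa_hop_mask() runs out of hop levels. Below is a minimal caller-side sketch, not part of topology.h: dump_numa_hops() is a hypothetical helper used only for illustration.

```c
#include <linux/topology.h>
#include <linux/rcupdate.h>
#include <linux/printk.h>

/* Hypothetical helper: report how many CPUs are visible at each hop
 * level starting from @node. Holds the RCU read lock around the
 * iteration, as the macro's documentation requires.
 */
static void dump_numa_hops(int node)
{
	const struct cpumask *mask;
	unsigned int level = 0;

	rcu_read_lock();
	for_each_numa_hop_mask(mask, node)
		pr_info("node %d, hop level %u: %u CPUs\n",
			node, level++, cpumask_weight(mask));
	rcu_read_unlock();
}
```

Passing NUMA_NO_NODE simply yields cpu_online_mask once, so the helper degrades gracefully on non-NUMA configurations.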
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Contiguous Memory Allocator
 *
 * Copyright (c) 2010-2011 by Samsung Electronics.
* Copyright IBM Corporation, 2013 * Copyright LG Electronics Inc., 2014 * Written by: * Marek Szyprowski <m.szyprowski@samsung.com> * Michal Nazarewicz <mina86@mina86.com> * Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> * Joonsoo Kim <iamjoonsoo.kim@lge.com> */ #define pr_fmt(fmt) "cma: " fmt #define CREATE_TRACE_POINTS #include <linux/memblock.h> #include <linux/err.h> #include <linux/mm.h> #include <linux/sizes.h> #include <linux/slab.h> #include <linux/log2.h> #include <linux/cma.h> #include <linux/highmem.h> #include <linux/io.h> #include <linux/kmemleak.h> #include <trace/events/cma.h> #include "internal.h" #include "cma.h" struct cma cma_areas[MAX_CMA_AREAS]; unsigned int cma_area_count; static DEFINE_MUTEX(cma_mutex); phys_addr_t cma_get_base(const struct cma *cma) { return PFN_PHYS(cma->base_pfn); } unsigned long cma_get_size(const struct cma *cma) { return cma->count << PAGE_SHIFT; } const char *cma_get_name(const struct cma *cma) { return cma->name; } static unsigned long cma_bitmap_aligned_mask(const struct cma *cma, unsigned int align_order) { if (align_order <= cma->order_per_bit) return 0; return (1UL << (align_order - cma->order_per_bit)) - 1; } /* * Find the offset of the base PFN from the specified align_order. * The value returned is represented in order_per_bits. */ static unsigned long cma_bitmap_aligned_offset(const struct cma *cma, unsigned int align_order) { return (cma->base_pfn & ((1UL << align_order) - 1)) >> cma->order_per_bit; } static unsigned long cma_bitmap_pages_to_bits(const struct cma *cma, unsigned long pages) { return ALIGN(pages, 1UL << cma->order_per_bit) >> cma->order_per_bit; } static void cma_clear_bitmap(struct cma *cma, unsigned long pfn, unsigned long count) { unsigned long bitmap_no, bitmap_count; unsigned long flags; bitmap_no = (pfn - cma->base_pfn) >> cma->order_per_bit; bitmap_count = cma_bitmap_pages_to_bits(cma, count); spin_lock_irqsave(&cma->lock, flags); bitmap_clear(cma->bitmap, bitmap_no, bitmap_count); spin_unlock_irqrestore(&cma->lock, flags); } static void __init cma_activate_area(struct cma *cma) { unsigned long base_pfn = cma->base_pfn, pfn; struct zone *zone; cma->bitmap = bitmap_zalloc(cma_bitmap_maxno(cma), GFP_KERNEL); if (!cma->bitmap) goto out_error; /* * alloc_contig_range() requires the pfn range specified to be in the * same zone. Simplify by forcing the entire CMA resv range to be in the * same zone. */ WARN_ON_ONCE(!pfn_valid(base_pfn)); zone = page_zone(pfn_to_page(base_pfn)); for (pfn = base_pfn + 1; pfn < base_pfn + cma->count; pfn++) { WARN_ON_ONCE(!pfn_valid(pfn)); if (page_zone(pfn_to_page(pfn)) != zone) goto not_in_zone; } for (pfn = base_pfn; pfn < base_pfn + cma->count; pfn += pageblock_nr_pages) init_cma_reserved_pageblock(pfn_to_page(pfn)); spin_lock_init(&cma->lock); #ifdef CONFIG_CMA_DEBUGFS INIT_HLIST_HEAD(&cma->mem_head); spin_lock_init(&cma->mem_head_lock); #endif return; not_in_zone: bitmap_free(cma->bitmap); out_error: /* Expose all pages to the buddy, they are useless for CMA. 
*/ if (!cma->reserve_pages_on_error) { for (pfn = base_pfn; pfn < base_pfn + cma->count; pfn++) free_reserved_page(pfn_to_page(pfn)); } totalcma_pages -= cma->count; cma->count = 0; pr_err("CMA area %s could not be activated\n", cma->name); } static int __init cma_init_reserved_areas(void) { int i; for (i = 0; i < cma_area_count; i++) cma_activate_area(&cma_areas[i]); return 0; } core_initcall(cma_init_reserved_areas); void __init cma_reserve_pages_on_error(struct cma *cma) { cma->reserve_pages_on_error = true; } /** * cma_init_reserved_mem() - create custom contiguous area from reserved memory * @base: Base address of the reserved area * @size: Size of the reserved area (in bytes), * @order_per_bit: Order of pages represented by one bit on bitmap. * @name: The name of the area. If this parameter is NULL, the name of * the area will be set to "cmaN", where N is a running counter of * used areas. * @res_cma: Pointer to store the created cma region. * * This function creates custom contiguous area from already reserved memory. */ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, unsigned int order_per_bit, const char *name, struct cma **res_cma) { struct cma *cma; /* Sanity checks */ if (cma_area_count == ARRAY_SIZE(cma_areas)) { pr_err("Not enough slots for CMA reserved regions!\n"); return -ENOSPC; } if (!size || !memblock_is_region_reserved(base, size)) return -EINVAL; /* * CMA uses CMA_MIN_ALIGNMENT_BYTES as alignment requirement which * needs pageblock_order to be initialized. Let's enforce it. */ if (!pageblock_order) { pr_err("pageblock_order not yet initialized. Called during early boot?\n"); return -EINVAL; } /* ensure minimal alignment required by mm core */ if (!IS_ALIGNED(base | size, CMA_MIN_ALIGNMENT_BYTES)) return -EINVAL; /* * Each reserved area must be initialised later, when more kernel * subsystems (like slab allocator) are available. */ cma = &cma_areas[cma_area_count]; if (name) snprintf(cma->name, CMA_MAX_NAME, name); else snprintf(cma->name, CMA_MAX_NAME, "cma%d\n", cma_area_count); cma->base_pfn = PFN_DOWN(base); cma->count = size >> PAGE_SHIFT; cma->order_per_bit = order_per_bit; *res_cma = cma; cma_area_count++; totalcma_pages += cma->count; return 0; } /** * cma_declare_contiguous_nid() - reserve custom contiguous area * @base: Base address of the reserved area optional, use 0 for any * @size: Size of the reserved area (in bytes), * @limit: End address of the reserved memory (optional, 0 for any). * @alignment: Alignment for the CMA area, should be power of 2 or zero * @order_per_bit: Order of pages represented by one bit on bitmap. * @fixed: hint about where to place the reserved area * @name: The name of the area. See function cma_init_reserved_mem() * @res_cma: Pointer to store the created cma region. * @nid: nid of the free area to find, %NUMA_NO_NODE for any node * * This function reserves memory from early allocator. It should be * called by arch specific code once the early allocator (memblock or bootmem) * has been activated and all other subsystems have already allocated/reserved * memory. This function allows to create custom reserved areas. * * If @fixed is true, reserve contiguous area at exactly @base. If false, * reserve in range from @base to @limit. 
*/ int __init cma_declare_contiguous_nid(phys_addr_t base, phys_addr_t size, phys_addr_t limit, phys_addr_t alignment, unsigned int order_per_bit, bool fixed, const char *name, struct cma **res_cma, int nid) { phys_addr_t memblock_end = memblock_end_of_DRAM(); phys_addr_t highmem_start; int ret; /* * We can't use __pa(high_memory) directly, since high_memory * isn't a valid direct map VA, and DEBUG_VIRTUAL will (validly) * complain. Find the boundary by adding one to the last valid * address. */ highmem_start = __pa(high_memory - 1) + 1; pr_debug("%s(size %pa, base %pa, limit %pa alignment %pa)\n", __func__, &size, &base, &limit, &alignment); if (cma_area_count == ARRAY_SIZE(cma_areas)) { pr_err("Not enough slots for CMA reserved regions!\n"); return -ENOSPC; } if (!size) return -EINVAL; if (alignment && !is_power_of_2(alignment)) return -EINVAL; if (!IS_ENABLED(CONFIG_NUMA)) nid = NUMA_NO_NODE; /* Sanitise input arguments. */ alignment = max_t(phys_addr_t, alignment, CMA_MIN_ALIGNMENT_BYTES); if (fixed && base & (alignment - 1)) { ret = -EINVAL; pr_err("Region at %pa must be aligned to %pa bytes\n", &base, &alignment); goto err; } base = ALIGN(base, alignment); size = ALIGN(size, alignment); limit &= ~(alignment - 1); if (!base) fixed = false; /* size should be aligned with order_per_bit */ if (!IS_ALIGNED(size >> PAGE_SHIFT, 1 << order_per_bit)) return -EINVAL; /* * If allocating at a fixed base the request region must not cross the * low/high memory boundary. */ if (fixed && base < highmem_start && base + size > highmem_start) { ret = -EINVAL; pr_err("Region at %pa defined on low/high memory boundary (%pa)\n", &base, &highmem_start); goto err; } /* * If the limit is unspecified or above the memblock end, its effective * value will be the memblock end. Set it explicitly to simplify further * checks. */ if (limit == 0 || limit > memblock_end) limit = memblock_end; if (base + size > limit) { ret = -EINVAL; pr_err("Size (%pa) of region at %pa exceeds limit (%pa)\n", &size, &base, &limit); goto err; } /* Reserve memory */ if (fixed) { if (memblock_is_region_reserved(base, size) || memblock_reserve(base, size) < 0) { ret = -EBUSY; goto err; } } else { phys_addr_t addr = 0; /* * If there is enough memory, try a bottom-up allocation first. * It will place the new cma area close to the start of the node * and guarantee that the compaction is moving pages out of the * cma area and not into it. * Avoid using first 4GB to not interfere with constrained zones * like DMA/DMA32. */ #ifdef CONFIG_PHYS_ADDR_T_64BIT if (!memblock_bottom_up() && memblock_end >= SZ_4G + size) { memblock_set_bottom_up(true); addr = memblock_alloc_range_nid(size, alignment, SZ_4G, limit, nid, true); memblock_set_bottom_up(false); } #endif /* * All pages in the reserved area must come from the same zone. * If the requested region crosses the low/high memory boundary, * try allocating from high memory first and fall back to low * memory in case of failure. 
*/ if (!addr && base < highmem_start && limit > highmem_start) { addr = memblock_alloc_range_nid(size, alignment, highmem_start, limit, nid, true); limit = highmem_start; } if (!addr) { addr = memblock_alloc_range_nid(size, alignment, base, limit, nid, true); if (!addr) { ret = -ENOMEM; goto err; } } /* * kmemleak scans/reads tracked objects for pointers to other * objects but this address isn't mapped and accessible */ kmemleak_ignore_phys(addr); base = addr; } ret = cma_init_reserved_mem(base, size, order_per_bit, name, res_cma); if (ret) goto free_mem; pr_info("Reserved %ld MiB at %pa on node %d\n", (unsigned long)size / SZ_1M, &base, nid); return 0; free_mem: memblock_phys_free(base, size); err: pr_err("Failed to reserve %ld MiB on node %d\n", (unsigned long)size / SZ_1M, nid); return ret; } static void cma_debug_show_areas(struct cma *cma) { unsigned long next_zero_bit, next_set_bit, nr_zero; unsigned long start = 0; unsigned long nr_part, nr_total = 0; unsigned long nbits = cma_bitmap_maxno(cma); spin_lock_irq(&cma->lock); pr_info("number of available pages: "); for (;;) { next_zero_bit = find_next_zero_bit(cma->bitmap, nbits, start); if (next_zero_bit >= nbits) break; next_set_bit = find_next_bit(cma->bitmap, nbits, next_zero_bit); nr_zero = next_set_bit - next_zero_bit; nr_part = nr_zero << cma->order_per_bit; pr_cont("%s%lu@%lu", nr_total ? "+" : "", nr_part, next_zero_bit); nr_total += nr_part; start = next_zero_bit + nr_zero; } pr_cont("=> %lu free of %lu total pages\n", nr_total, cma->count); spin_unlock_irq(&cma->lock); } static struct page *__cma_alloc(struct cma *cma, unsigned long count, unsigned int align, gfp_t gfp) { unsigned long mask, offset; unsigned long pfn = -1; unsigned long start = 0; unsigned long bitmap_maxno, bitmap_no, bitmap_count; unsigned long i; struct page *page = NULL; int ret = -ENOMEM; const char *name = cma ? cma->name : NULL; trace_cma_alloc_start(name, count, align); if (!cma || !cma->count || !cma->bitmap) return page; pr_debug("%s(cma %p, name: %s, count %lu, align %d)\n", __func__, (void *)cma, cma->name, count, align); if (!count) return page; mask = cma_bitmap_aligned_mask(cma, align); offset = cma_bitmap_aligned_offset(cma, align); bitmap_maxno = cma_bitmap_maxno(cma); bitmap_count = cma_bitmap_pages_to_bits(cma, count); if (bitmap_count > bitmap_maxno) return page; for (;;) { spin_lock_irq(&cma->lock); bitmap_no = bitmap_find_next_zero_area_off(cma->bitmap, bitmap_maxno, start, bitmap_count, mask, offset); if (bitmap_no >= bitmap_maxno) { spin_unlock_irq(&cma->lock); break; } bitmap_set(cma->bitmap, bitmap_no, bitmap_count); /* * It's safe to drop the lock here. We've marked this region for * our exclusive use. If the migration fails we will take the * lock again and unmark it. */ spin_unlock_irq(&cma->lock); pfn = cma->base_pfn + (bitmap_no << cma->order_per_bit); mutex_lock(&cma_mutex); ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA, gfp); mutex_unlock(&cma_mutex); if (ret == 0) { page = pfn_to_page(pfn); break; } cma_clear_bitmap(cma, pfn, count); if (ret != -EBUSY) break; pr_debug("%s(): memory range at pfn 0x%lx %p is busy, retrying\n", __func__, pfn, pfn_to_page(pfn)); trace_cma_alloc_busy_retry(cma->name, pfn, pfn_to_page(pfn), count, align); /* try again with a bit different memory target */ start = bitmap_no + mask + 1; } /* * CMA can allocate multiple page blocks, which results in different * blocks being marked with different tags. Reset the tags to ignore * those page blocks. 
*/ if (page) { for (i = 0; i < count; i++) page_kasan_tag_reset(nth_page(page, i)); } if (ret && !(gfp & __GFP_NOWARN)) { pr_err_ratelimited("%s: %s: alloc failed, req-size: %lu pages, ret: %d\n", __func__, cma->name, count, ret); cma_debug_show_areas(cma); } pr_debug("%s(): returned %p\n", __func__, page); trace_cma_alloc_finish(name, pfn, page, count, align, ret); if (page) { count_vm_event(CMA_ALLOC_SUCCESS); cma_sysfs_account_success_pages(cma, count); } else { count_vm_event(CMA_ALLOC_FAIL); cma_sysfs_account_fail_pages(cma, count); } return page; } /** * cma_alloc() - allocate pages from contiguous area * @cma: Contiguous memory region for which the allocation is performed. * @count: Requested number of pages. * @align: Requested alignment of pages (in PAGE_SIZE order). * @no_warn: Avoid printing message about failed allocation * * This function allocates part of contiguous memory on specific * contiguous memory area. */ struct page *cma_alloc(struct cma *cma, unsigned long count, unsigned int align, bool no_warn) { return __cma_alloc(cma, count, align, GFP_KERNEL | (no_warn ? __GFP_NOWARN : 0)); } struct folio *cma_alloc_folio(struct cma *cma, int order, gfp_t gfp) { struct page *page; if (WARN_ON(!order || !(gfp & __GFP_COMP))) return NULL; page = __cma_alloc(cma, 1 << order, order, gfp); return page ? page_folio(page) : NULL; } bool cma_pages_valid(struct cma *cma, const struct page *pages, unsigned long count) { unsigned long pfn; if (!cma || !pages) return false; pfn = page_to_pfn(pages); if (pfn < cma->base_pfn || pfn >= cma->base_pfn + cma->count) { pr_debug("%s(page %p, count %lu)\n", __func__, (void *)pages, count); return false; } return true; } /** * cma_release() - release allocated pages * @cma: Contiguous memory region for which the allocation is performed. * @pages: Allocated pages. * @count: Number of allocated pages. * * This function releases memory allocated by cma_alloc(). * It returns false when provided pages do not belong to contiguous area and * true otherwise. */ bool cma_release(struct cma *cma, const struct page *pages, unsigned long count) { unsigned long pfn; if (!cma_pages_valid(cma, pages, count)) return false; pr_debug("%s(page %p, count %lu)\n", __func__, (void *)pages, count); pfn = page_to_pfn(pages); VM_BUG_ON(pfn + count > cma->base_pfn + cma->count); free_contig_range(pfn, count); cma_clear_bitmap(cma, pfn, count); cma_sysfs_account_release_pages(cma, count); trace_cma_release(cma->name, pfn, pages, count); return true; } bool cma_free_folio(struct cma *cma, const struct folio *folio) { if (WARN_ON(!folio_test_large(folio))) return false; return cma_release(cma, &folio->page, folio_nr_pages(folio)); } int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data) { int i; for (i = 0; i < cma_area_count; i++) { int ret = it(&cma_areas[i], data); if (ret) return ret; } return 0; }
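cma_alloc() and cma_release() above are the runtime entry points; the CMA area itself must already have been reserved at boot via cma_declare_contiguous_nid() or cma_init_reserved_mem(). A minimal caller-side sketch follows; it is not part of cma.c, and grab_contig_buffer(), put_contig_buffer() and my_cma are illustrative names only.

```c
#include <linux/cma.h>
#include <linux/mm.h>
#include <linux/printk.h>

/* Hypothetical example: take @nr_pages physically contiguous pages from
 * a previously reserved CMA area. align is a page order; 0 requests no
 * alignment beyond PAGE_SIZE, and false asks cma_alloc() to warn on
 * failure.
 */
static struct page *grab_contig_buffer(struct cma *my_cma,
				       unsigned long nr_pages)
{
	return cma_alloc(my_cma, nr_pages, 0, false);
}

static void put_contig_buffer(struct cma *my_cma, struct page *pages,
			      unsigned long nr_pages)
{
	/* cma_release() returns false if the pages are not from this area */
	if (!cma_release(my_cma, pages, nr_pages))
		pr_warn("pages were not part of this CMA area\n");
}
```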
/*
 * Copyright (c) 2004 Topspin Communications.  All rights reserved.
 * Copyright (c) 2005 Intel Corporation. All rights reserved.
 * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
 * Copyright (c) 2005 Voltaire, Inc.  All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <linux/if_vlan.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/workqueue.h>
#include <linux/netdevice.h>
#include <net/addrconf.h>
#include <rdma/ib_cache.h>

#include "core_priv.h"

struct ib_pkey_cache {
	int		table_len;
	u16		table[] __counted_by(table_len);
};

struct ib_update_work {
	struct work_struct work;
	struct ib_event event;
	bool enforce_security;
};

union ib_gid zgid;
EXPORT_SYMBOL(zgid);

enum gid_attr_find_mask {
	GID_ATTR_FIND_MASK_GID		= 1UL << 0,
	GID_ATTR_FIND_MASK_NETDEV	= 1UL << 1,
	GID_ATTR_FIND_MASK_DEFAULT	= 1UL << 2,
	GID_ATTR_FIND_MASK_GID_TYPE	= 1UL << 3,
};

enum gid_table_entry_state {
	GID_TABLE_ENTRY_INVALID		= 1,
	GID_TABLE_ENTRY_VALID		= 2,
	/*
	 * Indicates that entry is pending to be removed, there may
	 * be active users of this GID entry.
	 * When last user of the GID entry releases reference to it,
	 * GID entry is detached from the table.
	 */
	GID_TABLE_ENTRY_PENDING_DEL	= 3,
};

struct roce_gid_ndev_storage {
	struct rcu_head rcu_head;
	struct net_device *ndev;
};

struct ib_gid_table_entry {
	struct kref			kref;
	struct work_struct		del_work;
	struct ib_gid_attr		attr;
	void				*context;
	/* Store the ndev pointer to release reference later on in
	 * call_rcu context because by that time gid_table_entry
	 * and attr might be already freed. So keep a copy of it.
	 * ndev_storage is freed by rcu callback.
	 */
	struct roce_gid_ndev_storage	*ndev_storage;
	enum gid_table_entry_state	state;
};

struct ib_gid_table {
	int				sz;
	/* In RoCE, adding a GID to the table requires:
	 * (a) Find if this GID is already exists.
	 * (b) Find a free space.
	 * (c) Write the new GID
	 *
	 * Delete requires different set of operations:
	 * (a) Find the GID
	 * (b) Delete it.
* **/ /* Any writer to data_vec must hold this lock and the write side of * rwlock. Readers must hold only rwlock. All writers must be in a * sleepable context. */ struct mutex lock; /* rwlock protects data_vec[ix]->state and entry pointer. */ rwlock_t rwlock; struct ib_gid_table_entry **data_vec; /* bit field, each bit indicates the index of default GID */ u32 default_gid_indices; }; static void dispatch_gid_change_event(struct ib_device *ib_dev, u32 port) { struct ib_event event; event.device = ib_dev; event.element.port_num = port; event.event = IB_EVENT_GID_CHANGE; ib_dispatch_event_clients(&event); } static const char * const gid_type_str[] = { /* IB/RoCE v1 value is set for IB_GID_TYPE_IB and IB_GID_TYPE_ROCE for * user space compatibility reasons. */ [IB_GID_TYPE_IB] = "IB/RoCE v1", [IB_GID_TYPE_ROCE] = "IB/RoCE v1", [IB_GID_TYPE_ROCE_UDP_ENCAP] = "RoCE v2", }; const char *ib_cache_gid_type_str(enum ib_gid_type gid_type) { if (gid_type < ARRAY_SIZE(gid_type_str) && gid_type_str[gid_type]) return gid_type_str[gid_type]; return "Invalid GID type"; } EXPORT_SYMBOL(ib_cache_gid_type_str); /** rdma_is_zero_gid - Check if given GID is zero or not. * @gid: GID to check * Returns true if given GID is zero, returns false otherwise. */ bool rdma_is_zero_gid(const union ib_gid *gid) { return !memcmp(gid, &zgid, sizeof(*gid)); } EXPORT_SYMBOL(rdma_is_zero_gid); /** is_gid_index_default - Check if a given index belongs to * reserved default GIDs or not. * @table: GID table pointer * @index: Index to check in GID table * Returns true if index is one of the reserved default GID index otherwise * returns false. */ static bool is_gid_index_default(const struct ib_gid_table *table, unsigned int index) { return index < 32 && (BIT(index) & table->default_gid_indices); } int ib_cache_gid_parse_type_str(const char *buf) { unsigned int i; size_t len; int err = -EINVAL; len = strlen(buf); if (len == 0) return -EINVAL; if (buf[len - 1] == '\n') len--; for (i = 0; i < ARRAY_SIZE(gid_type_str); ++i) if (gid_type_str[i] && !strncmp(buf, gid_type_str[i], len) && len == strlen(gid_type_str[i])) { err = i; break; } return err; } EXPORT_SYMBOL(ib_cache_gid_parse_type_str); static struct ib_gid_table *rdma_gid_table(struct ib_device *device, u32 port) { return device->port_data[port].cache.gid; } static bool is_gid_entry_free(const struct ib_gid_table_entry *entry) { return !entry; } static bool is_gid_entry_valid(const struct ib_gid_table_entry *entry) { return entry && entry->state == GID_TABLE_ENTRY_VALID; } static void schedule_free_gid(struct kref *kref) { struct ib_gid_table_entry *entry = container_of(kref, struct ib_gid_table_entry, kref); queue_work(ib_wq, &entry->del_work); } static void put_gid_ndev(struct rcu_head *head) { struct roce_gid_ndev_storage *storage = container_of(head, struct roce_gid_ndev_storage, rcu_head); WARN_ON(!storage->ndev); /* At this point its safe to release netdev reference, * as all callers working on gid_attr->ndev are done * using this netdev. */ dev_put(storage->ndev); kfree(storage); } static void free_gid_entry_locked(struct ib_gid_table_entry *entry) { struct ib_device *device = entry->attr.device; u32 port_num = entry->attr.port_num; struct ib_gid_table *table = rdma_gid_table(device, port_num); dev_dbg(&device->dev, "%s port=%u index=%u gid %pI6\n", __func__, port_num, entry->attr.index, entry->attr.gid.raw); write_lock_irq(&table->rwlock); /* * The only way to avoid overwriting NULL in table is * by comparing if it is same entry in table or not! 
* If new entry in table is added by the time we free here, * don't overwrite the table entry. */ if (entry == table->data_vec[entry->attr.index]) table->data_vec[entry->attr.index] = NULL; /* Now this index is ready to be allocated */ write_unlock_irq(&table->rwlock); if (entry->ndev_storage) call_rcu(&entry->ndev_storage->rcu_head, put_gid_ndev); kfree(entry); } static void free_gid_entry(struct kref *kref) { struct ib_gid_table_entry *entry = container_of(kref, struct ib_gid_table_entry, kref); free_gid_entry_locked(entry); } /** * free_gid_work - Release reference to the GID entry * @work: Work structure to refer to GID entry which needs to be * deleted. * * free_gid_work() frees the entry from the HCA's hardware table * if provider supports it. It releases reference to netdevice. */ static void free_gid_work(struct work_struct *work) { struct ib_gid_table_entry *entry = container_of(work, struct ib_gid_table_entry, del_work); struct ib_device *device = entry->attr.device; u32 port_num = entry->attr.port_num; struct ib_gid_table *table = rdma_gid_table(device, port_num); mutex_lock(&table->lock); free_gid_entry_locked(entry); mutex_unlock(&table->lock); } static struct ib_gid_table_entry * alloc_gid_entry(const struct ib_gid_attr *attr) { struct ib_gid_table_entry *entry; struct net_device *ndev; entry = kzalloc(sizeof(*entry), GFP_KERNEL); if (!entry) return NULL; ndev = rcu_dereference_protected(attr->ndev, 1); if (ndev) { entry->ndev_storage = kzalloc(sizeof(*entry->ndev_storage), GFP_KERNEL); if (!entry->ndev_storage) { kfree(entry); return NULL; } dev_hold(ndev); entry->ndev_storage->ndev = ndev; } kref_init(&entry->kref); memcpy(&entry->attr, attr, sizeof(*attr)); INIT_WORK(&entry->del_work, free_gid_work); entry->state = GID_TABLE_ENTRY_INVALID; return entry; } static void store_gid_entry(struct ib_gid_table *table, struct ib_gid_table_entry *entry) { entry->state = GID_TABLE_ENTRY_VALID; dev_dbg(&entry->attr.device->dev, "%s port=%u index=%u gid %pI6\n", __func__, entry->attr.port_num, entry->attr.index, entry->attr.gid.raw); lockdep_assert_held(&table->lock); write_lock_irq(&table->rwlock); table->data_vec[entry->attr.index] = entry; write_unlock_irq(&table->rwlock); } static void get_gid_entry(struct ib_gid_table_entry *entry) { kref_get(&entry->kref); } static void put_gid_entry(struct ib_gid_table_entry *entry) { kref_put(&entry->kref, schedule_free_gid); } static void put_gid_entry_locked(struct ib_gid_table_entry *entry) { kref_put(&entry->kref, free_gid_entry); } static int add_roce_gid(struct ib_gid_table_entry *entry) { const struct ib_gid_attr *attr = &entry->attr; int ret; if (!attr->ndev) { dev_err(&attr->device->dev, "%s NULL netdev port=%u index=%u\n", __func__, attr->port_num, attr->index); return -EINVAL; } if (rdma_cap_roce_gid_table(attr->device, attr->port_num)) { ret = attr->device->ops.add_gid(attr, &entry->context); if (ret) { dev_err(&attr->device->dev, "%s GID add failed port=%u index=%u\n", __func__, attr->port_num, attr->index); return ret; } } return 0; } /** * del_gid - Delete GID table entry * * @ib_dev: IB device whose GID entry to be deleted * @port: Port number of the IB device * @table: GID table of the IB device for a port * @ix: GID entry index to delete * */ static void del_gid(struct ib_device *ib_dev, u32 port, struct ib_gid_table *table, int ix) { struct roce_gid_ndev_storage *ndev_storage; struct ib_gid_table_entry *entry; lockdep_assert_held(&table->lock); dev_dbg(&ib_dev->dev, "%s port=%u index=%d gid %pI6\n", __func__, port, ix, 
table->data_vec[ix]->attr.gid.raw); write_lock_irq(&table->rwlock); entry = table->data_vec[ix]; entry->state = GID_TABLE_ENTRY_PENDING_DEL; /* * For non RoCE protocol, GID entry slot is ready to use. */ if (!rdma_protocol_roce(ib_dev, port)) table->data_vec[ix] = NULL; write_unlock_irq(&table->rwlock); if (rdma_cap_roce_gid_table(ib_dev, port)) ib_dev->ops.del_gid(&entry->attr, &entry->context); ndev_storage = entry->ndev_storage; if (ndev_storage) { entry->ndev_storage = NULL; rcu_assign_pointer(entry->attr.ndev, NULL); call_rcu(&ndev_storage->rcu_head, put_gid_ndev); } put_gid_entry_locked(entry); } /** * add_modify_gid - Add or modify GID table entry * * @table: GID table in which GID to be added or modified * @attr: Attributes of the GID * * Returns 0 on success or appropriate error code. It accepts zero * GID addition for non RoCE ports for HCA's who report them as valid * GID. However such zero GIDs are not added to the cache. */ static int add_modify_gid(struct ib_gid_table *table, const struct ib_gid_attr *attr) { struct ib_gid_table_entry *entry; int ret = 0; /* * Invalidate any old entry in the table to make it safe to write to * this index. */ if (is_gid_entry_valid(table->data_vec[attr->index])) del_gid(attr->device, attr->port_num, table, attr->index); /* * Some HCA's report multiple GID entries with only one valid GID, and * leave other unused entries as the zero GID. Convert zero GIDs to * empty table entries instead of storing them. */ if (rdma_is_zero_gid(&attr->gid)) return 0; entry = alloc_gid_entry(attr); if (!entry) return -ENOMEM; if (rdma_protocol_roce(attr->device, attr->port_num)) { ret = add_roce_gid(entry); if (ret) goto done; } store_gid_entry(table, entry); return 0; done: put_gid_entry(entry); return ret; } /* rwlock should be read locked, or lock should be held */ static int find_gid(struct ib_gid_table *table, const union ib_gid *gid, const struct ib_gid_attr *val, bool default_gid, unsigned long mask, int *pempty) { int i = 0; int found = -1; int empty = pempty ? -1 : 0; while (i < table->sz && (found < 0 || empty < 0)) { struct ib_gid_table_entry *data = table->data_vec[i]; struct ib_gid_attr *attr; int curr_index = i; i++; /* find_gid() is used during GID addition where it is expected * to return a free entry slot which is not duplicate. * Free entry slot is requested and returned if pempty is set, * so lookup free slot only if requested. */ if (pempty && empty < 0) { if (is_gid_entry_free(data) && default_gid == is_gid_index_default(table, curr_index)) { /* * Found an invalid (free) entry; allocate it. * If default GID is requested, then our * found slot must be one of the DEFAULT * reserved slots or we fail. * This ensures that only DEFAULT reserved * slots are used for default property GIDs. */ empty = curr_index; } } /* * Additionally find_gid() is used to find valid entry during * lookup operation; so ignore the entries which are marked as * pending for removal and the entries which are marked as * invalid. 
*/ if (!is_gid_entry_valid(data)) continue; if (found >= 0) continue; attr = &data->attr; if (mask & GID_ATTR_FIND_MASK_GID_TYPE && attr->gid_type != val->gid_type) continue; if (mask & GID_ATTR_FIND_MASK_GID && memcmp(gid, &data->attr.gid, sizeof(*gid))) continue; if (mask & GID_ATTR_FIND_MASK_NETDEV && attr->ndev != val->ndev) continue; if (mask & GID_ATTR_FIND_MASK_DEFAULT && is_gid_index_default(table, curr_index) != default_gid) continue; found = curr_index; } if (pempty) *pempty = empty; return found; } static void make_default_gid(struct net_device *dev, union ib_gid *gid) { gid->global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL); addrconf_ifid_eui48(&gid->raw[8], dev); } static int __ib_cache_gid_add(struct ib_device *ib_dev, u32 port, union ib_gid *gid, struct ib_gid_attr *attr, unsigned long mask, bool default_gid) { struct ib_gid_table *table; int ret = 0; int empty; int ix; /* Do not allow adding zero GID in support of * IB spec version 1.3 section 4.1.1 point (6) and * section 12.7.10 and section 12.7.20 */ if (rdma_is_zero_gid(gid)) return -EINVAL; table = rdma_gid_table(ib_dev, port); mutex_lock(&table->lock); ix = find_gid(table, gid, attr, default_gid, mask, &empty); if (ix >= 0) goto out_unlock; if (empty < 0) { ret = -ENOSPC; goto out_unlock; } attr->device = ib_dev; attr->index = empty; attr->port_num = port; attr->gid = *gid; ret = add_modify_gid(table, attr); if (!ret) dispatch_gid_change_event(ib_dev, port); out_unlock: mutex_unlock(&table->lock); if (ret) pr_warn("%s: unable to add gid %pI6 error=%d\n", __func__, gid->raw, ret); return ret; } int ib_cache_gid_add(struct ib_device *ib_dev, u32 port, union ib_gid *gid, struct ib_gid_attr *attr) { unsigned long mask = GID_ATTR_FIND_MASK_GID | GID_ATTR_FIND_MASK_GID_TYPE | GID_ATTR_FIND_MASK_NETDEV; return __ib_cache_gid_add(ib_dev, port, gid, attr, mask, false); } static int _ib_cache_gid_del(struct ib_device *ib_dev, u32 port, union ib_gid *gid, struct ib_gid_attr *attr, unsigned long mask, bool default_gid) { struct ib_gid_table *table; int ret = 0; int ix; table = rdma_gid_table(ib_dev, port); mutex_lock(&table->lock); ix = find_gid(table, gid, attr, default_gid, mask, NULL); if (ix < 0) { ret = -EINVAL; goto out_unlock; } del_gid(ib_dev, port, table, ix); dispatch_gid_change_event(ib_dev, port); out_unlock: mutex_unlock(&table->lock); if (ret) pr_debug("%s: can't delete gid %pI6 error=%d\n", __func__, gid->raw, ret); return ret; } int ib_cache_gid_del(struct ib_device *ib_dev, u32 port, union ib_gid *gid, struct ib_gid_attr *attr) { unsigned long mask = GID_ATTR_FIND_MASK_GID | GID_ATTR_FIND_MASK_GID_TYPE | GID_ATTR_FIND_MASK_DEFAULT | GID_ATTR_FIND_MASK_NETDEV; return _ib_cache_gid_del(ib_dev, port, gid, attr, mask, false); } int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u32 port, struct net_device *ndev) { struct ib_gid_table *table; int ix; bool deleted = false; table = rdma_gid_table(ib_dev, port); mutex_lock(&table->lock); for (ix = 0; ix < table->sz; ix++) { if (is_gid_entry_valid(table->data_vec[ix]) && table->data_vec[ix]->attr.ndev == ndev) { del_gid(ib_dev, port, table, ix); deleted = true; } } mutex_unlock(&table->lock); if (deleted) dispatch_gid_change_event(ib_dev, port); return 0; } /** * rdma_find_gid_by_port - Returns the GID entry attributes when it finds * a valid GID entry for given search parameters. It searches for the specified * GID value in the local software cache. * @ib_dev: The device to query. * @gid: The GID value to search for. 
* @gid_type: The GID type to search for. * @port: The port number of the device where the GID value should be searched. * @ndev: In RoCE, the net device of the device. NULL means ignore. * * Returns sgid attributes if the GID is found with valid reference or * returns ERR_PTR for the error. * The caller must invoke rdma_put_gid_attr() to release the reference. */ const struct ib_gid_attr * rdma_find_gid_by_port(struct ib_device *ib_dev, const union ib_gid *gid, enum ib_gid_type gid_type, u32 port, struct net_device *ndev) { int local_index; struct ib_gid_table *table; unsigned long mask = GID_ATTR_FIND_MASK_GID | GID_ATTR_FIND_MASK_GID_TYPE; struct ib_gid_attr val = {.ndev = ndev, .gid_type = gid_type}; const struct ib_gid_attr *attr; unsigned long flags; if (!rdma_is_port_valid(ib_dev, port)) return ERR_PTR(-ENOENT); table = rdma_gid_table(ib_dev, port); if (ndev) mask |= GID_ATTR_FIND_MASK_NETDEV; read_lock_irqsave(&table->rwlock, flags); local_index = find_gid(table, gid, &val, false, mask, NULL); if (local_index >= 0) { get_gid_entry(table->data_vec[local_index]); attr = &table->data_vec[local_index]->attr; read_unlock_irqrestore(&table->rwlock, flags); return attr; } read_unlock_irqrestore(&table->rwlock, flags); return ERR_PTR(-ENOENT); } EXPORT_SYMBOL(rdma_find_gid_by_port); /** * rdma_find_gid_by_filter - Returns the GID table attribute where a * specified GID value occurs * @ib_dev: The device to query. * @gid: The GID value to search for. * @port: The port number of the device where the GID value could be * searched. * @filter: The filter function is executed on any matching GID in the table. * If the filter function returns true, the corresponding index is returned, * otherwise, we continue searching the GID table. It's guaranteed that * while filter is executed, ndev field is valid and the structure won't * change. filter is executed in an atomic context. filter must not be NULL. * @context: Private data to pass into the call-back. * * rdma_find_gid_by_filter() searches for the specified GID value * of which the filter function returns true in the port's GID table. 
* */ const struct ib_gid_attr *rdma_find_gid_by_filter( struct ib_device *ib_dev, const union ib_gid *gid, u32 port, bool (*filter)(const union ib_gid *gid, const struct ib_gid_attr *, void *), void *context) { const struct ib_gid_attr *res = ERR_PTR(-ENOENT); struct ib_gid_table *table; unsigned long flags; unsigned int i; if (!rdma_is_port_valid(ib_dev, port)) return ERR_PTR(-EINVAL); table = rdma_gid_table(ib_dev, port); read_lock_irqsave(&table->rwlock, flags); for (i = 0; i < table->sz; i++) { struct ib_gid_table_entry *entry = table->data_vec[i]; if (!is_gid_entry_valid(entry)) continue; if (memcmp(gid, &entry->attr.gid, sizeof(*gid))) continue; if (filter(gid, &entry->attr, context)) { get_gid_entry(entry); res = &entry->attr; break; } } read_unlock_irqrestore(&table->rwlock, flags); return res; } static struct ib_gid_table *alloc_gid_table(int sz) { struct ib_gid_table *table = kzalloc(sizeof(*table), GFP_KERNEL); if (!table) return NULL; table->data_vec = kcalloc(sz, sizeof(*table->data_vec), GFP_KERNEL); if (!table->data_vec) goto err_free_table; mutex_init(&table->lock); table->sz = sz; rwlock_init(&table->rwlock); return table; err_free_table: kfree(table); return NULL; } static void release_gid_table(struct ib_device *device, struct ib_gid_table *table) { int i; if (!table) return; for (i = 0; i < table->sz; i++) { if (is_gid_entry_free(table->data_vec[i])) continue; WARN_ONCE(true, "GID entry ref leak for dev %s index %d ref=%u\n", dev_name(&device->dev), i, kref_read(&table->data_vec[i]->kref)); } mutex_destroy(&table->lock); kfree(table->data_vec); kfree(table); } static void cleanup_gid_table_port(struct ib_device *ib_dev, u32 port, struct ib_gid_table *table) { int i; if (!table) return; mutex_lock(&table->lock); for (i = 0; i < table->sz; ++i) { if (is_gid_entry_valid(table->data_vec[i])) del_gid(ib_dev, port, table, i); } mutex_unlock(&table->lock); } void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u32 port, struct net_device *ndev, unsigned long gid_type_mask, enum ib_cache_gid_default_mode mode) { union ib_gid gid = { }; struct ib_gid_attr gid_attr; unsigned int gid_type; unsigned long mask; mask = GID_ATTR_FIND_MASK_GID_TYPE | GID_ATTR_FIND_MASK_DEFAULT | GID_ATTR_FIND_MASK_NETDEV; memset(&gid_attr, 0, sizeof(gid_attr)); gid_attr.ndev = ndev; for (gid_type = 0; gid_type < IB_GID_TYPE_SIZE; ++gid_type) { if (1UL << gid_type & ~gid_type_mask) continue; gid_attr.gid_type = gid_type; if (mode == IB_CACHE_GID_DEFAULT_MODE_SET) { make_default_gid(ndev, &gid); __ib_cache_gid_add(ib_dev, port, &gid, &gid_attr, mask, true); } else if (mode == IB_CACHE_GID_DEFAULT_MODE_DELETE) { _ib_cache_gid_del(ib_dev, port, &gid, &gid_attr, mask, true); } } } static void gid_table_reserve_default(struct ib_device *ib_dev, u32 port, struct ib_gid_table *table) { unsigned int i; unsigned long roce_gid_type_mask; unsigned int num_default_gids; roce_gid_type_mask = roce_gid_type_mask_support(ib_dev, port); num_default_gids = hweight_long(roce_gid_type_mask); /* Reserve starting indices for default GIDs */ for (i = 0; i < num_default_gids && i < table->sz; i++) table->default_gid_indices |= BIT(i); } static void gid_table_release_one(struct ib_device *ib_dev) { u32 p; rdma_for_each_port (ib_dev, p) { release_gid_table(ib_dev, ib_dev->port_data[p].cache.gid); ib_dev->port_data[p].cache.gid = NULL; } } static int _gid_table_setup_one(struct ib_device *ib_dev) { struct ib_gid_table *table; u32 rdma_port; rdma_for_each_port (ib_dev, rdma_port) { table = alloc_gid_table( 
ib_dev->port_data[rdma_port].immutable.gid_tbl_len); if (!table) goto rollback_table_setup; gid_table_reserve_default(ib_dev, rdma_port, table); ib_dev->port_data[rdma_port].cache.gid = table; } return 0; rollback_table_setup: gid_table_release_one(ib_dev); return -ENOMEM; } static void gid_table_cleanup_one(struct ib_device *ib_dev) { u32 p; rdma_for_each_port (ib_dev, p) cleanup_gid_table_port(ib_dev, p, ib_dev->port_data[p].cache.gid); } static int gid_table_setup_one(struct ib_device *ib_dev) { int err; err = _gid_table_setup_one(ib_dev); if (err) return err; rdma_roce_rescan_device(ib_dev); return err; } /** * rdma_query_gid - Read the GID content from the GID software cache * @device: Device to query the GID * @port_num: Port number of the device * @index: Index of the GID table entry to read * @gid: Pointer to GID where to store the entry's GID * * rdma_query_gid() only reads the GID entry content for requested device, * port and index. It reads for IB, RoCE and iWarp link layers. It doesn't * hold any reference to the GID table entry in the HCA or software cache. * * Returns 0 on success or appropriate error code. * */ int rdma_query_gid(struct ib_device *device, u32 port_num, int index, union ib_gid *gid) { struct ib_gid_table *table; unsigned long flags; int res; if (!rdma_is_port_valid(device, port_num)) return -EINVAL; table = rdma_gid_table(device, port_num); read_lock_irqsave(&table->rwlock, flags); if (index < 0 || index >= table->sz) { res = -EINVAL; goto done; } if (!is_gid_entry_valid(table->data_vec[index])) { res = -ENOENT; goto done; } memcpy(gid, &table->data_vec[index]->attr.gid, sizeof(*gid)); res = 0; done: read_unlock_irqrestore(&table->rwlock, flags); return res; } EXPORT_SYMBOL(rdma_query_gid); /** * rdma_read_gid_hw_context - Read the HW GID context from GID attribute * @attr: Potinter to the GID attribute * * rdma_read_gid_hw_context() reads the drivers GID HW context corresponding * to the SGID attr. Callers are required to already be holding the reference * to an existing GID entry. * * Returns the HW GID context * */ void *rdma_read_gid_hw_context(const struct ib_gid_attr *attr) { return container_of(attr, struct ib_gid_table_entry, attr)->context; } EXPORT_SYMBOL(rdma_read_gid_hw_context); /** * rdma_find_gid - Returns SGID attributes if the matching GID is found. * @device: The device to query. * @gid: The GID value to search for. * @gid_type: The GID type to search for. * @ndev: In RoCE, the net device of the device. NULL means ignore. * * rdma_find_gid() searches for the specified GID value in the software cache. * * Returns GID attributes if a valid GID is found or returns ERR_PTR for the * error. The caller must invoke rdma_put_gid_attr() to release the reference. 
* */ const struct ib_gid_attr *rdma_find_gid(struct ib_device *device, const union ib_gid *gid, enum ib_gid_type gid_type, struct net_device *ndev) { unsigned long mask = GID_ATTR_FIND_MASK_GID | GID_ATTR_FIND_MASK_GID_TYPE; struct ib_gid_attr gid_attr_val = {.ndev = ndev, .gid_type = gid_type}; u32 p; if (ndev) mask |= GID_ATTR_FIND_MASK_NETDEV; rdma_for_each_port(device, p) { struct ib_gid_table *table; unsigned long flags; int index; table = device->port_data[p].cache.gid; read_lock_irqsave(&table->rwlock, flags); index = find_gid(table, gid, &gid_attr_val, false, mask, NULL); if (index >= 0) { const struct ib_gid_attr *attr; get_gid_entry(table->data_vec[index]); attr = &table->data_vec[index]->attr; read_unlock_irqrestore(&table->rwlock, flags); return attr; } read_unlock_irqrestore(&table->rwlock, flags); } return ERR_PTR(-ENOENT); } EXPORT_SYMBOL(rdma_find_gid); int ib_get_cached_pkey(struct ib_device *device, u32 port_num, int index, u16 *pkey) { struct ib_pkey_cache *cache; unsigned long flags; int ret = 0; if (!rdma_is_port_valid(device, port_num)) return -EINVAL; read_lock_irqsave(&device->cache_lock, flags); cache = device->port_data[port_num].cache.pkey; if (!cache || index < 0 || index >= cache->table_len) ret = -EINVAL; else *pkey = cache->table[index]; read_unlock_irqrestore(&device->cache_lock, flags); return ret; } EXPORT_SYMBOL(ib_get_cached_pkey); void ib_get_cached_subnet_prefix(struct ib_device *device, u32 port_num, u64 *sn_pfx) { unsigned long flags; read_lock_irqsave(&device->cache_lock, flags); *sn_pfx = device->port_data[port_num].cache.subnet_prefix; read_unlock_irqrestore(&device->cache_lock, flags); } EXPORT_SYMBOL(ib_get_cached_subnet_prefix); int ib_find_cached_pkey(struct ib_device *device, u32 port_num, u16 pkey, u16 *index) { struct ib_pkey_cache *cache; unsigned long flags; int i; int ret = -ENOENT; int partial_ix = -1; if (!rdma_is_port_valid(device, port_num)) return -EINVAL; read_lock_irqsave(&device->cache_lock, flags); cache = device->port_data[port_num].cache.pkey; if (!cache) { ret = -EINVAL; goto err; } *index = -1; for (i = 0; i < cache->table_len; ++i) if ((cache->table[i] & 0x7fff) == (pkey & 0x7fff)) { if (cache->table[i] & 0x8000) { *index = i; ret = 0; break; } else { partial_ix = i; } } if (ret && partial_ix >= 0) { *index = partial_ix; ret = 0; } err: read_unlock_irqrestore(&device->cache_lock, flags); return ret; } EXPORT_SYMBOL(ib_find_cached_pkey); int ib_find_exact_cached_pkey(struct ib_device *device, u32 port_num, u16 pkey, u16 *index) { struct ib_pkey_cache *cache; unsigned long flags; int i; int ret = -ENOENT; if (!rdma_is_port_valid(device, port_num)) return -EINVAL; read_lock_irqsave(&device->cache_lock, flags); cache = device->port_data[port_num].cache.pkey; if (!cache) { ret = -EINVAL; goto err; } *index = -1; for (i = 0; i < cache->table_len; ++i) if (cache->table[i] == pkey) { *index = i; ret = 0; break; } err: read_unlock_irqrestore(&device->cache_lock, flags); return ret; } EXPORT_SYMBOL(ib_find_exact_cached_pkey); int ib_get_cached_lmc(struct ib_device *device, u32 port_num, u8 *lmc) { unsigned long flags; int ret = 0; if (!rdma_is_port_valid(device, port_num)) return -EINVAL; read_lock_irqsave(&device->cache_lock, flags); *lmc = device->port_data[port_num].cache.lmc; read_unlock_irqrestore(&device->cache_lock, flags); return ret; } EXPORT_SYMBOL(ib_get_cached_lmc); int ib_get_cached_port_state(struct ib_device *device, u32 port_num, enum ib_port_state *port_state) { unsigned long flags; int ret = 0; if 
(!rdma_is_port_valid(device, port_num)) return -EINVAL; read_lock_irqsave(&device->cache_lock, flags); *port_state = device->port_data[port_num].cache.port_state; read_unlock_irqrestore(&device->cache_lock, flags); return ret; } EXPORT_SYMBOL(ib_get_cached_port_state); /** * rdma_get_gid_attr - Returns GID attributes for a port of a device * at a requested gid_index, if a valid GID entry exists. * @device: The device to query. * @port_num: The port number on the device where the GID value * is to be queried. * @index: Index of the GID table entry whose attributes are to * be queried. * * rdma_get_gid_attr() acquires reference count of gid attributes from the * cached GID table. Caller must invoke rdma_put_gid_attr() to release * reference to gid attribute regardless of link layer. * * Returns pointer to valid gid attribute or ERR_PTR for the appropriate error * code. */ const struct ib_gid_attr * rdma_get_gid_attr(struct ib_device *device, u32 port_num, int index) { const struct ib_gid_attr *attr = ERR_PTR(-ENODATA); struct ib_gid_table *table; unsigned long flags; if (!rdma_is_port_valid(device, port_num)) return ERR_PTR(-EINVAL); table = rdma_gid_table(device, port_num); if (index < 0 || index >= table->sz) return ERR_PTR(-EINVAL); read_lock_irqsave(&table->rwlock, flags); if (!is_gid_entry_valid(table->data_vec[index])) goto done; get_gid_entry(table->data_vec[index]); attr = &table->data_vec[index]->attr; done: read_unlock_irqrestore(&table->rwlock, flags); return attr; } EXPORT_SYMBOL(rdma_get_gid_attr); /** * rdma_query_gid_table - Reads GID table entries of all the ports of a device up to max_entries. * @device: The device to query. * @entries: Entries where GID entries are returned. * @max_entries: Maximum number of entries that can be returned. * Entries array must be allocated to hold max_entries number of entries. * * Returns number of entries on success or appropriate error code. */ ssize_t rdma_query_gid_table(struct ib_device *device, struct ib_uverbs_gid_entry *entries, size_t max_entries) { const struct ib_gid_attr *gid_attr; ssize_t num_entries = 0, ret; struct ib_gid_table *table; u32 port_num, i; struct net_device *ndev; unsigned long flags; rdma_for_each_port(device, port_num) { table = rdma_gid_table(device, port_num); read_lock_irqsave(&table->rwlock, flags); for (i = 0; i < table->sz; i++) { if (!is_gid_entry_valid(table->data_vec[i])) continue; if (num_entries >= max_entries) { ret = -EINVAL; goto err; } gid_attr = &table->data_vec[i]->attr; memcpy(&entries->gid, &gid_attr->gid, sizeof(gid_attr->gid)); entries->gid_index = gid_attr->index; entries->port_num = gid_attr->port_num; entries->gid_type = gid_attr->gid_type; ndev = rcu_dereference_protected( gid_attr->ndev, lockdep_is_held(&table->rwlock)); if (ndev) entries->netdev_ifindex = ndev->ifindex; num_entries++; entries++; } read_unlock_irqrestore(&table->rwlock, flags); } return num_entries; err: read_unlock_irqrestore(&table->rwlock, flags); return ret; } EXPORT_SYMBOL(rdma_query_gid_table); /** * rdma_put_gid_attr - Release reference to the GID attribute * @attr: Pointer to the GID attribute whose reference * needs to be released. * * rdma_put_gid_attr() must be used to release reference whose * reference is acquired using rdma_get_gid_attr() or any APIs * which returns a pointer to the ib_gid_attr regardless of link layer * of IB or RoCE. 
* */ void rdma_put_gid_attr(const struct ib_gid_attr *attr) { struct ib_gid_table_entry *entry = container_of(attr, struct ib_gid_table_entry, attr); put_gid_entry(entry); } EXPORT_SYMBOL(rdma_put_gid_attr); /** * rdma_hold_gid_attr - Get reference to existing GID attribute * * @attr: Pointer to the GID attribute whose reference * needs to be taken. * * Increase the reference count to a GID attribute to keep it from being * freed. Callers are required to already be holding a reference to attribute. * */ void rdma_hold_gid_attr(const struct ib_gid_attr *attr) { struct ib_gid_table_entry *entry = container_of(attr, struct ib_gid_table_entry, attr); get_gid_entry(entry); } EXPORT_SYMBOL(rdma_hold_gid_attr); /** * rdma_read_gid_attr_ndev_rcu - Read GID attribute netdevice * which must be in UP state. * * @attr:Pointer to the GID attribute * * Returns pointer to netdevice if the netdevice was attached to GID and * netdevice is in UP state. Caller must hold RCU lock as this API * reads the netdev flags which can change while netdevice migrates to * different net namespace. Returns ERR_PTR with error code otherwise. * */ struct net_device *rdma_read_gid_attr_ndev_rcu(const struct ib_gid_attr *attr) { struct ib_gid_table_entry *entry = container_of(attr, struct ib_gid_table_entry, attr); struct ib_device *device = entry->attr.device; struct net_device *ndev = ERR_PTR(-EINVAL); u32 port_num = entry->attr.port_num; struct ib_gid_table *table; unsigned long flags; bool valid; table = rdma_gid_table(device, port_num); read_lock_irqsave(&table->rwlock, flags); valid = is_gid_entry_valid(table->data_vec[attr->index]); if (valid) { ndev = rcu_dereference(attr->ndev); if (!ndev) ndev = ERR_PTR(-ENODEV); } read_unlock_irqrestore(&table->rwlock, flags); return ndev; } EXPORT_SYMBOL(rdma_read_gid_attr_ndev_rcu); static int get_lower_dev_vlan(struct net_device *lower_dev, struct netdev_nested_priv *priv) { u16 *vlan_id = (u16 *)priv->data; if (is_vlan_dev(lower_dev)) *vlan_id = vlan_dev_vlan_id(lower_dev); /* We are interested only in first level vlan device, so * always return 1 to stop iterating over next level devices. */ return 1; } /** * rdma_read_gid_l2_fields - Read the vlan ID and source MAC address * of a GID entry. * * @attr: GID attribute pointer whose L2 fields to be read * @vlan_id: Pointer to vlan id to fill up if the GID entry has * vlan id. It is optional. * @smac: Pointer to smac to fill up for a GID entry. It is optional. * * rdma_read_gid_l2_fields() returns 0 on success and returns vlan id * (if gid entry has vlan) and source MAC, or returns error. */ int rdma_read_gid_l2_fields(const struct ib_gid_attr *attr, u16 *vlan_id, u8 *smac) { struct netdev_nested_priv priv = { .data = (void *)vlan_id, }; struct net_device *ndev; rcu_read_lock(); ndev = rcu_dereference(attr->ndev); if (!ndev) { rcu_read_unlock(); return -ENODEV; } if (smac) ether_addr_copy(smac, ndev->dev_addr); if (vlan_id) { *vlan_id = 0xffff; if (is_vlan_dev(ndev)) { *vlan_id = vlan_dev_vlan_id(ndev); } else { /* If the netdev is upper device and if it's lower * device is vlan device, consider vlan id of * the lower vlan device for this gid entry. 
*/ netdev_walk_all_lower_dev_rcu(attr->ndev, get_lower_dev_vlan, &priv); } } rcu_read_unlock(); return 0; } EXPORT_SYMBOL(rdma_read_gid_l2_fields); static int config_non_roce_gid_cache(struct ib_device *device, u32 port, struct ib_port_attr *tprops) { struct ib_gid_attr gid_attr = {}; struct ib_gid_table *table; int ret = 0; int i; gid_attr.device = device; gid_attr.port_num = port; table = rdma_gid_table(device, port); mutex_lock(&table->lock); for (i = 0; i < tprops->gid_tbl_len; ++i) { if (!device->ops.query_gid) continue; ret = device->ops.query_gid(device, port, i, &gid_attr.gid); if (ret) { dev_warn(&device->dev, "query_gid failed (%d) for index %d\n", ret, i); goto err; } if (rdma_protocol_iwarp(device, port)) { struct net_device *ndev; ndev = ib_device_get_netdev(device, port); if (!ndev) continue; RCU_INIT_POINTER(gid_attr.ndev, ndev); dev_put(ndev); } gid_attr.index = i; tprops->subnet_prefix = be64_to_cpu(gid_attr.gid.global.subnet_prefix); add_modify_gid(table, &gid_attr); } err: mutex_unlock(&table->lock); return ret; } static int ib_cache_update(struct ib_device *device, u32 port, bool update_gids, bool update_pkeys, bool enforce_security) { struct ib_port_attr *tprops = NULL; struct ib_pkey_cache *pkey_cache = NULL; struct ib_pkey_cache *old_pkey_cache = NULL; int i; int ret; if (!rdma_is_port_valid(device, port)) return -EINVAL; tprops = kmalloc(sizeof *tprops, GFP_KERNEL); if (!tprops) return -ENOMEM; ret = ib_query_port(device, port, tprops); if (ret) { dev_warn(&device->dev, "ib_query_port failed (%d)\n", ret); goto err; } if (!rdma_protocol_roce(device, port) && update_gids) { ret = config_non_roce_gid_cache(device, port, tprops); if (ret) goto err; } update_pkeys &= !!tprops->pkey_tbl_len; if (update_pkeys) { pkey_cache = kmalloc(struct_size(pkey_cache, table, tprops->pkey_tbl_len), GFP_KERNEL); if (!pkey_cache) { ret = -ENOMEM; goto err; } pkey_cache->table_len = tprops->pkey_tbl_len; for (i = 0; i < pkey_cache->table_len; ++i) { ret = ib_query_pkey(device, port, i, pkey_cache->table + i); if (ret) { dev_warn(&device->dev, "ib_query_pkey failed (%d) for index %d\n", ret, i); goto err; } } } write_lock_irq(&device->cache_lock); if (update_pkeys) { old_pkey_cache = device->port_data[port].cache.pkey; device->port_data[port].cache.pkey = pkey_cache; } device->port_data[port].cache.lmc = tprops->lmc; device->port_data[port].cache.port_state = tprops->state; device->port_data[port].cache.subnet_prefix = tprops->subnet_prefix; write_unlock_irq(&device->cache_lock); if (enforce_security) ib_security_cache_change(device, port, tprops->subnet_prefix); kfree(old_pkey_cache); kfree(tprops); return 0; err: kfree(pkey_cache); kfree(tprops); return ret; } static void ib_cache_event_task(struct work_struct *_work) { struct ib_update_work *work = container_of(_work, struct ib_update_work, work); int ret; /* Before distributing the cache update event, first sync * the cache. */ ret = ib_cache_update(work->event.device, work->event.element.port_num, work->event.event == IB_EVENT_GID_CHANGE, work->event.event == IB_EVENT_PKEY_CHANGE, work->enforce_security); /* GID event is notified already for individual GID entries by * dispatch_gid_change_event(). Hence, notifiy for rest of the * events. 
*/ if (!ret && work->event.event != IB_EVENT_GID_CHANGE) ib_dispatch_event_clients(&work->event); kfree(work); } static void ib_generic_event_task(struct work_struct *_work) { struct ib_update_work *work = container_of(_work, struct ib_update_work, work); ib_dispatch_event_clients(&work->event); kfree(work); } static bool is_cache_update_event(const struct ib_event *event) { return (event->event == IB_EVENT_PORT_ERR || event->event == IB_EVENT_PORT_ACTIVE || event->event == IB_EVENT_LID_CHANGE || event->event == IB_EVENT_PKEY_CHANGE || event->event == IB_EVENT_CLIENT_REREGISTER || event->event == IB_EVENT_GID_CHANGE); } /** * ib_dispatch_event - Dispatch an asynchronous event * @event:Event to dispatch * * Low-level drivers must call ib_dispatch_event() to dispatch the * event to all registered event handlers when an asynchronous event * occurs. */ void ib_dispatch_event(const struct ib_event *event) { struct ib_update_work *work; work = kzalloc(sizeof(*work), GFP_ATOMIC); if (!work) return; if (is_cache_update_event(event)) INIT_WORK(&work->work, ib_cache_event_task); else INIT_WORK(&work->work, ib_generic_event_task); work->event = *event; if (event->event == IB_EVENT_PKEY_CHANGE || event->event == IB_EVENT_GID_CHANGE) work->enforce_security = true; queue_work(ib_wq, &work->work); } EXPORT_SYMBOL(ib_dispatch_event); int ib_cache_setup_one(struct ib_device *device) { u32 p; int err; err = gid_table_setup_one(device); if (err) return err; rdma_for_each_port (device, p) { err = ib_cache_update(device, p, true, true, true); if (err) { gid_table_cleanup_one(device); return err; } } return 0; } void ib_cache_release_one(struct ib_device *device) { u32 p; /* * The release function frees all the cache elements. * This function should be called as part of freeing * all the device's resources when the cache could no * longer be accessed. */ rdma_for_each_port (device, p) kfree(device->port_data[p].cache.pkey); gid_table_release_one(device); } void ib_cache_cleanup_one(struct ib_device *device) { /* The cleanup function waits for all in-progress workqueue * elements and cleans up the GID cache. This function should be * called after the device was removed from the devices list and * all clients were removed, so the cache exists but is * non-functional and shouldn't be updated anymore. */ flush_workqueue(ib_wq); gid_table_cleanup_one(device); /* * Flush the wq second time for any pending GID delete work. */ flush_workqueue(ib_wq); }
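The GID cache routines above form a reference-counted lookup API: rdma_find_gid() returns an ib_gid_attr whose reference must eventually be dropped with rdma_put_gid_attr(), and rdma_read_gid_l2_fields() may be called while that reference is held to recover the VLAN id and source MAC. The sketch below is illustrative only and is not part of the file above; the function example_resolve_roce_gid() and its parameters are hypothetical names chosen for the example, and the include lines follow the usual rdma/ header layout.

/*
 * Minimal usage sketch (assumptions noted in the lead-in): look up a RoCEv2
 * GID in the software cache, read its L2 fields, and release the reference.
 */
#include <linux/err.h>
#include <rdma/ib_verbs.h>
#include <rdma/ib_cache.h>

static int example_resolve_roce_gid(struct ib_device *device,
				    const union ib_gid *gid,
				    struct net_device *ndev,
				    u16 *vlan_id, u8 *smac)
{
	const struct ib_gid_attr *attr;
	int ret;

	/* Search all ports of the device for a matching RoCEv2 GID entry. */
	attr = rdma_find_gid(device, gid, IB_GID_TYPE_ROCE_UDP_ENCAP, ndev);
	if (IS_ERR(attr))
		return PTR_ERR(attr);

	/* Read VLAN id and source MAC from the entry's netdevice, if any. */
	ret = rdma_read_gid_l2_fields(attr, vlan_id, smac);

	/* The attribute is reference counted; always release it. */
	rdma_put_gid_attr(attr);
	return ret;
}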
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_NETLINK_H #define __LINUX_NETLINK_H #include <linux/capability.h> #include <linux/skbuff.h> #include <linux/export.h> #include <net/scm.h> #include <uapi/linux/netlink.h> struct net; void do_trace_netlink_extack(const char *msg); static inline struct nlmsghdr *nlmsg_hdr(const struct sk_buff *skb) { return (struct nlmsghdr *)skb->data; } enum netlink_skb_flags { NETLINK_SKB_DST = 0x8, /* Dst set in sendto or sendmsg */ }; struct netlink_skb_parms { struct scm_creds creds; /* Skb credentials */ __u32 portid; __u32 dst_group; __u32 flags; struct sock *sk; bool nsid_is_set; int nsid; }; #define NETLINK_CB(skb) (*(struct netlink_skb_parms*)&((skb)->cb)) #define NETLINK_CREDS(skb) (&NETLINK_CB((skb)).creds) #define NETLINK_CTX_SIZE 48 void netlink_table_grab(void); void netlink_table_ungrab(void); #define NL_CFG_F_NONROOT_RECV (1 << 0) #define NL_CFG_F_NONROOT_SEND (1 << 1) /* optional Netlink kernel configuration parameters */ struct netlink_kernel_cfg { unsigned int groups; unsigned int flags; void (*input)(struct sk_buff *skb); int (*bind)(struct net *net, int group); void (*unbind)(struct net *net, int group); void (*release) (struct sock *sk, unsigned long *groups); }; struct sock *__netlink_kernel_create(struct net *net, int unit, struct module *module, struct netlink_kernel_cfg *cfg); static inline struct sock * netlink_kernel_create(struct net *net, int unit, struct netlink_kernel_cfg *cfg) { return __netlink_kernel_create(net, unit, THIS_MODULE, cfg); } /* this can be increased when necessary - don't expose to userland */ #define NETLINK_MAX_COOKIE_LEN 20 #define NETLINK_MAX_FMTMSG_LEN 80 /** * struct netlink_ext_ack - netlink extended ACK report struct * @_msg: message string to report - don't access directly, use * %NL_SET_ERR_MSG * @bad_attr: attribute with error * @policy: policy for a bad attribute * @miss_type: attribute type which was missing * @miss_nest: nest missing an attribute (%NULL if missing top level attr) * @cookie: cookie data to return to userspace (for success) * @cookie_len: actual cookie data length * @_msg_buf: output buffer for formatted message strings - don't access * directly, use
%NL_SET_ERR_MSG_FMT */ struct netlink_ext_ack { const char *_msg; const struct nlattr *bad_attr; const struct nla_policy *policy; const struct nlattr *miss_nest; u16 miss_type; u8 cookie[NETLINK_MAX_COOKIE_LEN]; u8 cookie_len; char _msg_buf[NETLINK_MAX_FMTMSG_LEN]; }; /* Always use this macro, this allows later putting the * message into a separate section or such for things * like translation or listing all possible messages. * If string formatting is needed use NL_SET_ERR_MSG_FMT. */ #define NL_SET_ERR_MSG(extack, msg) do { \ static const char __msg[] = msg; \ struct netlink_ext_ack *__extack = (extack); \ \ do_trace_netlink_extack(__msg); \ \ if (__extack) \ __extack->_msg = __msg; \ } while (0) /* We splice fmt with %s at each end even in the snprintf so that both calls * can use the same string constant, avoiding its duplication in .ro */ #define NL_SET_ERR_MSG_FMT(extack, fmt, args...) do { \ struct netlink_ext_ack *__extack = (extack); \ \ if (!__extack) \ break; \ if (snprintf(__extack->_msg_buf, NETLINK_MAX_FMTMSG_LEN, \ "%s" fmt "%s", "", ##args, "") >= \ NETLINK_MAX_FMTMSG_LEN) \ net_warn_ratelimited("%s" fmt "%s", "truncated extack: ", \ ##args, "\n"); \ \ do_trace_netlink_extack(__extack->_msg_buf); \ \ __extack->_msg = __extack->_msg_buf; \ } while (0) #define NL_SET_ERR_MSG_MOD(extack, msg) \ NL_SET_ERR_MSG((extack), KBUILD_MODNAME ": " msg) #define NL_SET_ERR_MSG_FMT_MOD(extack, fmt, args...) \ NL_SET_ERR_MSG_FMT((extack), KBUILD_MODNAME ": " fmt, ##args) #define NL_SET_ERR_MSG_WEAK(extack, msg) do { \ if ((extack) && !(extack)->_msg) \ NL_SET_ERR_MSG((extack), msg); \ } while (0) #define NL_SET_ERR_MSG_WEAK_MOD(extack, msg) do { \ if ((extack) && !(extack)->_msg) \ NL_SET_ERR_MSG_MOD((extack), msg); \ } while (0) #define NL_SET_BAD_ATTR_POLICY(extack, attr, pol) do { \ if ((extack)) { \ (extack)->bad_attr = (attr); \ (extack)->policy = (pol); \ } \ } while (0) #define NL_SET_BAD_ATTR(extack, attr) NL_SET_BAD_ATTR_POLICY(extack, attr, NULL) #define NL_SET_ERR_MSG_ATTR_POL(extack, attr, pol, msg) do { \ static const char __msg[] = msg; \ struct netlink_ext_ack *__extack = (extack); \ \ do_trace_netlink_extack(__msg); \ \ if (__extack) { \ __extack->_msg = __msg; \ __extack->bad_attr = (attr); \ __extack->policy = (pol); \ } \ } while (0) #define NL_SET_ERR_MSG_ATTR_POL_FMT(extack, attr, pol, fmt, args...) do { \ struct netlink_ext_ack *__extack = (extack); \ \ if (!__extack) \ break; \ \ if (snprintf(__extack->_msg_buf, NETLINK_MAX_FMTMSG_LEN, \ "%s" fmt "%s", "", ##args, "") >= \ NETLINK_MAX_FMTMSG_LEN) \ net_warn_ratelimited("%s" fmt "%s", "truncated extack: ", \ ##args, "\n"); \ \ do_trace_netlink_extack(__extack->_msg_buf); \ \ __extack->_msg = __extack->_msg_buf; \ __extack->bad_attr = (attr); \ __extack->policy = (pol); \ } while (0) #define NL_SET_ERR_MSG_ATTR(extack, attr, msg) \ NL_SET_ERR_MSG_ATTR_POL(extack, attr, NULL, msg) #define NL_SET_ERR_MSG_ATTR_FMT(extack, attr, msg, args...) 
\ NL_SET_ERR_MSG_ATTR_POL_FMT(extack, attr, NULL, msg, ##args) #define NL_SET_ERR_ATTR_MISS(extack, nest, type) do { \ struct netlink_ext_ack *__extack = (extack); \ \ if (__extack) { \ __extack->miss_nest = (nest); \ __extack->miss_type = (type); \ } \ } while (0) #define NL_REQ_ATTR_CHECK(extack, nest, tb, type) ({ \ struct nlattr **__tb = (tb); \ u32 __attr = (type); \ int __retval; \ \ __retval = !__tb[__attr]; \ if (__retval) \ NL_SET_ERR_ATTR_MISS((extack), (nest), __attr); \ __retval; \ }) static inline void nl_set_extack_cookie_u64(struct netlink_ext_ack *extack, u64 cookie) { if (!extack) return; memcpy(extack->cookie, &cookie, sizeof(cookie)); extack->cookie_len = sizeof(cookie); } void netlink_kernel_release(struct sock *sk); int __netlink_change_ngroups(struct sock *sk, unsigned int groups); int netlink_change_ngroups(struct sock *sk, unsigned int groups); void __netlink_clear_multicast_users(struct sock *sk, unsigned int group); void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err, const struct netlink_ext_ack *extack); int netlink_has_listeners(struct sock *sk, unsigned int group); bool netlink_strict_get_check(struct sk_buff *skb); int netlink_unicast(struct sock *ssk, struct sk_buff *skb, __u32 portid, int nonblock); int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, __u32 portid, __u32 group, gfp_t allocation); typedef int (*netlink_filter_fn)(struct sock *dsk, struct sk_buff *skb, void *data); int netlink_broadcast_filtered(struct sock *ssk, struct sk_buff *skb, __u32 portid, __u32 group, gfp_t allocation, netlink_filter_fn filter, void *filter_data); int netlink_set_err(struct sock *ssk, __u32 portid, __u32 group, int code); int netlink_register_notifier(struct notifier_block *nb); int netlink_unregister_notifier(struct notifier_block *nb); /* finegrained unicast helpers: */ struct sock *netlink_getsockbyfd(int fd); int netlink_attachskb(struct sock *sk, struct sk_buff *skb, long *timeo, struct sock *ssk); void netlink_detachskb(struct sock *sk, struct sk_buff *skb); int netlink_sendskb(struct sock *sk, struct sk_buff *skb); static inline struct sk_buff * netlink_skb_clone(struct sk_buff *skb, gfp_t gfp_mask) { struct sk_buff *nskb; nskb = skb_clone(skb, gfp_mask); if (!nskb) return NULL; /* This is a large skb, set destructor callback to release head */ if (is_vmalloc_addr(skb->head)) nskb->destructor = skb->destructor; return nskb; } /* * skb should fit one page. This choice is good for headerless malloc. * But we should limit to 8K so that userspace does not have to * use enormous buffer sizes on recvmsg() calls just to avoid * MSG_TRUNC when PAGE_SIZE is very large. */ #if PAGE_SIZE < 8192UL #define NLMSG_GOODSIZE SKB_WITH_OVERHEAD(PAGE_SIZE) #else #define NLMSG_GOODSIZE SKB_WITH_OVERHEAD(8192UL) #endif #define NLMSG_DEFAULT_SIZE (NLMSG_GOODSIZE - NLMSG_HDRLEN) struct netlink_callback { struct sk_buff *skb; const struct nlmsghdr *nlh; int (*dump)(struct sk_buff * skb, struct netlink_callback *cb); int (*done)(struct netlink_callback *cb); void *data; /* the module that dump function belong to */ struct module *module; struct netlink_ext_ack *extack; u16 family; u16 answer_flags; u32 min_dump_alloc; unsigned int prev_seq, seq; int flags; bool strict_check; union { u8 ctx[NETLINK_CTX_SIZE]; /* args is deprecated. Cast a struct over ctx instead * for proper type safety. 
*/ long args[6]; }; }; #define NL_ASSERT_CTX_FITS(type_name) \ BUILD_BUG_ON(sizeof(type_name) > \ sizeof_field(struct netlink_callback, ctx)) struct netlink_notify { struct net *net; u32 portid; int protocol; }; struct nlmsghdr * __nlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, int type, int len, int flags); struct netlink_dump_control { int (*start)(struct netlink_callback *); int (*dump)(struct sk_buff *skb, struct netlink_callback *); int (*done)(struct netlink_callback *); struct netlink_ext_ack *extack; void *data; struct module *module; u32 min_dump_alloc; int flags; }; int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb, const struct nlmsghdr *nlh, struct netlink_dump_control *control); static inline int netlink_dump_start(struct sock *ssk, struct sk_buff *skb, const struct nlmsghdr *nlh, struct netlink_dump_control *control) { if (!control->module) control->module = THIS_MODULE; return __netlink_dump_start(ssk, skb, nlh, control); } struct netlink_tap { struct net_device *dev; struct module *module; struct list_head list; }; int netlink_add_tap(struct netlink_tap *nt); int netlink_remove_tap(struct netlink_tap *nt); bool __netlink_ns_capable(const struct netlink_skb_parms *nsp, struct user_namespace *ns, int cap); bool netlink_ns_capable(const struct sk_buff *skb, struct user_namespace *ns, int cap); bool netlink_capable(const struct sk_buff *skb, int cap); bool netlink_net_capable(const struct sk_buff *skb, int cap); struct sk_buff *netlink_alloc_large_skb(unsigned int size, int broadcast); #endif /* __LINUX_NETLINK_H */
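The extack helpers declared above are intended to be used from request handlers: NL_REQ_ATTR_CHECK() records a missing attribute, NL_SET_BAD_ATTR() and NL_SET_ERR_MSG_FMT() point userspace at a bad attribute with a formatted message, and nl_set_extack_cookie_u64() hands a cookie back on success. The sketch below is illustrative only and not part of the header above; demo_validate_port() and the DEMO_ATTR_* ids are hypothetical names, not attributes of any real netlink family.

/*
 * Minimal usage sketch (assumptions noted in the lead-in): validate one
 * required u32 attribute and report failures through the extended ACK.
 */
#include <linux/errno.h>
#include <linux/netlink.h>
#include <net/netlink.h>

/* Hypothetical attribute ids for this sketch only. */
enum {
	DEMO_ATTR_UNSPEC,
	DEMO_ATTR_PORT,
	__DEMO_ATTR_MAX,
};

static int demo_validate_port(struct nlattr **tb, struct netlink_ext_ack *extack)
{
	u32 port;

	/* Records the missing attribute in extack and evaluates non-zero. */
	if (NL_REQ_ATTR_CHECK(extack, NULL, tb, DEMO_ATTR_PORT))
		return -EINVAL;

	port = nla_get_u32(tb[DEMO_ATTR_PORT]);
	if (port == 0 || port > 65535) {
		/* Point userspace at the offending attribute and explain why. */
		NL_SET_BAD_ATTR(extack, tb[DEMO_ATTR_PORT]);
		NL_SET_ERR_MSG_FMT(extack, "port %u out of range", port);
		return -ERANGE;
	}

	/* Optionally return a 64-bit cookie to userspace on success. */
	nl_set_extack_cookie_u64(extack, (u64)port);
	return 0;
}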
/* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Definitions for the TCP module. * * Version: @(#)tcp.h 1.0.5 05/23/93 * * Authors: Ross Biro * Fred N.
van Kempen, <waltje@uWalt.NL.Mugnet.ORG> */ #ifndef _TCP_H #define _TCP_H #define FASTRETRANS_DEBUG 1 #include <linux/list.h> #include <linux/tcp.h> #include <linux/bug.h> #include <linux/slab.h> #include <linux/cache.h> #include <linux/percpu.h> #include <linux/skbuff.h> #include <linux/kref.h> #include <linux/ktime.h> #include <linux/indirect_call_wrapper.h> #include <net/inet_connection_sock.h> #include <net/inet_timewait_sock.h> #include <net/inet_hashtables.h> #include <net/checksum.h> #include <net/request_sock.h> #include <net/sock_reuseport.h> #include <net/sock.h> #include <net/snmp.h> #include <net/ip.h> #include <net/tcp_states.h> #include <net/tcp_ao.h> #include <net/inet_ecn.h> #include <net/dst.h> #include <net/mptcp.h> #include <linux/seq_file.h> #include <linux/memcontrol.h> #include <linux/bpf-cgroup.h> #include <linux/siphash.h> extern struct inet_hashinfo tcp_hashinfo; DECLARE_PER_CPU(unsigned int, tcp_orphan_count); int tcp_orphan_count_sum(void); DECLARE_PER_CPU(u32, tcp_tw_isn); void tcp_time_wait(struct sock *sk, int state, int timeo); #define MAX_TCP_HEADER L1_CACHE_ALIGN(128 + MAX_HEADER) #define MAX_TCP_OPTION_SPACE 40 #define TCP_MIN_SND_MSS 48 #define TCP_MIN_GSO_SIZE (TCP_MIN_SND_MSS - MAX_TCP_OPTION_SPACE) /* * Never offer a window over 32767 without using window scaling. Some * poor stacks do signed 16bit maths! */ #define MAX_TCP_WINDOW 32767U /* Minimal accepted MSS. It is (60+60+8) - (20+20). */ #define TCP_MIN_MSS 88U /* The initial MTU to use for probing */ #define TCP_BASE_MSS 1024 /* probing interval, default to 10 minutes as per RFC4821 */ #define TCP_PROBE_INTERVAL 600 /* Specify interval when tcp mtu probing will stop */ #define TCP_PROBE_THRESHOLD 8 /* After receiving this amount of duplicate ACKs fast retransmit starts. */ #define TCP_FASTRETRANS_THRESH 3 /* Maximal number of ACKs sent quickly to accelerate slow-start. */ #define TCP_MAX_QUICKACKS 16U /* Maximal number of window scale according to RFC1323 */ #define TCP_MAX_WSCALE 14U /* urg_data states */ #define TCP_URG_VALID 0x0100 #define TCP_URG_NOTYET 0x0200 #define TCP_URG_READ 0x0400 #define TCP_RETR1 3 /* * This is how many retries it does before it * tries to figure out if the gateway is * down. Minimal RFC value is 3; it corresponds * to ~3sec-8min depending on RTO. */ #define TCP_RETR2 15 /* * This should take at least * 90 minutes to time out. * RFC1122 says that the limit is 100 sec. * 15 is ~13-30min depending on RTO. */ #define TCP_SYN_RETRIES 6 /* This is how many retries are done * when active opening a connection. * RFC1122 says the minimum retry MUST * be at least 180secs. Nevertheless * this value is corresponding to * 63secs of retransmission with the * current initial RTO. */ #define TCP_SYNACK_RETRIES 5 /* This is how may retries are done * when passive opening a connection. * This is corresponding to 31secs of * retransmission with the current * initial RTO. */ #define TCP_TIMEWAIT_LEN (60*HZ) /* how long to wait to destroy TIME-WAIT * state, about 60 seconds */ #define TCP_FIN_TIMEOUT TCP_TIMEWAIT_LEN /* BSD style FIN_WAIT2 deadlock breaker. * It used to be 3min, new value is 60sec, * to combine FIN-WAIT-2 timeout with * TIME-WAIT timer. 
*/ #define TCP_FIN_TIMEOUT_MAX (120 * HZ) /* max TCP_LINGER2 value (two minutes) */ #define TCP_DELACK_MAX ((unsigned)(HZ/5)) /* maximal time to delay before sending an ACK */ static_assert((1 << ATO_BITS) > TCP_DELACK_MAX); #if HZ >= 100 #define TCP_DELACK_MIN ((unsigned)(HZ/25)) /* minimal time to delay before sending an ACK */ #define TCP_ATO_MIN ((unsigned)(HZ/25)) #else #define TCP_DELACK_MIN 4U #define TCP_ATO_MIN 4U #endif #define TCP_RTO_MAX ((unsigned)(120*HZ)) #define TCP_RTO_MIN ((unsigned)(HZ/5)) #define TCP_TIMEOUT_MIN (2U) /* Min timeout for TCP timers in jiffies */ #define TCP_TIMEOUT_MIN_US (2*USEC_PER_MSEC) /* Min TCP timeout in microsecs */ #define TCP_TIMEOUT_INIT ((unsigned)(1*HZ)) /* RFC6298 2.1 initial RTO value */ #define TCP_TIMEOUT_FALLBACK ((unsigned)(3*HZ)) /* RFC 1122 initial RTO value, now * used as a fallback RTO for the * initial data transmission if no * valid RTT sample has been acquired, * most likely due to retrans in 3WHS. */ #define TCP_RESOURCE_PROBE_INTERVAL ((unsigned)(HZ/2U)) /* Maximal interval between probes * for local resources. */ #define TCP_KEEPALIVE_TIME (120*60*HZ) /* two hours */ #define TCP_KEEPALIVE_PROBES 9 /* Max of 9 keepalive probes */ #define TCP_KEEPALIVE_INTVL (75*HZ) #define MAX_TCP_KEEPIDLE 32767 #define MAX_TCP_KEEPINTVL 32767 #define MAX_TCP_KEEPCNT 127 #define MAX_TCP_SYNCNT 127 /* Ensure that TCP PAWS checks are relaxed after ~2147 seconds * to avoid overflows. This assumes a clock smaller than 1 Mhz. * Default clock is 1 Khz, tcp_usec_ts uses 1 Mhz. */ #define TCP_PAWS_WRAP (INT_MAX / USEC_PER_SEC) #define TCP_PAWS_MSL 60 /* Per-host timestamps are invalidated * after this time. It should be equal * (or greater than) TCP_TIMEWAIT_LEN * to provide reliability equal to one * provided by timewait state. */ #define TCP_PAWS_WINDOW 1 /* Replay window for per-host * timestamps. It must be less than * minimal timewait lifetime. */ /* * TCP option */ #define TCPOPT_NOP 1 /* Padding */ #define TCPOPT_EOL 0 /* End of options */ #define TCPOPT_MSS 2 /* Segment size negotiating */ #define TCPOPT_WINDOW 3 /* Window scaling */ #define TCPOPT_SACK_PERM 4 /* SACK Permitted */ #define TCPOPT_SACK 5 /* SACK Block */ #define TCPOPT_TIMESTAMP 8 /* Better RTT estimations/PAWS */ #define TCPOPT_MD5SIG 19 /* MD5 Signature (RFC2385) */ #define TCPOPT_AO 29 /* Authentication Option (RFC5925) */ #define TCPOPT_MPTCP 30 /* Multipath TCP (RFC6824) */ #define TCPOPT_FASTOPEN 34 /* Fast open (RFC7413) */ #define TCPOPT_EXP 254 /* Experimental */ /* Magic number to be after the option value for sharing TCP * experimental options. See draft-ietf-tcpm-experimental-options-00.txt */ #define TCPOPT_FASTOPEN_MAGIC 0xF989 #define TCPOPT_SMC_MAGIC 0xE2D4C3D9 /* * TCP option lengths */ #define TCPOLEN_MSS 4 #define TCPOLEN_WINDOW 3 #define TCPOLEN_SACK_PERM 2 #define TCPOLEN_TIMESTAMP 10 #define TCPOLEN_MD5SIG 18 #define TCPOLEN_FASTOPEN_BASE 2 #define TCPOLEN_EXP_FASTOPEN_BASE 4 #define TCPOLEN_EXP_SMC_BASE 6 /* But this is what stacks really send out. 
*/ #define TCPOLEN_TSTAMP_ALIGNED 12 #define TCPOLEN_WSCALE_ALIGNED 4 #define TCPOLEN_SACKPERM_ALIGNED 4 #define TCPOLEN_SACK_BASE 2 #define TCPOLEN_SACK_BASE_ALIGNED 4 #define TCPOLEN_SACK_PERBLOCK 8 #define TCPOLEN_MD5SIG_ALIGNED 20 #define TCPOLEN_MSS_ALIGNED 4 #define TCPOLEN_EXP_SMC_BASE_ALIGNED 8 /* Flags in tp->nonagle */ #define TCP_NAGLE_OFF 1 /* Nagle's algo is disabled */ #define TCP_NAGLE_CORK 2 /* Socket is corked */ #define TCP_NAGLE_PUSH 4 /* Cork is overridden for already queued data */ /* TCP thin-stream limits */ #define TCP_THIN_LINEAR_RETRIES 6 /* After 6 linear retries, do exp. backoff */ /* TCP initial congestion window as per rfc6928 */ #define TCP_INIT_CWND 10 /* Bit Flags for sysctl_tcp_fastopen */ #define TFO_CLIENT_ENABLE 1 #define TFO_SERVER_ENABLE 2 #define TFO_CLIENT_NO_COOKIE 4 /* Data in SYN w/o cookie option */ /* Accept SYN data w/o any cookie option */ #define TFO_SERVER_COOKIE_NOT_REQD 0x200 /* Force enable TFO on all listeners, i.e., not requiring the * TCP_FASTOPEN socket option. */ #define TFO_SERVER_WO_SOCKOPT1 0x400 /* sysctl variables for tcp */ extern int sysctl_tcp_max_orphans; extern long sysctl_tcp_mem[3]; #define TCP_RACK_LOSS_DETECTION 0x1 /* Use RACK to detect losses */ #define TCP_RACK_STATIC_REO_WND 0x2 /* Use static RACK reo wnd */ #define TCP_RACK_NO_DUPTHRESH 0x4 /* Do not use DUPACK threshold in RACK */ extern atomic_long_t tcp_memory_allocated; DECLARE_PER_CPU(int, tcp_memory_per_cpu_fw_alloc); extern struct percpu_counter tcp_sockets_allocated; extern unsigned long tcp_memory_pressure; /* optimized version of sk_under_memory_pressure() for TCP sockets */ static inline bool tcp_under_memory_pressure(const struct sock *sk) { if (mem_cgroup_sockets_enabled && sk->sk_memcg && mem_cgroup_under_socket_pressure(sk->sk_memcg)) return true; return READ_ONCE(tcp_memory_pressure); } /* * The next routines deal with comparing 32 bit unsigned ints * and worry about wraparound (automatic with unsigned arithmetic). */ static inline bool before(__u32 seq1, __u32 seq2) { return (__s32)(seq1-seq2) < 0; } #define after(seq2, seq1) before(seq1, seq2) /* is s2<=s1<=s3 ? 
*/ static inline bool between(__u32 seq1, __u32 seq2, __u32 seq3) { return seq3 - seq2 >= seq1 - seq2; } static inline void tcp_wmem_free_skb(struct sock *sk, struct sk_buff *skb) { sk_wmem_queued_add(sk, -skb->truesize); if (!skb_zcopy_pure(skb)) sk_mem_uncharge(sk, skb->truesize); else sk_mem_uncharge(sk, SKB_TRUESIZE(skb_end_offset(skb))); __kfree_skb(skb); } void sk_forced_mem_schedule(struct sock *sk, int size); bool tcp_check_oom(const struct sock *sk, int shift); extern struct proto tcp_prot; #define TCP_INC_STATS(net, field) SNMP_INC_STATS((net)->mib.tcp_statistics, field) #define __TCP_INC_STATS(net, field) __SNMP_INC_STATS((net)->mib.tcp_statistics, field) #define TCP_DEC_STATS(net, field) SNMP_DEC_STATS((net)->mib.tcp_statistics, field) #define TCP_ADD_STATS(net, field, val) SNMP_ADD_STATS((net)->mib.tcp_statistics, field, val) void tcp_tasklet_init(void); int tcp_v4_err(struct sk_buff *skb, u32); void tcp_shutdown(struct sock *sk, int how); int tcp_v4_early_demux(struct sk_buff *skb); int tcp_v4_rcv(struct sk_buff *skb); void tcp_remove_empty_skb(struct sock *sk); int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size); int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied, size_t size, struct ubuf_info *uarg); void tcp_splice_eof(struct socket *sock); int tcp_send_mss(struct sock *sk, int *size_goal, int flags); int tcp_wmem_schedule(struct sock *sk, int copy); void tcp_push(struct sock *sk, int flags, int mss_now, int nonagle, int size_goal); void tcp_release_cb(struct sock *sk); void tcp_wfree(struct sk_buff *skb); void tcp_write_timer_handler(struct sock *sk); void tcp_delack_timer_handler(struct sock *sk); int tcp_ioctl(struct sock *sk, int cmd, int *karg); enum skb_drop_reason tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb); void tcp_rcv_established(struct sock *sk, struct sk_buff *skb); void tcp_rcv_space_adjust(struct sock *sk); int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp); void tcp_twsk_destructor(struct sock *sk); void tcp_twsk_purge(struct list_head *net_exit_list); ssize_t tcp_splice_read(struct socket *sk, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, gfp_t gfp, bool force_schedule); static inline void tcp_dec_quickack_mode(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); if (icsk->icsk_ack.quick) { /* How many ACKs S/ACKing new data have we sent? */ const unsigned int pkts = inet_csk_ack_scheduled(sk) ? 1 : 0; if (pkts >= icsk->icsk_ack.quick) { icsk->icsk_ack.quick = 0; /* Leaving quickack mode we deflate ATO. 
*/ icsk->icsk_ack.ato = TCP_ATO_MIN; } else icsk->icsk_ack.quick -= pkts; } } #define TCP_ECN_OK 1 #define TCP_ECN_QUEUE_CWR 2 #define TCP_ECN_DEMAND_CWR 4 #define TCP_ECN_SEEN 8 enum tcp_tw_status { TCP_TW_SUCCESS = 0, TCP_TW_RST = 1, TCP_TW_ACK = 2, TCP_TW_SYN = 3 }; enum tcp_tw_status tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, const struct tcphdr *th, u32 *tw_isn); struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, struct request_sock *req, bool fastopen, bool *lost_race); enum skb_drop_reason tcp_child_process(struct sock *parent, struct sock *child, struct sk_buff *skb); void tcp_enter_loss(struct sock *sk); void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int newly_lost, int flag); void tcp_clear_retrans(struct tcp_sock *tp); void tcp_update_metrics(struct sock *sk); void tcp_init_metrics(struct sock *sk); void tcp_metrics_init(void); bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst); void __tcp_close(struct sock *sk, long timeout); void tcp_close(struct sock *sk, long timeout); void tcp_init_sock(struct sock *sk); void tcp_init_transfer(struct sock *sk, int bpf_op, struct sk_buff *skb); __poll_t tcp_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait); int do_tcp_getsockopt(struct sock *sk, int level, int optname, sockptr_t optval, sockptr_t optlen); int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); bool tcp_bpf_bypass_getsockopt(int level, int optname); int do_tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); void tcp_set_keepalive(struct sock *sk, int val); void tcp_syn_ack_timeout(const struct request_sock *req); int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len); int tcp_set_rcvlowat(struct sock *sk, int val); int tcp_set_window_clamp(struct sock *sk, int val); void tcp_update_recv_tstamps(struct sk_buff *skb, struct scm_timestamping_internal *tss); void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk, struct scm_timestamping_internal *tss); void tcp_data_ready(struct sock *sk); #ifdef CONFIG_MMU int tcp_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma); #endif void tcp_parse_options(const struct net *net, const struct sk_buff *skb, struct tcp_options_received *opt_rx, int estab, struct tcp_fastopen_cookie *foc); /* * BPF SKB-less helpers */ u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph, struct tcphdr *th, u32 *cookie); u16 tcp_v6_get_syncookie(struct sock *sk, struct ipv6hdr *iph, struct tcphdr *th, u32 *cookie); u16 tcp_parse_mss_option(const struct tcphdr *th, u16 user_mss); u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops, const struct tcp_request_sock_ops *af_ops, struct sock *sk, struct tcphdr *th); /* * TCP v4 functions exported for the inet6 API */ void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb); void tcp_v4_mtu_reduced(struct sock *sk); void tcp_req_err(struct sock *sk, u32 seq, bool abort); void tcp_ld_RTO_revert(struct sock *sk, u32 seq); int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb); struct sock *tcp_create_openreq_child(const struct sock *sk, struct request_sock *req, struct sk_buff *skb); void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst); struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff 
*skb, struct request_sock *req, struct dst_entry *dst, struct request_sock *req_unhash, bool *own_req); int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb); int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); int tcp_connect(struct sock *sk); enum tcp_synack_type { TCP_SYNACK_NORMAL, TCP_SYNACK_FASTOPEN, TCP_SYNACK_COOKIE, }; struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, struct request_sock *req, struct tcp_fastopen_cookie *foc, enum tcp_synack_type synack_type, struct sk_buff *syn_skb); int tcp_disconnect(struct sock *sk, int flags); void tcp_finish_connect(struct sock *sk, struct sk_buff *skb); int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size); void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb); /* From syncookies.c */ struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct dst_entry *dst); int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th); struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb); struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk, struct sk_buff *skb, struct tcp_options_received *tcp_opt, int mss, u32 tsoff); #if IS_ENABLED(CONFIG_BPF) struct bpf_tcp_req_attrs { u32 rcv_tsval; u32 rcv_tsecr; u16 mss; u8 rcv_wscale; u8 snd_wscale; u8 ecn_ok; u8 wscale_ok; u8 sack_ok; u8 tstamp_ok; u8 usec_ts_ok; u8 reserved[3]; }; #endif #ifdef CONFIG_SYN_COOKIES /* Syncookies use a monotonic timer which increments every 60 seconds. * This counter is used both as a hash input and partially encoded into * the cookie value. A cookie is only validated further if the delta * between the current counter value and the encoded one is less than this, * i.e. a sent cookie is valid only at most for 2*60 seconds (or less if * the counter advances immediately after a cookie is generated). */ #define MAX_SYNCOOKIE_AGE 2 #define TCP_SYNCOOKIE_PERIOD (60 * HZ) #define TCP_SYNCOOKIE_VALID (MAX_SYNCOOKIE_AGE * TCP_SYNCOOKIE_PERIOD) /* syncookies: remember time of last synqueue overflow * But do not dirty this field too often (once per second is enough) * It is racy as we do not hold a lock, but race is very minor. */ static inline void tcp_synq_overflow(const struct sock *sk) { unsigned int last_overflow; unsigned int now = jiffies; if (sk->sk_reuseport) { struct sock_reuseport *reuse; reuse = rcu_dereference(sk->sk_reuseport_cb); if (likely(reuse)) { last_overflow = READ_ONCE(reuse->synq_overflow_ts); if (!time_between32(now, last_overflow, last_overflow + HZ)) WRITE_ONCE(reuse->synq_overflow_ts, now); return; } } last_overflow = READ_ONCE(tcp_sk(sk)->rx_opt.ts_recent_stamp); if (!time_between32(now, last_overflow, last_overflow + HZ)) WRITE_ONCE(tcp_sk_rw(sk)->rx_opt.ts_recent_stamp, now); } /* syncookies: no recent synqueue overflow on this listening socket? */ static inline bool tcp_synq_no_recent_overflow(const struct sock *sk) { unsigned int last_overflow; unsigned int now = jiffies; if (sk->sk_reuseport) { struct sock_reuseport *reuse; reuse = rcu_dereference(sk->sk_reuseport_cb); if (likely(reuse)) { last_overflow = READ_ONCE(reuse->synq_overflow_ts); return !time_between32(now, last_overflow - HZ, last_overflow + TCP_SYNCOOKIE_VALID); } } last_overflow = READ_ONCE(tcp_sk(sk)->rx_opt.ts_recent_stamp); /* If last_overflow <= jiffies <= last_overflow + TCP_SYNCOOKIE_VALID, * then we're under synflood. However, we have to use * 'last_overflow - HZ' as lower bound. 
That's because a concurrent * tcp_synq_overflow() could update .ts_recent_stamp after we read * jiffies but before we store .ts_recent_stamp into last_overflow, * which could lead to rejecting a valid syncookie. */ return !time_between32(now, last_overflow - HZ, last_overflow + TCP_SYNCOOKIE_VALID); } static inline u32 tcp_cookie_time(void) { u64 val = get_jiffies_64(); do_div(val, TCP_SYNCOOKIE_PERIOD); return val; } /* Convert one nsec 64bit timestamp to ts (ms or usec resolution) */ static inline u64 tcp_ns_to_ts(bool usec_ts, u64 val) { if (usec_ts) return div_u64(val, NSEC_PER_USEC); return div_u64(val, NSEC_PER_MSEC); } u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th, u16 *mssp); __u32 cookie_v4_init_sequence(const struct sk_buff *skb, __u16 *mss); u64 cookie_init_timestamp(struct request_sock *req, u64 now); bool cookie_timestamp_decode(const struct net *net, struct tcp_options_received *opt); static inline bool cookie_ecn_ok(const struct net *net, const struct dst_entry *dst) { return READ_ONCE(net->ipv4.sysctl_tcp_ecn) || dst_feature(dst, RTAX_FEATURE_ECN); } #if IS_ENABLED(CONFIG_BPF) static inline bool cookie_bpf_ok(struct sk_buff *skb) { return skb->sk; } struct request_sock *cookie_bpf_check(struct sock *sk, struct sk_buff *skb); #else static inline bool cookie_bpf_ok(struct sk_buff *skb) { return false; } static inline struct request_sock *cookie_bpf_check(struct net *net, struct sock *sk, struct sk_buff *skb) { return NULL; } #endif /* From net/ipv6/syncookies.c */ int __cookie_v6_check(const struct ipv6hdr *iph, const struct tcphdr *th); struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb); u32 __cookie_v6_init_sequence(const struct ipv6hdr *iph, const struct tcphdr *th, u16 *mssp); __u32 cookie_v6_init_sequence(const struct sk_buff *skb, __u16 *mss); #endif /* tcp_output.c */ void tcp_skb_entail(struct sock *sk, struct sk_buff *skb); void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb); void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss, int nonagle); int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs); int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs); void tcp_retransmit_timer(struct sock *sk); void tcp_xmit_retransmit_queue(struct sock *); void tcp_simple_retransmit(struct sock *); void tcp_enter_recovery(struct sock *sk, bool ece_ack); int tcp_trim_head(struct sock *, struct sk_buff *, u32); enum tcp_queue { TCP_FRAG_IN_WRITE_QUEUE, TCP_FRAG_IN_RTX_QUEUE, }; int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, struct sk_buff *skb, u32 len, unsigned int mss_now, gfp_t gfp); void tcp_send_probe0(struct sock *); int tcp_write_wakeup(struct sock *, int mib); void tcp_send_fin(struct sock *sk); void tcp_send_active_reset(struct sock *sk, gfp_t priority, enum sk_rst_reason reason); int tcp_send_synack(struct sock *); void tcp_push_one(struct sock *, unsigned int mss_now); void __tcp_send_ack(struct sock *sk, u32 rcv_nxt); void tcp_send_ack(struct sock *sk); void tcp_send_delayed_ack(struct sock *sk); void tcp_send_loss_probe(struct sock *sk); bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto); void tcp_skb_collapse_tstamp(struct sk_buff *skb, const struct sk_buff *next_skb); /* tcp_input.c */ void tcp_rearm_rto(struct sock *sk); void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req); void tcp_done_with_error(struct sock *sk, int err); void tcp_reset(struct sock *sk, struct sk_buff *skb); void tcp_fin(struct sock *sk); void 
tcp_check_space(struct sock *sk); void tcp_sack_compress_send_ack(struct sock *sk); /* tcp_timer.c */ void tcp_init_xmit_timers(struct sock *); static inline void tcp_clear_xmit_timers(struct sock *sk) { if (hrtimer_try_to_cancel(&tcp_sk(sk)->pacing_timer) == 1) __sock_put(sk); if (hrtimer_try_to_cancel(&tcp_sk(sk)->compressed_ack_timer) == 1) __sock_put(sk); inet_csk_clear_xmit_timers(sk); } unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu); unsigned int tcp_current_mss(struct sock *sk); u32 tcp_clamp_probe0_to_user_timeout(const struct sock *sk, u32 when); /* Bound MSS / TSO packet size with the half of the window */ static inline int tcp_bound_to_half_wnd(struct tcp_sock *tp, int pktsize) { int cutoff; /* When peer uses tiny windows, there is no use in packetizing * to sub-MSS pieces for the sake of SWS or making sure there * are enough packets in the pipe for fast recovery. * * On the other hand, for extremely large MSS devices, handling * smaller than MSS windows in this way does make sense. */ if (tp->max_window > TCP_MSS_DEFAULT) cutoff = (tp->max_window >> 1); else cutoff = tp->max_window; if (cutoff && pktsize > cutoff) return max_t(int, cutoff, 68U - tp->tcp_header_len); else return pktsize; } /* tcp.c */ void tcp_get_info(struct sock *, struct tcp_info *); /* Read 'sendfile()'-style from a TCP socket */ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, sk_read_actor_t recv_actor); int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor); struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off); void tcp_read_done(struct sock *sk, size_t len); void tcp_initialize_rcv_mss(struct sock *sk); int tcp_mtu_to_mss(struct sock *sk, int pmtu); int tcp_mss_to_mtu(struct sock *sk, int mss); void tcp_mtup_init(struct sock *sk); static inline void tcp_bound_rto(struct sock *sk) { if (inet_csk(sk)->icsk_rto > TCP_RTO_MAX) inet_csk(sk)->icsk_rto = TCP_RTO_MAX; } static inline u32 __tcp_set_rto(const struct tcp_sock *tp) { return usecs_to_jiffies((tp->srtt_us >> 3) + tp->rttvar_us); } static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd) { /* mptcp hooks are only on the slow path */ if (sk_is_mptcp((struct sock *)tp)) return; tp->pred_flags = htonl((tp->tcp_header_len << 26) | ntohl(TCP_FLAG_ACK) | snd_wnd); } static inline void tcp_fast_path_on(struct tcp_sock *tp) { __tcp_fast_path_on(tp, tp->snd_wnd >> tp->rx_opt.snd_wscale); } static inline void tcp_fast_path_check(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); if (RB_EMPTY_ROOT(&tp->out_of_order_queue) && tp->rcv_wnd && atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf && !tp->urg_data) tcp_fast_path_on(tp); } u32 tcp_delack_max(const struct sock *sk); /* Compute the actual rto_min value */ static inline u32 tcp_rto_min(const struct sock *sk) { const struct dst_entry *dst = __sk_dst_get(sk); u32 rto_min = inet_csk(sk)->icsk_rto_min; if (dst && dst_metric_locked(dst, RTAX_RTO_MIN)) rto_min = dst_metric_rtt(dst, RTAX_RTO_MIN); return rto_min; } static inline u32 tcp_rto_min_us(const struct sock *sk) { return jiffies_to_usecs(tcp_rto_min(sk)); } static inline bool tcp_ca_dst_locked(const struct dst_entry *dst) { return dst_metric_locked(dst, RTAX_CC_ALGO); } /* Minimum RTT in usec. ~0 means not available. */ static inline u32 tcp_min_rtt(const struct tcp_sock *tp) { return minmax_get(&tp->rtt_min); } /* Compute the actual receive window we are currently advertising. * Rcv_nxt can be after the window if our peer push more data * than the offered window. 
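 * Editorial worked example (not part of the original header): with rcv_wup = 1000,
 * rcv_wnd = 500 and rcv_nxt = 1600, the peer has sent 100 bytes beyond the offered
 * window; the raw result below is -100 and is clamped to 0.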
*/ static inline u32 tcp_receive_window(const struct tcp_sock *tp) { s32 win = tp->rcv_wup + tp->rcv_wnd - tp->rcv_nxt; if (win < 0) win = 0; return (u32) win; } /* Choose a new window, without checks for shrinking, and without * scaling applied to the result. The caller does these things * if necessary. This is a "raw" window selection. */ u32 __tcp_select_window(struct sock *sk); void tcp_send_window_probe(struct sock *sk); /* TCP uses 32bit jiffies to save some space. * Note that this is different from tcp_time_stamp, which * historically has been the same until linux-4.13. */ #define tcp_jiffies32 ((u32)jiffies) /* * Deliver a 32bit value for TCP timestamp option (RFC 7323) * It is no longer tied to jiffies, but to 1 ms clock. * Note: double check if you want to use tcp_jiffies32 instead of this. */ #define TCP_TS_HZ 1000 static inline u64 tcp_clock_ns(void) { return ktime_get_ns(); } static inline u64 tcp_clock_us(void) { return div_u64(tcp_clock_ns(), NSEC_PER_USEC); } static inline u64 tcp_clock_ms(void) { return div_u64(tcp_clock_ns(), NSEC_PER_MSEC); } /* TCP Timestamp included in TS option (RFC 1323) can either use ms * or usec resolution. Each socket carries a flag to select one or other * resolution, as the route attribute could change anytime. * Each flow must stick to initial resolution. */ static inline u32 tcp_clock_ts(bool usec_ts) { return usec_ts ? tcp_clock_us() : tcp_clock_ms(); } static inline u32 tcp_time_stamp_ms(const struct tcp_sock *tp) { return div_u64(tp->tcp_mstamp, USEC_PER_MSEC); } static inline u32 tcp_time_stamp_ts(const struct tcp_sock *tp) { if (tp->tcp_usec_ts) return tp->tcp_mstamp; return tcp_time_stamp_ms(tp); } void tcp_mstamp_refresh(struct tcp_sock *tp); static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0) { return max_t(s64, t1 - t0, 0); } /* provide the departure time in us unit */ static inline u64 tcp_skb_timestamp_us(const struct sk_buff *skb) { return div_u64(skb->skb_mstamp_ns, NSEC_PER_USEC); } /* Provide skb TSval in usec or ms unit */ static inline u32 tcp_skb_timestamp_ts(bool usec_ts, const struct sk_buff *skb) { if (usec_ts) return tcp_skb_timestamp_us(skb); return div_u64(skb->skb_mstamp_ns, NSEC_PER_MSEC); } static inline u32 tcp_tw_tsval(const struct tcp_timewait_sock *tcptw) { return tcp_clock_ts(tcptw->tw_sk.tw_usec_ts) + tcptw->tw_ts_offset; } static inline u32 tcp_rsk_tsval(const struct tcp_request_sock *treq) { return tcp_clock_ts(treq->req_usec_ts) + treq->ts_off; } #define tcp_flag_byte(th) (((u_int8_t *)th)[13]) #define TCPHDR_FIN 0x01 #define TCPHDR_SYN 0x02 #define TCPHDR_RST 0x04 #define TCPHDR_PSH 0x08 #define TCPHDR_ACK 0x10 #define TCPHDR_URG 0x20 #define TCPHDR_ECE 0x40 #define TCPHDR_CWR 0x80 #define TCPHDR_SYN_ECN (TCPHDR_SYN | TCPHDR_ECE | TCPHDR_CWR) /* State flags for sacked in struct tcp_skb_cb */ enum tcp_skb_cb_sacked_flags { TCPCB_SACKED_ACKED = (1 << 0), /* SKB ACK'd by a SACK block */ TCPCB_SACKED_RETRANS = (1 << 1), /* SKB retransmitted */ TCPCB_LOST = (1 << 2), /* SKB is lost */ TCPCB_TAGBITS = (TCPCB_SACKED_ACKED | TCPCB_SACKED_RETRANS | TCPCB_LOST), /* All tag bits */ TCPCB_REPAIRED = (1 << 4), /* SKB repaired (no skb_mstamp_ns) */ TCPCB_EVER_RETRANS = (1 << 7), /* Ever retransmitted frame */ TCPCB_RETRANS = (TCPCB_SACKED_RETRANS | TCPCB_EVER_RETRANS | TCPCB_REPAIRED), }; /* This is what the send packet queuing engine uses to pass * TCP per-packet control information to the transmission code. * We also store the host-order sequence numbers in here too. * This is 44 bytes if IPV6 is enabled. 
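 * (Editorial note: skb->cb[] itself is 48 bytes, so only a few spare bytes remain
 * for this control block.)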
* If this grows please adjust skbuff.h:skbuff->cb[xxx] size appropriately. */ struct tcp_skb_cb { __u32 seq; /* Starting sequence number */ __u32 end_seq; /* SEQ + FIN + SYN + datalen */ union { /* Note : * tcp_gso_segs/size are used in write queue only, * cf tcp_skb_pcount()/tcp_skb_mss() */ struct { u16 tcp_gso_segs; u16 tcp_gso_size; }; }; __u8 tcp_flags; /* TCP header flags. (tcp[13]) */ __u8 sacked; /* State flags for SACK. */ __u8 ip_dsfield; /* IPv4 tos or IPv6 dsfield */ __u8 txstamp_ack:1, /* Record TX timestamp for ack? */ eor:1, /* Is skb MSG_EOR marked? */ has_rxtstamp:1, /* SKB has a RX timestamp */ unused:5; __u32 ack_seq; /* Sequence number ACK'd */ union { struct { #define TCPCB_DELIVERED_CE_MASK ((1U<<20) - 1) /* There is space for up to 24 bytes */ __u32 is_app_limited:1, /* cwnd not fully used? */ delivered_ce:20, unused:11; /* pkts S/ACKed so far upon tx of skb, incl retrans: */ __u32 delivered; /* start of send pipeline phase */ u64 first_tx_mstamp; /* when we reached the "delivered" count */ u64 delivered_mstamp; } tx; /* only used for outgoing skbs */ union { struct inet_skb_parm h4; #if IS_ENABLED(CONFIG_IPV6) struct inet6_skb_parm h6; #endif } header; /* For incoming skbs */ }; }; #define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0])) extern const struct inet_connection_sock_af_ops ipv4_specific; #if IS_ENABLED(CONFIG_IPV6) /* This is the variant of inet6_iif() that must be used by TCP, * as TCP moves IP6CB into a different location in skb->cb[] */ static inline int tcp_v6_iif(const struct sk_buff *skb) { return TCP_SKB_CB(skb)->header.h6.iif; } static inline int tcp_v6_iif_l3_slave(const struct sk_buff *skb) { bool l3_slave = ipv6_l3mdev_skb(TCP_SKB_CB(skb)->header.h6.flags); return l3_slave ? skb->skb_iif : TCP_SKB_CB(skb)->header.h6.iif; } /* TCP_SKB_CB reference means this can not be used from early demux */ static inline int tcp_v6_sdif(const struct sk_buff *skb) { #if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) if (skb && ipv6_l3mdev_skb(TCP_SKB_CB(skb)->header.h6.flags)) return TCP_SKB_CB(skb)->header.h6.iif; #endif return 0; } extern const struct inet_connection_sock_af_ops ipv6_specific; INDIRECT_CALLABLE_DECLARE(void tcp_v6_send_check(struct sock *sk, struct sk_buff *skb)); INDIRECT_CALLABLE_DECLARE(int tcp_v6_rcv(struct sk_buff *skb)); void tcp_v6_early_demux(struct sk_buff *skb); #endif /* TCP_SKB_CB reference means this can not be used from early demux */ static inline int tcp_v4_sdif(struct sk_buff *skb) { #if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) if (skb && ipv4_l3mdev_skb(TCP_SKB_CB(skb)->header.h4.flags)) return TCP_SKB_CB(skb)->header.h4.iif; #endif return 0; } /* Due to TSO, an SKB can be composed of multiple actual * packets. To keep these tracked properly, we use this. */ static inline int tcp_skb_pcount(const struct sk_buff *skb) { return TCP_SKB_CB(skb)->tcp_gso_segs; } static inline void tcp_skb_pcount_set(struct sk_buff *skb, int segs) { TCP_SKB_CB(skb)->tcp_gso_segs = segs; } static inline void tcp_skb_pcount_add(struct sk_buff *skb, int segs) { TCP_SKB_CB(skb)->tcp_gso_segs += segs; } /* This is valid iff skb is in write queue and tcp_skb_pcount() > 1. 
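 * Editorial example (illustrative, not part of the original header): a 4380-byte
 * TSO skb built against an MSS of 1460 carries tcp_gso_segs == 3 and
 * tcp_gso_size == 1460, so tcp_skb_pcount() reports 3 packets and tcp_skb_mss()
 * the per-segment payload size.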
*/ static inline int tcp_skb_mss(const struct sk_buff *skb) { return TCP_SKB_CB(skb)->tcp_gso_size; } static inline bool tcp_skb_can_collapse_to(const struct sk_buff *skb) { return likely(!TCP_SKB_CB(skb)->eor); } static inline bool tcp_skb_can_collapse(const struct sk_buff *to, const struct sk_buff *from) { /* skb_cmp_decrypted() not needed, use tcp_write_collapse_fence() */ return likely(tcp_skb_can_collapse_to(to) && mptcp_skb_can_collapse(to, from) && skb_pure_zcopy_same(to, from) && skb_frags_readable(to) == skb_frags_readable(from)); } static inline bool tcp_skb_can_collapse_rx(const struct sk_buff *to, const struct sk_buff *from) { return likely(mptcp_skb_can_collapse(to, from) && !skb_cmp_decrypted(to, from)); } /* Events passed to congestion control interface */ enum tcp_ca_event { CA_EVENT_TX_START, /* first transmit when no packets in flight */ CA_EVENT_CWND_RESTART, /* congestion window restart */ CA_EVENT_COMPLETE_CWR, /* end of congestion recovery */ CA_EVENT_LOSS, /* loss timeout */ CA_EVENT_ECN_NO_CE, /* ECT set, but not CE marked */ CA_EVENT_ECN_IS_CE, /* received CE marked IP packet */ }; /* Information about inbound ACK, passed to cong_ops->in_ack_event() */ enum tcp_ca_ack_event_flags { CA_ACK_SLOWPATH = (1 << 0), /* In slow path processing */ CA_ACK_WIN_UPDATE = (1 << 1), /* ACK updated window */ CA_ACK_ECE = (1 << 2), /* ECE bit is set on ack */ }; /* * Interface for adding new TCP congestion control handlers */ #define TCP_CA_NAME_MAX 16 #define TCP_CA_MAX 128 #define TCP_CA_BUF_MAX (TCP_CA_NAME_MAX*TCP_CA_MAX) #define TCP_CA_UNSPEC 0 /* Algorithm can be set on socket without CAP_NET_ADMIN privileges */ #define TCP_CONG_NON_RESTRICTED 0x1 /* Requires ECN/ECT set on all packets */ #define TCP_CONG_NEEDS_ECN 0x2 #define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | TCP_CONG_NEEDS_ECN) union tcp_cc_info; struct ack_sample { u32 pkts_acked; s32 rtt_us; u32 in_flight; }; /* A rate sample measures the number of (original/retransmitted) data * packets delivered "delivered" over an interval of time "interval_us". * The tcp_rate.c code fills in the rate sample, and congestion * control modules that define a cong_control function to run at the end * of ACK processing can optionally chose to consult this sample when * setting cwnd and pacing rate. * A sample is invalid if "delivered" or "interval_us" is negative. */ struct rate_sample { u64 prior_mstamp; /* starting timestamp for interval */ u32 prior_delivered; /* tp->delivered at "prior_mstamp" */ u32 prior_delivered_ce;/* tp->delivered_ce at "prior_mstamp" */ s32 delivered; /* number of packets delivered over interval */ s32 delivered_ce; /* number of packets delivered w/ CE marks*/ long interval_us; /* time for tp->delivered to incr "delivered" */ u32 snd_interval_us; /* snd interval for delivered packets */ u32 rcv_interval_us; /* rcv interval for delivered packets */ long rtt_us; /* RTT of last (S)ACKed packet (or -1) */ int losses; /* number of packets marked lost upon ACK */ u32 acked_sacked; /* number of packets newly (S)ACKed upon ACK */ u32 prior_in_flight; /* in flight before this ACK */ u32 last_end_seq; /* end_seq of most recently ACKed packet */ bool is_app_limited; /* is sample from packet with bubble in pipe? */ bool is_retrans; /* is sample from retransmission? */ bool is_ack_delayed; /* is this (likely) a delayed ACK? 
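 * Editorial aside on struct rate_sample as a whole (illustrative, not part of the
 * original header): a cong_control() implementation in the style of BBR estimates
 * delivery rate as rs->delivered / rs->interval_us and discounts samples with
 * rs->is_app_limited set unless they raise the estimate, so an application-limited
 * interval is not misread as a loss of available bandwidth.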
*/ }; struct tcp_congestion_ops { /* fast path fields are put first to fill one cache line */ /* return slow start threshold (required) */ u32 (*ssthresh)(struct sock *sk); /* do new cwnd calculation (required) */ void (*cong_avoid)(struct sock *sk, u32 ack, u32 acked); /* call before changing ca_state (optional) */ void (*set_state)(struct sock *sk, u8 new_state); /* call when cwnd event occurs (optional) */ void (*cwnd_event)(struct sock *sk, enum tcp_ca_event ev); /* call when ack arrives (optional) */ void (*in_ack_event)(struct sock *sk, u32 flags); /* hook for packet ack accounting (optional) */ void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample); /* override sysctl_tcp_min_tso_segs */ u32 (*min_tso_segs)(struct sock *sk); /* call when packets are delivered to update cwnd and pacing rate, * after all the ca_state processing. (optional) */ void (*cong_control)(struct sock *sk, u32 ack, int flag, const struct rate_sample *rs); /* new value of cwnd after loss (required) */ u32 (*undo_cwnd)(struct sock *sk); /* returns the multiplier used in tcp_sndbuf_expand (optional) */ u32 (*sndbuf_expand)(struct sock *sk); /* control/slow paths put last */ /* get info for inet_diag (optional) */ size_t (*get_info)(struct sock *sk, u32 ext, int *attr, union tcp_cc_info *info); char name[TCP_CA_NAME_MAX]; struct module *owner; struct list_head list; u32 key; u32 flags; /* initialize private data (optional) */ void (*init)(struct sock *sk); /* cleanup private data (optional) */ void (*release)(struct sock *sk); } ____cacheline_aligned_in_smp; int tcp_register_congestion_control(struct tcp_congestion_ops *type); void tcp_unregister_congestion_control(struct tcp_congestion_ops *type); int tcp_update_congestion_control(struct tcp_congestion_ops *type, struct tcp_congestion_ops *old_type); int tcp_validate_congestion_control(struct tcp_congestion_ops *ca); void tcp_assign_congestion_control(struct sock *sk); void tcp_init_congestion_control(struct sock *sk); void tcp_cleanup_congestion_control(struct sock *sk); int tcp_set_default_congestion_control(struct net *net, const char *name); void tcp_get_default_congestion_control(struct net *net, char *name); void tcp_get_available_congestion_control(char *buf, size_t len); void tcp_get_allowed_congestion_control(char *buf, size_t len); int tcp_set_allowed_congestion_control(char *allowed); int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, bool cap_net_admin); u32 tcp_slow_start(struct tcp_sock *tp, u32 acked); void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked); u32 tcp_reno_ssthresh(struct sock *sk); u32 tcp_reno_undo_cwnd(struct sock *sk); void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked); extern struct tcp_congestion_ops tcp_reno; struct tcp_congestion_ops *tcp_ca_find(const char *name); struct tcp_congestion_ops *tcp_ca_find_key(u32 key); u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca); #ifdef CONFIG_INET char *tcp_ca_get_name_by_key(u32 key, char *buffer); #else static inline char *tcp_ca_get_name_by_key(u32 key, char *buffer) { return NULL; } #endif static inline bool tcp_ca_needs_ecn(const struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); return icsk->icsk_ca_ops->flags & TCP_CONG_NEEDS_ECN; } static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) { const struct inet_connection_sock *icsk = inet_csk(sk); if (icsk->icsk_ca_ops->cwnd_event) icsk->icsk_ca_ops->cwnd_event(sk, event); } /* From tcp_cong.c */ void 
tcp_set_ca_state(struct sock *sk, const u8 ca_state); /* From tcp_rate.c */ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb); void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, struct rate_sample *rs); void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, bool is_sack_reneg, struct rate_sample *rs); void tcp_rate_check_app_limited(struct sock *sk); static inline bool tcp_skb_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2) { return t1 > t2 || (t1 == t2 && after(seq1, seq2)); } /* These functions determine how the current flow behaves in respect of SACK * handling. SACK is negotiated with the peer, and therefore it can vary * between different flows. * * tcp_is_sack - SACK enabled * tcp_is_reno - No SACK */ static inline int tcp_is_sack(const struct tcp_sock *tp) { return likely(tp->rx_opt.sack_ok); } static inline bool tcp_is_reno(const struct tcp_sock *tp) { return !tcp_is_sack(tp); } static inline unsigned int tcp_left_out(const struct tcp_sock *tp) { return tp->sacked_out + tp->lost_out; } /* This determines how many packets are "in the network" to the best * of our knowledge. In many cases it is conservative, but where * detailed information is available from the receiver (via SACK * blocks etc.) we can make more aggressive calculations. * * Use this for decisions involving congestion control, use just * tp->packets_out to determine if the send queue is empty or not. * * Read this equation as: * * "Packets sent once on transmission queue" MINUS * "Packets left network, but not honestly ACKed yet" PLUS * "Packets fast retransmitted" */ static inline unsigned int tcp_packets_in_flight(const struct tcp_sock *tp) { return tp->packets_out - tcp_left_out(tp) + tp->retrans_out; } #define TCP_INFINITE_SSTHRESH 0x7fffffff static inline u32 tcp_snd_cwnd(const struct tcp_sock *tp) { return tp->snd_cwnd; } static inline void tcp_snd_cwnd_set(struct tcp_sock *tp, u32 val) { WARN_ON_ONCE((int)val <= 0); tp->snd_cwnd = val; } static inline bool tcp_in_slow_start(const struct tcp_sock *tp) { return tcp_snd_cwnd(tp) < tp->snd_ssthresh; } static inline bool tcp_in_initial_slowstart(const struct tcp_sock *tp) { return tp->snd_ssthresh >= TCP_INFINITE_SSTHRESH; } static inline bool tcp_in_cwnd_reduction(const struct sock *sk) { return (TCPF_CA_CWR | TCPF_CA_Recovery) & (1 << inet_csk(sk)->icsk_ca_state); } /* If cwnd > ssthresh, we may raise ssthresh to be half-way to cwnd. * The exception is cwnd reduction phase, when cwnd is decreasing towards * ssthresh. */ static inline __u32 tcp_current_ssthresh(const struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); if (tcp_in_cwnd_reduction(sk)) return tp->snd_ssthresh; else return max(tp->snd_ssthresh, ((tcp_snd_cwnd(tp) >> 1) + (tcp_snd_cwnd(tp) >> 2))); } /* Use define here intentionally to get WARN_ON location shown at the caller */ #define tcp_verify_left_out(tp) WARN_ON(tcp_left_out(tp) > tp->packets_out) void tcp_enter_cwr(struct sock *sk); __u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst); /* The maximum number of MSS of available cwnd for which TSO defers * sending if not using sysctl_tcp_tso_win_divisor. */ static inline __u32 tcp_max_tso_deferred_mss(const struct tcp_sock *tp) { return 3; } /* Returns end sequence number of the receiver's advertised window */ static inline u32 tcp_wnd_end(const struct tcp_sock *tp) { return tp->snd_una + tp->snd_wnd; } /* We follow the spirit of RFC2861 to validate cwnd but implement a more * flexible approach. 
The RFC suggests cwnd should not be raised unless * it was fully used previously. And that's exactly what we do in * congestion avoidance mode. But in slow start we allow cwnd to grow * as long as the application has used half the cwnd. * Example: * cwnd is 10 (IW10), but the application sends only 9 frames. * We allow cwnd to reach 18 when all frames are ACKed. * This check is safe because it's as aggressive as slow start, which already * risks 100% overshoot. The advantage is that we discourage the application * from sending filler packets or extra data just to inflate cwnd usage, and * allow an application-limited process to probe bandwidth more aggressively. */ static inline bool tcp_is_cwnd_limited(const struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); if (tp->is_cwnd_limited) return true; /* If in slow start, ensure cwnd grows to twice what was ACKed. */ if (tcp_in_slow_start(tp)) return tcp_snd_cwnd(tp) < 2 * tp->max_packets_out; return false; } /* BBR congestion control needs pacing. * Same remark for SO_MAX_PACING_RATE. * The sch_fq packet scheduler handles pacing efficiently, * but it is not always installed/used. * Return true if the TCP stack should pace packets itself. */ static inline bool tcp_needs_internal_pacing(const struct sock *sk) { return smp_load_acquire(&sk->sk_pacing_status) == SK_PACING_NEEDED; } /* Estimate in how many jiffies the next packet for this flow can be sent. * Scheduling a retransmit timer too early would be silly. */ static inline unsigned long tcp_pacing_delay(const struct sock *sk) { s64 delay = tcp_sk(sk)->tcp_wstamp_ns - tcp_sk(sk)->tcp_clock_cache; return delay > 0 ? nsecs_to_jiffies(delay) : 0; } static inline void tcp_reset_xmit_timer(struct sock *sk, const int what, unsigned long when, const unsigned long max_when) { inet_csk_reset_xmit_timer(sk, what, when + tcp_pacing_delay(sk), max_when); } /* Something is really bad: we could not queue an additional packet, * because the qdisc is full, the receiver sent a 0 window, or we are paced.
* We do not want to add fuel to the fire, or abort too early, * so make sure the timer we arm now is at least 200ms in the future, * regardless of current icsk_rto value (as it could be ~2ms) */ static inline unsigned long tcp_probe0_base(const struct sock *sk) { return max_t(unsigned long, inet_csk(sk)->icsk_rto, TCP_RTO_MIN); } /* Variant of inet_csk_rto_backoff() used for zero window probes */ static inline unsigned long tcp_probe0_when(const struct sock *sk, unsigned long max_when) { u8 backoff = min_t(u8, ilog2(TCP_RTO_MAX / TCP_RTO_MIN) + 1, inet_csk(sk)->icsk_backoff); u64 when = (u64)tcp_probe0_base(sk) << backoff; return (unsigned long)min_t(u64, when, max_when); } static inline void tcp_check_probe_timer(struct sock *sk) { if (!tcp_sk(sk)->packets_out && !inet_csk(sk)->icsk_pending) tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, tcp_probe0_base(sk), TCP_RTO_MAX); } static inline void tcp_init_wl(struct tcp_sock *tp, u32 seq) { tp->snd_wl1 = seq; } static inline void tcp_update_wl(struct tcp_sock *tp, u32 seq) { tp->snd_wl1 = seq; } /* * Calculate(/check) TCP checksum */ static inline __sum16 tcp_v4_check(int len, __be32 saddr, __be32 daddr, __wsum base) { return csum_tcpudp_magic(saddr, daddr, len, IPPROTO_TCP, base); } static inline bool tcp_checksum_complete(struct sk_buff *skb) { return !skb_csum_unnecessary(skb) && __skb_checksum_complete(skb); } bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason *reason); int tcp_filter(struct sock *sk, struct sk_buff *skb); void tcp_set_state(struct sock *sk, int state); void tcp_done(struct sock *sk); int tcp_abort(struct sock *sk, int err); static inline void tcp_sack_reset(struct tcp_options_received *rx_opt) { rx_opt->dsack = 0; rx_opt->num_sacks = 0; } void tcp_cwnd_restart(struct sock *sk, s32 delta); static inline void tcp_slow_start_after_idle_check(struct sock *sk) { const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; struct tcp_sock *tp = tcp_sk(sk); s32 delta; if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle) || tp->packets_out || ca_ops->cong_control) return; delta = tcp_jiffies32 - tp->lsndtime; if (delta > inet_csk(sk)->icsk_rto) tcp_cwnd_restart(sk, delta); } /* Determine a window scaling and initial window to offer. */ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss, __u32 *rcv_wnd, __u32 *window_clamp, int wscale_ok, __u8 *rcv_wscale, __u32 init_rcv_wnd); static inline int __tcp_win_from_space(u8 scaling_ratio, int space) { s64 scaled_space = (s64)space * scaling_ratio; return scaled_space >> TCP_RMEM_TO_WIN_SCALE; } static inline int tcp_win_from_space(const struct sock *sk, int space) { return __tcp_win_from_space(tcp_sk(sk)->scaling_ratio, space); } /* inverse of __tcp_win_from_space() */ static inline int __tcp_space_from_win(u8 scaling_ratio, int win) { u64 val = (u64)win << TCP_RMEM_TO_WIN_SCALE; do_div(val, scaling_ratio); return val; } static inline int tcp_space_from_win(const struct sock *sk, int win) { return __tcp_space_from_win(tcp_sk(sk)->scaling_ratio, win); } /* Assume a 50% default for skb->len/skb->truesize ratio. * This may be adjusted later in tcp_measure_rcv_mss(). 
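 * Editorial worked example: with the default ratio of (1 << (TCP_RMEM_TO_WIN_SCALE - 1)),
 * __tcp_win_from_space(ratio, space) evaluates to (space * ratio) >> TCP_RMEM_TO_WIN_SCALE,
 * i.e. space / 2, so half of the receive buffer is offered as window until the
 * ratio is re-measured.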
*/ #define TCP_DEFAULT_SCALING_RATIO (1 << (TCP_RMEM_TO_WIN_SCALE - 1)) static inline void tcp_scaling_ratio_init(struct sock *sk) { tcp_sk(sk)->scaling_ratio = TCP_DEFAULT_SCALING_RATIO; } /* Note: caller must be prepared to deal with negative returns */ static inline int tcp_space(const struct sock *sk) { return tcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf) - READ_ONCE(sk->sk_backlog.len) - atomic_read(&sk->sk_rmem_alloc)); } static inline int tcp_full_space(const struct sock *sk) { return tcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf)); } static inline void __tcp_adjust_rcv_ssthresh(struct sock *sk, u32 new_ssthresh) { int unused_mem = sk_unused_reserved_mem(sk); struct tcp_sock *tp = tcp_sk(sk); tp->rcv_ssthresh = min(tp->rcv_ssthresh, new_ssthresh); if (unused_mem) tp->rcv_ssthresh = max_t(u32, tp->rcv_ssthresh, tcp_win_from_space(sk, unused_mem)); } static inline void tcp_adjust_rcv_ssthresh(struct sock *sk) { __tcp_adjust_rcv_ssthresh(sk, 4U * tcp_sk(sk)->advmss); } void tcp_cleanup_rbuf(struct sock *sk, int copied); void __tcp_cleanup_rbuf(struct sock *sk, int copied); /* We provision sk_rcvbuf around 200% of sk_rcvlowat. * If 87.5 % (7/8) of the space has been consumed, we want to override * SO_RCVLOWAT constraint, since we are receiving skbs with too small * len/truesize ratio. */ static inline bool tcp_rmem_pressure(const struct sock *sk) { int rcvbuf, threshold; if (tcp_under_memory_pressure(sk)) return true; rcvbuf = READ_ONCE(sk->sk_rcvbuf); threshold = rcvbuf - (rcvbuf >> 3); return atomic_read(&sk->sk_rmem_alloc) > threshold; } static inline bool tcp_epollin_ready(const struct sock *sk, int target) { const struct tcp_sock *tp = tcp_sk(sk); int avail = READ_ONCE(tp->rcv_nxt) - READ_ONCE(tp->copied_seq); if (avail <= 0) return false; return (avail >= target) || tcp_rmem_pressure(sk) || (tcp_receive_window(tp) <= inet_csk(sk)->icsk_ack.rcv_mss); } extern void tcp_openreq_init_rwin(struct request_sock *req, const struct sock *sk_listener, const struct dst_entry *dst); void tcp_enter_memory_pressure(struct sock *sk); void tcp_leave_memory_pressure(struct sock *sk); static inline int keepalive_intvl_when(const struct tcp_sock *tp) { struct net *net = sock_net((struct sock *)tp); int val; /* Paired with WRITE_ONCE() in tcp_sock_set_keepintvl() * and do_tcp_setsockopt(). */ val = READ_ONCE(tp->keepalive_intvl); return val ? : READ_ONCE(net->ipv4.sysctl_tcp_keepalive_intvl); } static inline int keepalive_time_when(const struct tcp_sock *tp) { struct net *net = sock_net((struct sock *)tp); int val; /* Paired with WRITE_ONCE() in tcp_sock_set_keepidle_locked() */ val = READ_ONCE(tp->keepalive_time); return val ? : READ_ONCE(net->ipv4.sysctl_tcp_keepalive_time); } static inline int keepalive_probes(const struct tcp_sock *tp) { struct net *net = sock_net((struct sock *)tp); int val; /* Paired with WRITE_ONCE() in tcp_sock_set_keepcnt() * and do_tcp_setsockopt(). */ val = READ_ONCE(tp->keepalive_probes); return val ? : READ_ONCE(net->ipv4.sysctl_tcp_keepalive_probes); } static inline u32 keepalive_time_elapsed(const struct tcp_sock *tp) { const struct inet_connection_sock *icsk = &tp->inet_conn; return min_t(u32, tcp_jiffies32 - icsk->icsk_ack.lrcvtime, tcp_jiffies32 - tp->rcv_tstamp); } static inline int tcp_fin_time(const struct sock *sk) { int fin_timeout = tcp_sk(sk)->linger2 ? 
: READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fin_timeout); const int rto = inet_csk(sk)->icsk_rto; if (fin_timeout < (rto << 2) - (rto >> 1)) fin_timeout = (rto << 2) - (rto >> 1); return fin_timeout; } static inline bool tcp_paws_check(const struct tcp_options_received *rx_opt, int paws_win) { if ((s32)(rx_opt->ts_recent - rx_opt->rcv_tsval) <= paws_win) return true; if (unlikely(!time_before32(ktime_get_seconds(), rx_opt->ts_recent_stamp + TCP_PAWS_WRAP))) return true; /* * Some OSes send SYN and SYNACK messages with tsval=0 tsecr=0, * then following tcp messages have valid values. Ignore 0 value, * or else 'negative' tsval might forbid us to accept their packets. */ if (!rx_opt->ts_recent) return true; return false; } static inline bool tcp_paws_reject(const struct tcp_options_received *rx_opt, int rst) { if (tcp_paws_check(rx_opt, 0)) return false; /* RST segments are not recommended to carry timestamp, and, if they do, it is recommended to ignore PAWS because "their cleanup function should take precedence over timestamps." Certainly, it is mistake. It is necessary to understand the reasons of this constraint to relax it: if peer reboots, clock may go out-of-sync and half-open connections will not be reset. Actually, the problem would be not existing if all the implementations followed draft about maintaining clock via reboots. Linux-2.2 DOES NOT! However, we can relax time bounds for RST segments to MSL. */ if (rst && !time_before32(ktime_get_seconds(), rx_opt->ts_recent_stamp + TCP_PAWS_MSL)) return false; return true; } bool tcp_oow_rate_limited(struct net *net, const struct sk_buff *skb, int mib_idx, u32 *last_oow_ack_time); static inline void tcp_mib_init(struct net *net) { /* See RFC 2012 */ TCP_ADD_STATS(net, TCP_MIB_RTOALGORITHM, 1); TCP_ADD_STATS(net, TCP_MIB_RTOMIN, TCP_RTO_MIN*1000/HZ); TCP_ADD_STATS(net, TCP_MIB_RTOMAX, TCP_RTO_MAX*1000/HZ); TCP_ADD_STATS(net, TCP_MIB_MAXCONN, -1); } /* from STCP */ static inline void tcp_clear_retrans_hints_partial(struct tcp_sock *tp) { tp->lost_skb_hint = NULL; } static inline void tcp_clear_all_retrans_hints(struct tcp_sock *tp) { tcp_clear_retrans_hints_partial(tp); tp->retransmit_skb_hint = NULL; } #define tcp_md5_addr tcp_ao_addr /* - key database */ struct tcp_md5sig_key { struct hlist_node node; u8 keylen; u8 family; /* AF_INET or AF_INET6 */ u8 prefixlen; u8 flags; union tcp_md5_addr addr; int l3index; /* set if key added with L3 scope */ u8 key[TCP_MD5SIG_MAXKEYLEN]; struct rcu_head rcu; }; /* - sock block */ struct tcp_md5sig_info { struct hlist_head head; struct rcu_head rcu; }; /* - pseudo header */ struct tcp4_pseudohdr { __be32 saddr; __be32 daddr; __u8 pad; __u8 protocol; __be16 len; }; struct tcp6_pseudohdr { struct in6_addr saddr; struct in6_addr daddr; __be32 len; __be32 protocol; /* including padding */ }; union tcp_md5sum_block { struct tcp4_pseudohdr ip4; #if IS_ENABLED(CONFIG_IPV6) struct tcp6_pseudohdr ip6; #endif }; /* * struct tcp_sigpool - per-CPU pool of ahash_requests * @scratch: per-CPU temporary area, that can be used between * tcp_sigpool_start() and tcp_sigpool_end() to perform * crypto request * @req: pre-allocated ahash request */ struct tcp_sigpool { void *scratch; struct ahash_request *req; }; int tcp_sigpool_alloc_ahash(const char *alg, size_t scratch_size); void tcp_sigpool_get(unsigned int id); void tcp_sigpool_release(unsigned int id); int tcp_sigpool_hash_skb_data(struct tcp_sigpool *hp, const struct sk_buff *skb, unsigned int header_len); /** * tcp_sigpool_start - disable bh and start using 
tcp_sigpool_ahash * @id: tcp_sigpool that was previously allocated by tcp_sigpool_alloc_ahash() * @c: returned tcp_sigpool for usage (uninitialized on failure) * * Returns 0 on success, error otherwise. */ int tcp_sigpool_start(unsigned int id, struct tcp_sigpool *c); /** * tcp_sigpool_end - enable bh and stop using tcp_sigpool * @c: tcp_sigpool context that was returned by tcp_sigpool_start() */ void tcp_sigpool_end(struct tcp_sigpool *c); size_t tcp_sigpool_algo(unsigned int id, char *buf, size_t buf_len); /* - functions */ int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, const struct sock *sk, const struct sk_buff *skb); int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, int family, u8 prefixlen, int l3index, u8 flags, const u8 *newkey, u8 newkeylen); int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr, int family, u8 prefixlen, int l3index, struct tcp_md5sig_key *key); int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family, u8 prefixlen, int l3index, u8 flags); void tcp_clear_md5_list(struct sock *sk); struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk, const struct sock *addr_sk); #ifdef CONFIG_TCP_MD5SIG struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index, const union tcp_md5_addr *addr, int family, bool any_l3index); static inline struct tcp_md5sig_key * tcp_md5_do_lookup(const struct sock *sk, int l3index, const union tcp_md5_addr *addr, int family) { if (!static_branch_unlikely(&tcp_md5_needed.key)) return NULL; return __tcp_md5_do_lookup(sk, l3index, addr, family, false); } static inline struct tcp_md5sig_key * tcp_md5_do_lookup_any_l3index(const struct sock *sk, const union tcp_md5_addr *addr, int family) { if (!static_branch_unlikely(&tcp_md5_needed.key)) return NULL; return __tcp_md5_do_lookup(sk, 0, addr, family, true); } #define tcp_twsk_md5_key(twsk) ((twsk)->tw_md5_key) #else static inline struct tcp_md5sig_key * tcp_md5_do_lookup(const struct sock *sk, int l3index, const union tcp_md5_addr *addr, int family) { return NULL; } static inline struct tcp_md5sig_key * tcp_md5_do_lookup_any_l3index(const struct sock *sk, const union tcp_md5_addr *addr, int family) { return NULL; } #define tcp_twsk_md5_key(twsk) NULL #endif int tcp_md5_alloc_sigpool(void); void tcp_md5_release_sigpool(void); void tcp_md5_add_sigpool(void); extern int tcp_md5_sigpool_id; int tcp_md5_hash_key(struct tcp_sigpool *hp, const struct tcp_md5sig_key *key); /* From tcp_fastopen.c */ void tcp_fastopen_cache_get(struct sock *sk, u16 *mss, struct tcp_fastopen_cookie *cookie); void tcp_fastopen_cache_set(struct sock *sk, u16 mss, struct tcp_fastopen_cookie *cookie, bool syn_lost, u16 try_exp); struct tcp_fastopen_request { /* Fast Open cookie. 
Size 0 means a cookie request */ struct tcp_fastopen_cookie cookie; struct msghdr *data; /* data in MSG_FASTOPEN */ size_t size; int copied; /* queued in tcp_connect() */ struct ubuf_info *uarg; }; void tcp_free_fastopen_req(struct tcp_sock *tp); void tcp_fastopen_destroy_cipher(struct sock *sk); void tcp_fastopen_ctx_destroy(struct net *net); int tcp_fastopen_reset_cipher(struct net *net, struct sock *sk, void *primary_key, void *backup_key); int tcp_fastopen_get_cipher(struct net *net, struct inet_connection_sock *icsk, u64 *key); void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb); struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct tcp_fastopen_cookie *foc, const struct dst_entry *dst); void tcp_fastopen_init_key_once(struct net *net); bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss, struct tcp_fastopen_cookie *cookie); bool tcp_fastopen_defer_connect(struct sock *sk, int *err); #define TCP_FASTOPEN_KEY_LENGTH sizeof(siphash_key_t) #define TCP_FASTOPEN_KEY_MAX 2 #define TCP_FASTOPEN_KEY_BUF_LENGTH \ (TCP_FASTOPEN_KEY_LENGTH * TCP_FASTOPEN_KEY_MAX) /* Fastopen key context */ struct tcp_fastopen_context { siphash_key_t key[TCP_FASTOPEN_KEY_MAX]; int num; struct rcu_head rcu; }; void tcp_fastopen_active_disable(struct sock *sk); bool tcp_fastopen_active_should_disable(struct sock *sk); void tcp_fastopen_active_disable_ofo_check(struct sock *sk); void tcp_fastopen_active_detect_blackhole(struct sock *sk, bool expired); /* Caller needs to wrap with rcu_read_(un)lock() */ static inline struct tcp_fastopen_context *tcp_fastopen_get_ctx(const struct sock *sk) { struct tcp_fastopen_context *ctx; ctx = rcu_dereference(inet_csk(sk)->icsk_accept_queue.fastopenq.ctx); if (!ctx) ctx = rcu_dereference(sock_net(sk)->ipv4.tcp_fastopen_ctx); return ctx; } static inline bool tcp_fastopen_cookie_match(const struct tcp_fastopen_cookie *foc, const struct tcp_fastopen_cookie *orig) { if (orig->len == TCP_FASTOPEN_COOKIE_SIZE && orig->len == foc->len && !memcmp(orig->val, foc->val, foc->len)) return true; return false; } static inline int tcp_fastopen_context_len(const struct tcp_fastopen_context *ctx) { return ctx->num; } /* Latencies incurred by various limits for a sender. They are * chronograph-like stats that are mutually exclusive. 
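 * Editorial example (assumption about how the stats surface): a flow with a
 * non-empty write queue accrues TCP_CHRONO_BUSY time; if it then stalls on a
 * closed peer window, tcp_chrono_start()/tcp_chrono_stop() switch the accounting
 * to TCP_CHRONO_RWND_LIMITED, and the accumulated durations are reported to
 * applications via tcp_get_info().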
*/ enum tcp_chrono { TCP_CHRONO_UNSPEC, TCP_CHRONO_BUSY, /* Actively sending data (non-empty write queue) */ TCP_CHRONO_RWND_LIMITED, /* Stalled by insufficient receive window */ TCP_CHRONO_SNDBUF_LIMITED, /* Stalled by insufficient send buffer */ __TCP_CHRONO_MAX, }; void tcp_chrono_start(struct sock *sk, const enum tcp_chrono type); void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type); /* This helper is needed, because skb->tcp_tsorted_anchor uses * the same memory storage than skb->destructor/_skb_refdst */ static inline void tcp_skb_tsorted_anchor_cleanup(struct sk_buff *skb) { skb->destructor = NULL; skb->_skb_refdst = 0UL; } #define tcp_skb_tsorted_save(skb) { \ unsigned long _save = skb->_skb_refdst; \ skb->_skb_refdst = 0UL; #define tcp_skb_tsorted_restore(skb) \ skb->_skb_refdst = _save; \ } void tcp_write_queue_purge(struct sock *sk); static inline struct sk_buff *tcp_rtx_queue_head(const struct sock *sk) { return skb_rb_first(&sk->tcp_rtx_queue); } static inline struct sk_buff *tcp_rtx_queue_tail(const struct sock *sk) { return skb_rb_last(&sk->tcp_rtx_queue); } static inline struct sk_buff *tcp_write_queue_tail(const struct sock *sk) { return skb_peek_tail(&sk->sk_write_queue); } #define tcp_for_write_queue_from_safe(skb, tmp, sk) \ skb_queue_walk_from_safe(&(sk)->sk_write_queue, skb, tmp) static inline struct sk_buff *tcp_send_head(const struct sock *sk) { return skb_peek(&sk->sk_write_queue); } static inline bool tcp_skb_is_last(const struct sock *sk, const struct sk_buff *skb) { return skb_queue_is_last(&sk->sk_write_queue, skb); } /** * tcp_write_queue_empty - test if any payload (or FIN) is available in write queue * @sk: socket * * Since the write queue can have a temporary empty skb in it, * we must not use "return skb_queue_empty(&sk->sk_write_queue)" */ static inline bool tcp_write_queue_empty(const struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); return tp->write_seq == tp->snd_nxt; } static inline bool tcp_rtx_queue_empty(const struct sock *sk) { return RB_EMPTY_ROOT(&sk->tcp_rtx_queue); } static inline bool tcp_rtx_and_write_queues_empty(const struct sock *sk) { return tcp_rtx_queue_empty(sk) && tcp_write_queue_empty(sk); } static inline void tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb) { __skb_queue_tail(&sk->sk_write_queue, skb); /* Queue it, remembering where we must start sending. */ if (sk->sk_write_queue.next == skb) tcp_chrono_start(sk, TCP_CHRONO_BUSY); } /* Insert new before skb on the write queue of sk. 
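 * (Editorial note, believed usage: MTU probing in tcp_output.c builds a probe skb
 * and queues it ahead of an existing one with this helper.)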
*/ static inline void tcp_insert_write_queue_before(struct sk_buff *new, struct sk_buff *skb, struct sock *sk) { __skb_queue_before(&sk->sk_write_queue, skb, new); } static inline void tcp_unlink_write_queue(struct sk_buff *skb, struct sock *sk) { tcp_skb_tsorted_anchor_cleanup(skb); __skb_unlink(skb, &sk->sk_write_queue); } void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb); static inline void tcp_rtx_queue_unlink(struct sk_buff *skb, struct sock *sk) { tcp_skb_tsorted_anchor_cleanup(skb); rb_erase(&skb->rbnode, &sk->tcp_rtx_queue); } static inline void tcp_rtx_queue_unlink_and_free(struct sk_buff *skb, struct sock *sk) { list_del(&skb->tcp_tsorted_anchor); tcp_rtx_queue_unlink(skb, sk); tcp_wmem_free_skb(sk, skb); } static inline void tcp_write_collapse_fence(struct sock *sk) { struct sk_buff *skb = tcp_write_queue_tail(sk); if (skb) TCP_SKB_CB(skb)->eor = 1; } static inline void tcp_push_pending_frames(struct sock *sk) { if (tcp_send_head(sk)) { struct tcp_sock *tp = tcp_sk(sk); __tcp_push_pending_frames(sk, tcp_current_mss(sk), tp->nonagle); } } /* Start sequence of the skb just after the highest skb with SACKed * bit, valid only if sacked_out > 0 or when the caller has ensured * validity by itself. */ static inline u32 tcp_highest_sack_seq(struct tcp_sock *tp) { if (!tp->sacked_out) return tp->snd_una; if (tp->highest_sack == NULL) return tp->snd_nxt; return TCP_SKB_CB(tp->highest_sack)->seq; } static inline void tcp_advance_highest_sack(struct sock *sk, struct sk_buff *skb) { tcp_sk(sk)->highest_sack = skb_rb_next(skb); } static inline struct sk_buff *tcp_highest_sack(struct sock *sk) { return tcp_sk(sk)->highest_sack; } static inline void tcp_highest_sack_reset(struct sock *sk) { tcp_sk(sk)->highest_sack = tcp_rtx_queue_head(sk); } /* Called when old skb is about to be deleted and replaced by new skb */ static inline void tcp_highest_sack_replace(struct sock *sk, struct sk_buff *old, struct sk_buff *new) { if (old == tcp_highest_sack(sk)) tcp_sk(sk)->highest_sack = new; } /* This helper checks if socket has IP_TRANSPARENT set */ static inline bool inet_sk_transparent(const struct sock *sk) { switch (sk->sk_state) { case TCP_TIME_WAIT: return inet_twsk(sk)->tw_transparent; case TCP_NEW_SYN_RECV: return inet_rsk(inet_reqsk(sk))->no_srccheck; } return inet_test_bit(TRANSPARENT, sk); } /* Determines whether this is a thin stream (which may suffer from * increased latency). Used to trigger latency-reducing mechanisms. 
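 * Editorial example: with fewer than 4 packets in flight a single loss may never
 * produce enough duplicate ACKs to trigger fast retransmit, which is why
 * thin-stream handling (e.g. the TCP_THIN_LINEAR_TIMEOUTS socket option) falls
 * back to linear RTO retransmissions instead of immediate exponential backoff.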
*/ static inline bool tcp_stream_is_thin(struct tcp_sock *tp) { return tp->packets_out < 4 && !tcp_in_initial_slowstart(tp); } /* /proc */ enum tcp_seq_states { TCP_SEQ_STATE_LISTENING, TCP_SEQ_STATE_ESTABLISHED, }; void *tcp_seq_start(struct seq_file *seq, loff_t *pos); void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos); void tcp_seq_stop(struct seq_file *seq, void *v); struct tcp_seq_afinfo { sa_family_t family; }; struct tcp_iter_state { struct seq_net_private p; enum tcp_seq_states state; struct sock *syn_wait_sk; int bucket, offset, sbucket, num; loff_t last_pos; }; extern struct request_sock_ops tcp_request_sock_ops; extern struct request_sock_ops tcp6_request_sock_ops; void tcp_v4_destroy_sock(struct sock *sk); struct sk_buff *tcp_gso_segment(struct sk_buff *skb, netdev_features_t features); struct tcphdr *tcp_gro_pull_header(struct sk_buff *skb); struct sk_buff *tcp_gro_lookup(struct list_head *head, struct tcphdr *th); struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb, struct tcphdr *th); INDIRECT_CALLABLE_DECLARE(int tcp4_gro_complete(struct sk_buff *skb, int thoff)); INDIRECT_CALLABLE_DECLARE(struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb)); INDIRECT_CALLABLE_DECLARE(int tcp6_gro_complete(struct sk_buff *skb, int thoff)); INDIRECT_CALLABLE_DECLARE(struct sk_buff *tcp6_gro_receive(struct list_head *head, struct sk_buff *skb)); #ifdef CONFIG_INET void tcp_gro_complete(struct sk_buff *skb); #else static inline void tcp_gro_complete(struct sk_buff *skb) { } #endif void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr); static inline u32 tcp_notsent_lowat(const struct tcp_sock *tp) { struct net *net = sock_net((struct sock *)tp); u32 val; val = READ_ONCE(tp->notsent_lowat); return val ?: READ_ONCE(net->ipv4.sysctl_tcp_notsent_lowat); } bool tcp_stream_memory_free(const struct sock *sk, int wake); #ifdef CONFIG_PROC_FS int tcp4_proc_init(void); void tcp4_proc_exit(void); #endif int tcp_rtx_synack(const struct sock *sk, struct request_sock *req); int tcp_conn_request(struct request_sock_ops *rsk_ops, const struct tcp_request_sock_ops *af_ops, struct sock *sk, struct sk_buff *skb); /* TCP af-specific functions */ struct tcp_sock_af_ops { #ifdef CONFIG_TCP_MD5SIG struct tcp_md5sig_key *(*md5_lookup) (const struct sock *sk, const struct sock *addr_sk); int (*calc_md5_hash)(char *location, const struct tcp_md5sig_key *md5, const struct sock *sk, const struct sk_buff *skb); int (*md5_parse)(struct sock *sk, int optname, sockptr_t optval, int optlen); #endif #ifdef CONFIG_TCP_AO int (*ao_parse)(struct sock *sk, int optname, sockptr_t optval, int optlen); struct tcp_ao_key *(*ao_lookup)(const struct sock *sk, struct sock *addr_sk, int sndid, int rcvid); int (*ao_calc_key_sk)(struct tcp_ao_key *mkt, u8 *key, const struct sock *sk, __be32 sisn, __be32 disn, bool send); int (*calc_ao_hash)(char *location, struct tcp_ao_key *ao, const struct sock *sk, const struct sk_buff *skb, const u8 *tkey, int hash_offset, u32 sne); #endif }; struct tcp_request_sock_ops { u16 mss_clamp; #ifdef CONFIG_TCP_MD5SIG struct tcp_md5sig_key *(*req_md5_lookup)(const struct sock *sk, const struct sock *addr_sk); int (*calc_md5_hash) (char *location, const struct tcp_md5sig_key *md5, const struct sock *sk, const struct sk_buff *skb); #endif #ifdef CONFIG_TCP_AO struct tcp_ao_key *(*ao_lookup)(const struct sock *sk, struct request_sock *req, int sndid, int rcvid); int (*ao_calc_key)(struct tcp_ao_key *mkt, u8 *key, struct request_sock 
*sk); int (*ao_synack_hash)(char *ao_hash, struct tcp_ao_key *mkt, struct request_sock *req, const struct sk_buff *skb, int hash_offset, u32 sne); #endif #ifdef CONFIG_SYN_COOKIES __u32 (*cookie_init_seq)(const struct sk_buff *skb, __u16 *mss); #endif struct dst_entry *(*route_req)(const struct sock *sk, struct sk_buff *skb, struct flowi *fl, struct request_sock *req, u32 tw_isn); u32 (*init_seq)(const struct sk_buff *skb); u32 (*init_ts_off)(const struct net *net, const struct sk_buff *skb); int (*send_synack)(const struct sock *sk, struct dst_entry *dst, struct flowi *fl, struct request_sock *req, struct tcp_fastopen_cookie *foc, enum tcp_synack_type synack_type, struct sk_buff *syn_skb); }; extern const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops; #if IS_ENABLED(CONFIG_IPV6) extern const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops; #endif #ifdef CONFIG_SYN_COOKIES static inline __u32 cookie_init_sequence(const struct tcp_request_sock_ops *ops, const struct sock *sk, struct sk_buff *skb, __u16 *mss) { tcp_synq_overflow(sk); __NET_INC_STATS(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT); return ops->cookie_init_seq(skb, mss); } #else static inline __u32 cookie_init_sequence(const struct tcp_request_sock_ops *ops, const struct sock *sk, struct sk_buff *skb, __u16 *mss) { return 0; } #endif struct tcp_key { union { struct { struct tcp_ao_key *ao_key; char *traffic_key; u32 sne; u8 rcv_next; }; struct tcp_md5sig_key *md5_key; }; enum { TCP_KEY_NONE = 0, TCP_KEY_MD5, TCP_KEY_AO, } type; }; static inline void tcp_get_current_key(const struct sock *sk, struct tcp_key *out) { #if defined(CONFIG_TCP_AO) || defined(CONFIG_TCP_MD5SIG) const struct tcp_sock *tp = tcp_sk(sk); #endif #ifdef CONFIG_TCP_AO if (static_branch_unlikely(&tcp_ao_needed.key)) { struct tcp_ao_info *ao; ao = rcu_dereference_protected(tp->ao_info, lockdep_sock_is_held(sk)); if (ao) { out->ao_key = READ_ONCE(ao->current_key); out->type = TCP_KEY_AO; return; } } #endif #ifdef CONFIG_TCP_MD5SIG if (static_branch_unlikely(&tcp_md5_needed.key) && rcu_access_pointer(tp->md5sig_info)) { out->md5_key = tp->af_specific->md5_lookup(sk, sk); if (out->md5_key) { out->type = TCP_KEY_MD5; return; } } #endif out->type = TCP_KEY_NONE; } static inline bool tcp_key_is_md5(const struct tcp_key *key) { if (static_branch_tcp_md5()) return key->type == TCP_KEY_MD5; return false; } static inline bool tcp_key_is_ao(const struct tcp_key *key) { if (static_branch_tcp_ao()) return key->type == TCP_KEY_AO; return false; } int tcpv4_offload_init(void); void tcp_v4_init(void); void tcp_init(void); /* tcp_recovery.c */ void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb); void tcp_newreno_mark_lost(struct sock *sk, bool snd_una_advanced); extern s32 tcp_rack_skb_timeout(struct tcp_sock *tp, struct sk_buff *skb, u32 reo_wnd); extern bool tcp_rack_mark_lost(struct sock *sk); extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq, u64 xmit_time); extern void tcp_rack_reo_timeout(struct sock *sk); extern void tcp_rack_update_reo_wnd(struct sock *sk, struct rate_sample *rs); /* tcp_plb.c */ /* * Scaling factor for fractions in PLB. For example, tcp_plb_update_state * expects cong_ratio which represents fraction of traffic that experienced * congestion over a single RTT. In order to avoid floating point operations, * this fraction should be mapped to (1 << TCP_PLB_SCALE) and passed in. */ #define TCP_PLB_SCALE 8 /* State for PLB (Protective Load Balancing) for a single TCP connection. 
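 * Editorial worked example: with TCP_PLB_SCALE == 8, a round where half of the
 * delivered packets were CE-marked maps to cong_ratio == (1 << 8) / 2 == 128
 * when passed to tcp_plb_update_state().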
*/ struct tcp_plb_state { u8 consec_cong_rounds:5, /* consecutive congested rounds */ unused:3; u32 pause_until; /* jiffies32 when PLB can resume rerouting */ }; static inline void tcp_plb_init(const struct sock *sk, struct tcp_plb_state *plb) { plb->consec_cong_rounds = 0; plb->pause_until = 0; } void tcp_plb_update_state(const struct sock *sk, struct tcp_plb_state *plb, const int cong_ratio); void tcp_plb_check_rehash(struct sock *sk, struct tcp_plb_state *plb); void tcp_plb_update_state_upon_rto(struct sock *sk, struct tcp_plb_state *plb); static inline void tcp_warn_once(const struct sock *sk, bool cond, const char *str) { WARN_ONCE(cond, "%scwn:%u out:%u sacked:%u lost:%u retrans:%u tlp_high_seq:%u sk_state:%u ca_state:%u advmss:%u mss_cache:%u pmtu:%u\n", str, tcp_snd_cwnd(tcp_sk(sk)), tcp_sk(sk)->packets_out, tcp_sk(sk)->sacked_out, tcp_sk(sk)->lost_out, tcp_sk(sk)->retrans_out, tcp_sk(sk)->tlp_high_seq, sk->sk_state, inet_csk(sk)->icsk_ca_state, tcp_sk(sk)->advmss, tcp_sk(sk)->mss_cache, inet_csk(sk)->icsk_pmtu_cookie); } /* At how many usecs into the future should the RTO fire? */ static inline s64 tcp_rto_delta_us(const struct sock *sk) { const struct sk_buff *skb = tcp_rtx_queue_head(sk); u32 rto = inet_csk(sk)->icsk_rto; if (likely(skb)) { u64 rto_time_stamp_us = tcp_skb_timestamp_us(skb) + jiffies_to_usecs(rto); return rto_time_stamp_us - tcp_sk(sk)->tcp_mstamp; } else { tcp_warn_once(sk, 1, "rtx queue empty: "); return jiffies_to_usecs(rto); } } /* * Save and compile IPv4 options, return a pointer to it */ static inline struct ip_options_rcu *tcp_v4_save_options(struct net *net, struct sk_buff *skb) { const struct ip_options *opt = &TCP_SKB_CB(skb)->header.h4.opt; struct ip_options_rcu *dopt = NULL; if (opt->optlen) { int opt_size = sizeof(*dopt) + opt->optlen; dopt = kmalloc(opt_size, GFP_ATOMIC); if (dopt && __ip_options_echo(net, &dopt->opt, skb, opt)) { kfree(dopt); dopt = NULL; } } return dopt; } /* locally generated TCP pure ACKs have skb->truesize == 2 * (check tcp_send_ack() in net/ipv4/tcp_output.c ) * This is much faster than dissecting the packet to find out. * (Think of GRE encapsulations, IPv4, IPv6, ...) */ static inline bool skb_is_tcp_pure_ack(const struct sk_buff *skb) { return skb->truesize == 2; } static inline void skb_set_tcp_pure_ack(struct sk_buff *skb) { skb->truesize = 2; } static inline int tcp_inq(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); int answ; if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { answ = 0; } else if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data || before(tp->urg_seq, tp->copied_seq) || !before(tp->urg_seq, tp->rcv_nxt)) { answ = tp->rcv_nxt - tp->copied_seq; /* Subtract 1, if FIN was received */ if (answ && sock_flag(sk, SOCK_DONE)) answ--; } else { answ = tp->urg_seq - tp->copied_seq; } return answ; } int tcp_peek_len(struct socket *sock); static inline void tcp_segs_in(struct tcp_sock *tp, const struct sk_buff *skb) { u16 segs_in; segs_in = max_t(u16, 1, skb_shinfo(skb)->gso_segs); /* We update these fields while other threads might * read them from tcp_get_info() */ WRITE_ONCE(tp->segs_in, tp->segs_in + segs_in); if (skb->len > tcp_hdrlen(skb)) WRITE_ONCE(tp->data_segs_in, tp->data_segs_in + segs_in); } /* * TCP listen path runs lockless. * We forced "struct sock" to be const qualified to make sure * we don't modify one of its field by mistake. * Here, we increment sk_drops which is an atomic_t, so we can safely * make sock writable again. 
*/ static inline void tcp_listendrop(const struct sock *sk) { atomic_inc(&((struct sock *)sk)->sk_drops); __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENDROPS); } enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer); /* * Interface for adding Upper Level Protocols over TCP */ #define TCP_ULP_NAME_MAX 16 #define TCP_ULP_MAX 128 #define TCP_ULP_BUF_MAX (TCP_ULP_NAME_MAX*TCP_ULP_MAX) struct tcp_ulp_ops { struct list_head list; /* initialize ulp */ int (*init)(struct sock *sk); /* update ulp */ void (*update)(struct sock *sk, struct proto *p, void (*write_space)(struct sock *sk)); /* cleanup ulp */ void (*release)(struct sock *sk); /* diagnostic */ int (*get_info)(struct sock *sk, struct sk_buff *skb); size_t (*get_info_size)(const struct sock *sk); /* clone ulp */ void (*clone)(const struct request_sock *req, struct sock *newsk, const gfp_t priority); char name[TCP_ULP_NAME_MAX]; struct module *owner; }; int tcp_register_ulp(struct tcp_ulp_ops *type); void tcp_unregister_ulp(struct tcp_ulp_ops *type); int tcp_set_ulp(struct sock *sk, const char *name); void tcp_get_available_ulp(char *buf, size_t len); void tcp_cleanup_ulp(struct sock *sk); void tcp_update_ulp(struct sock *sk, struct proto *p, void (*write_space)(struct sock *sk)); #define MODULE_ALIAS_TCP_ULP(name) \ __MODULE_INFO(alias, alias_userspace, name); \ __MODULE_INFO(alias, alias_tcp_ulp, "tcp-ulp-" name) #ifdef CONFIG_NET_SOCK_MSG struct sk_msg; struct sk_psock; #ifdef CONFIG_BPF_SYSCALL int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore); void tcp_bpf_clone(const struct sock *sk, struct sock *newsk); #endif /* CONFIG_BPF_SYSCALL */ #ifdef CONFIG_INET void tcp_eat_skb(struct sock *sk, struct sk_buff *skb); #else static inline void tcp_eat_skb(struct sock *sk, struct sk_buff *skb) { } #endif int tcp_bpf_sendmsg_redir(struct sock *sk, bool ingress, struct sk_msg *msg, u32 bytes, int flags); #endif /* CONFIG_NET_SOCK_MSG */ #if !defined(CONFIG_BPF_SYSCALL) || !defined(CONFIG_NET_SOCK_MSG) static inline void tcp_bpf_clone(const struct sock *sk, struct sock *newsk) { } #endif #ifdef CONFIG_CGROUP_BPF static inline void bpf_skops_init_skb(struct bpf_sock_ops_kern *skops, struct sk_buff *skb, unsigned int end_offset) { skops->skb = skb; skops->skb_data_end = skb->data + end_offset; } #else static inline void bpf_skops_init_skb(struct bpf_sock_ops_kern *skops, struct sk_buff *skb, unsigned int end_offset) { } #endif /* Call BPF_SOCK_OPS program that returns an int. If the return value * is < 0, then the BPF op failed (for example if the loaded BPF * program does not support the chosen operation or there is no BPF * program loaded). 
*/ #ifdef CONFIG_BPF static inline int tcp_call_bpf(struct sock *sk, int op, u32 nargs, u32 *args) { struct bpf_sock_ops_kern sock_ops; int ret; memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); if (sk_fullsock(sk)) { sock_ops.is_fullsock = 1; sock_owned_by_me(sk); } sock_ops.sk = sk; sock_ops.op = op; if (nargs > 0) memcpy(sock_ops.args, args, nargs * sizeof(*args)); ret = BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops); if (ret == 0) ret = sock_ops.reply; else ret = -1; return ret; } static inline int tcp_call_bpf_2arg(struct sock *sk, int op, u32 arg1, u32 arg2) { u32 args[2] = {arg1, arg2}; return tcp_call_bpf(sk, op, 2, args); } static inline int tcp_call_bpf_3arg(struct sock *sk, int op, u32 arg1, u32 arg2, u32 arg3) { u32 args[3] = {arg1, arg2, arg3}; return tcp_call_bpf(sk, op, 3, args); } #else static inline int tcp_call_bpf(struct sock *sk, int op, u32 nargs, u32 *args) { return -EPERM; } static inline int tcp_call_bpf_2arg(struct sock *sk, int op, u32 arg1, u32 arg2) { return -EPERM; } static inline int tcp_call_bpf_3arg(struct sock *sk, int op, u32 arg1, u32 arg2, u32 arg3) { return -EPERM; } #endif static inline u32 tcp_timeout_init(struct sock *sk) { int timeout; timeout = tcp_call_bpf(sk, BPF_SOCK_OPS_TIMEOUT_INIT, 0, NULL); if (timeout <= 0) timeout = TCP_TIMEOUT_INIT; return min_t(int, timeout, TCP_RTO_MAX); } static inline u32 tcp_rwnd_init_bpf(struct sock *sk) { int rwnd; rwnd = tcp_call_bpf(sk, BPF_SOCK_OPS_RWND_INIT, 0, NULL); if (rwnd < 0) rwnd = 0; return rwnd; } static inline bool tcp_bpf_ca_needs_ecn(struct sock *sk) { return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN, 0, NULL) == 1); } static inline void tcp_bpf_rtt(struct sock *sk, long mrtt, u32 srtt) { if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_RTT_CB_FLAG)) tcp_call_bpf_2arg(sk, BPF_SOCK_OPS_RTT_CB, mrtt, srtt); } #if IS_ENABLED(CONFIG_SMC) extern struct static_key_false tcp_have_smc; #endif #if IS_ENABLED(CONFIG_TLS_DEVICE) void clean_acked_data_enable(struct inet_connection_sock *icsk, void (*cad)(struct sock *sk, u32 ack_seq)); void clean_acked_data_disable(struct inet_connection_sock *icsk); void clean_acked_data_flush(void); #endif DECLARE_STATIC_KEY_FALSE(tcp_tx_delay_enabled); static inline void tcp_add_tx_delay(struct sk_buff *skb, const struct tcp_sock *tp) { if (static_branch_unlikely(&tcp_tx_delay_enabled)) skb->skb_mstamp_ns += (u64)tp->tcp_tx_delay * NSEC_PER_USEC; } /* Compute Earliest Departure Time for some control packets * like ACK or RST for TIME_WAIT or non ESTABLISHED sockets. */ static inline u64 tcp_transmit_time(const struct sock *sk) { if (static_branch_unlikely(&tcp_tx_delay_enabled)) { u32 delay = (sk->sk_state == TCP_TIME_WAIT) ? 
tcp_twsk(sk)->tw_tx_delay : tcp_sk(sk)->tcp_tx_delay; return tcp_clock_ns() + (u64)delay * NSEC_PER_USEC; } return 0; } static inline int tcp_parse_auth_options(const struct tcphdr *th, const u8 **md5_hash, const struct tcp_ao_hdr **aoh) { const u8 *md5_tmp, *ao_tmp; int ret; ret = tcp_do_parse_auth_options(th, &md5_tmp, &ao_tmp); if (ret) return ret; if (md5_hash) *md5_hash = md5_tmp; if (aoh) { if (!ao_tmp) *aoh = NULL; else *aoh = (struct tcp_ao_hdr *)(ao_tmp - 2); } return 0; } static inline bool tcp_ao_required(struct sock *sk, const void *saddr, int family, int l3index, bool stat_inc) { #ifdef CONFIG_TCP_AO struct tcp_ao_info *ao_info; struct tcp_ao_key *ao_key; if (!static_branch_unlikely(&tcp_ao_needed.key)) return false; ao_info = rcu_dereference_check(tcp_sk(sk)->ao_info, lockdep_sock_is_held(sk)); if (!ao_info) return false; ao_key = tcp_ao_do_lookup(sk, l3index, saddr, family, -1, -1); if (ao_info->ao_required || ao_key) { if (stat_inc) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAOREQUIRED); atomic64_inc(&ao_info->counters.ao_required); } return true; } #endif return false; } enum skb_drop_reason tcp_inbound_hash(struct sock *sk, const struct request_sock *req, const struct sk_buff *skb, const void *saddr, const void *daddr, int family, int dif, int sdif); #endif /* _TCP_H */
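/*
 * Illustrative sketch (not part of this header): how a congestion-control
 * module might map a measured congestion fraction onto the fixed-point scale
 * expected by tcp_plb_update_state() above.  The helper name and the
 * delivered/delivered_ce counters are assumptions for the example; only
 * TCP_PLB_SCALE, struct tcp_plb_state and the tcp_plb_*() calls come from
 * the declarations above.
 */
#include <linux/math64.h>

static void example_plb_round_end(struct sock *sk, struct tcp_plb_state *plb,
				  u32 delivered, u32 delivered_ce)
{
	u32 cong_ratio;

	if (!delivered)
		return;

	/* Fraction delivered_ce/delivered mapped to 0..(1 << TCP_PLB_SCALE) */
	cong_ratio = div_u64((u64)delivered_ce << TCP_PLB_SCALE, delivered);

	tcp_plb_update_state(sk, plb, cong_ratio);	/* track congested rounds */
	tcp_plb_check_rehash(sk, plb);			/* maybe repath the flow */
}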
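/*
 * Minimal sketch of the ULP interface declared above: a module registers a
 * struct tcp_ulp_ops, and userspace selects it per socket via
 * setsockopt(IPPROTO_TCP, TCP_ULP, "demo"), which lands in tcp_set_ulp().
 * Only tcp_register_ulp()/tcp_unregister_ulp(), struct tcp_ulp_ops and
 * MODULE_ALIAS_TCP_ULP() come from this header; the "demo" ULP itself is
 * hypothetical.
 */
static int demo_ulp_init(struct sock *sk)
{
	/* Typically swaps sk->sk_prot for a wrapper ops table here. */
	return 0;
}

static void demo_ulp_release(struct sock *sk)
{
	/* Undo whatever init() did. */
}

static struct tcp_ulp_ops demo_ulp_ops __read_mostly = {
	.name		= "demo",
	.owner		= THIS_MODULE,
	.init		= demo_ulp_init,
	.release	= demo_ulp_release,
};

static int __init demo_ulp_register(void)
{
	return tcp_register_ulp(&demo_ulp_ops);
}

static void __exit demo_ulp_unregister(void)
{
	tcp_unregister_ulp(&demo_ulp_ops);
}

module_init(demo_ulp_register);
module_exit(demo_ulp_unregister);
MODULE_ALIAS_TCP_ULP("demo");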
// SPDX-License-Identifier: GPL-2.0 /* * A fast, small, non-recursive O(n log n) sort for the Linux kernel * * This performs n*log2(n) + 0.37*n + o(n) comparisons on average, * and 1.5*n*log2(n) + O(n) in the (very contrived) worst case. * * Quicksort manages n*log2(n) - 1.26*n for random inputs (1.63*n * better) at the expense of stack usage and much larger code to avoid * quicksort's O(n^2) worst case. */ #include <linux/types.h> #include <linux/export.h> #include <linux/sort.h> /** * is_aligned - is this pointer & size okay for word-wide copying? * @base: pointer to data * @size: size of each element * @align: required alignment (typically 4 or 8) * * Returns true if elements can be copied using word loads and stores. * The size must be a multiple of the alignment, and the base address must * be if we do not have CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS. * * For some reason, gcc doesn't know to optimize "if (a & mask || b & mask)" * to "if ((a | b) & mask)", so we do that by hand. */ __attribute_const__ __always_inline static bool is_aligned(const void *base, size_t size, unsigned char align) { unsigned char lsbits = (unsigned char)size; (void)base; #ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS lsbits |= (unsigned char)(uintptr_t)base; #endif return (lsbits & (align - 1)) == 0; } /** * swap_words_32 - swap two elements in 32-bit chunks * @a: pointer to the first element to swap * @b: pointer to the second element to swap * @n: element size (must be a multiple of 4) * * Exchange the two objects in memory. This exploits base+index addressing, * which basically all CPUs have, to minimize loop overhead computations. * * For some reason, on x86 gcc 7.3.0 adds a redundant test of n at the * bottom of the loop, even though the zero flag is still valid from the * subtract (since the intervening mov instructions don't alter the flags). * Gcc 8.1.0 doesn't have that problem. */ static void swap_words_32(void *a, void *b, size_t n) { do { u32 t = *(u32 *)(a + (n -= 4)); *(u32 *)(a + n) = *(u32 *)(b + n); *(u32 *)(b + n) = t; } while (n); } /** * swap_words_64 - swap two elements in 64-bit chunks * @a: pointer to the first element to swap * @b: pointer to the second element to swap * @n: element size (must be a multiple of 8) * * Exchange the two objects in memory.
This exploits base+index * addressing, which basically all CPUs have, to minimize loop overhead * computations. * * We'd like to use 64-bit loads if possible. If they're not, emulating * one requires base+index+4 addressing which x86 has but most other * processors do not. If CONFIG_64BIT, we definitely have 64-bit loads, * but it's possible to have 64-bit loads without 64-bit pointers (e.g. * x32 ABI). Are there any cases the kernel needs to worry about? */ static void swap_words_64(void *a, void *b, size_t n) { do { #ifdef CONFIG_64BIT u64 t = *(u64 *)(a + (n -= 8)); *(u64 *)(a + n) = *(u64 *)(b + n); *(u64 *)(b + n) = t; #else /* Use two 32-bit transfers to avoid base+index+4 addressing */ u32 t = *(u32 *)(a + (n -= 4)); *(u32 *)(a + n) = *(u32 *)(b + n); *(u32 *)(b + n) = t; t = *(u32 *)(a + (n -= 4)); *(u32 *)(a + n) = *(u32 *)(b + n); *(u32 *)(b + n) = t; #endif } while (n); } /** * swap_bytes - swap two elements a byte at a time * @a: pointer to the first element to swap * @b: pointer to the second element to swap * @n: element size * * This is the fallback if alignment doesn't allow using larger chunks. */ static void swap_bytes(void *a, void *b, size_t n) { do { char t = ((char *)a)[--n]; ((char *)a)[n] = ((char *)b)[n]; ((char *)b)[n] = t; } while (n); } /* * The values are arbitrary as long as they can't be confused with * a pointer, but small integers make for the smallest compare * instructions. */ #define SWAP_WORDS_64 (swap_r_func_t)0 #define SWAP_WORDS_32 (swap_r_func_t)1 #define SWAP_BYTES (swap_r_func_t)2 #define SWAP_WRAPPER (swap_r_func_t)3 struct wrapper { cmp_func_t cmp; swap_func_t swap; }; /* * The function pointer is last to make tail calls most efficient if the * compiler decides not to inline this function. */ static void do_swap(void *a, void *b, size_t size, swap_r_func_t swap_func, const void *priv) { if (swap_func == SWAP_WRAPPER) { ((const struct wrapper *)priv)->swap(a, b, (int)size); return; } if (swap_func == SWAP_WORDS_64) swap_words_64(a, b, size); else if (swap_func == SWAP_WORDS_32) swap_words_32(a, b, size); else if (swap_func == SWAP_BYTES) swap_bytes(a, b, size); else swap_func(a, b, (int)size, priv); } #define _CMP_WRAPPER ((cmp_r_func_t)0L) static int do_cmp(const void *a, const void *b, cmp_r_func_t cmp, const void *priv) { if (cmp == _CMP_WRAPPER) return ((const struct wrapper *)priv)->cmp(a, b); return cmp(a, b, priv); } /** * parent - given the offset of the child, find the offset of the parent. * @i: the offset of the heap element whose parent is sought. Non-zero. * @lsbit: a precomputed 1-bit mask, equal to "size & -size" * @size: size of each element * * In terms of array indexes, the parent of element j = @i/@size is simply * (j-1)/2. But when working in byte offsets, we can't use implicit * truncation of integer divides. * * Fortunately, we only need one bit of the quotient, not the full divide. * @size has a least significant bit. That bit will be clear if @i is * an even multiple of @size, and set if it's an odd multiple. * * Logically, we're doing "if (i & lsbit) i -= size;", but since the * branch is unpredictable, it's done with a bit of clever branch-free * code instead. 
*/ __attribute_const__ __always_inline static size_t parent(size_t i, unsigned int lsbit, size_t size) { i -= size; i -= size & -(i & lsbit); return i / 2; } /** * sort_r - sort an array of elements * @base: pointer to data to sort * @num: number of elements * @size: size of each element * @cmp_func: pointer to comparison function * @swap_func: pointer to swap function or NULL * @priv: third argument passed to comparison function * * This function does a heapsort on the given array. You may provide * a swap_func function if you need to do something more than a memory * copy (e.g. fix up pointers or auxiliary data), but the built-in swap * avoids a slow retpoline and so is significantly faster. * * Sorting time is O(n log n) both on average and worst-case. While * quicksort is slightly faster on average, it suffers from exploitable * O(n*n) worst-case behavior and extra memory requirements that make * it less suitable for kernel use. */ void sort_r(void *base, size_t num, size_t size, cmp_r_func_t cmp_func, swap_r_func_t swap_func, const void *priv) { /* pre-scale counters for performance */ size_t n = num * size, a = (num/2) * size; const unsigned int lsbit = size & -size; /* Used to find parent */ size_t shift = 0; if (!a) /* num < 2 || size == 0 */ return; /* called from 'sort' without swap function, let's pick the default */ if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap) swap_func = NULL; if (!swap_func) { if (is_aligned(base, size, 8)) swap_func = SWAP_WORDS_64; else if (is_aligned(base, size, 4)) swap_func = SWAP_WORDS_32; else swap_func = SWAP_BYTES; } /* * Loop invariants: * 1. elements [a,n) satisfy the heap property (compare greater than * all of their children), * 2. elements [n,num*size) are sorted, and * 3. a <= b <= c <= d <= n (whenever they are valid). */ for (;;) { size_t b, c, d; if (a) /* Building heap: sift down a */ a -= size << shift; else if (n > 3 * size) { /* Sorting: Extract two largest elements */ n -= size; do_swap(base, base + n, size, swap_func, priv); shift = do_cmp(base + size, base + 2 * size, cmp_func, priv) <= 0; a = size << shift; n -= size; do_swap(base + a, base + n, size, swap_func, priv); } else { /* Sort complete */ break; } /* * Sift element at "a" down into heap. This is the * "bottom-up" variant, which significantly reduces * calls to cmp_func(): we find the sift-down path all * the way to the leaves (one compare per level), then * backtrack to find where to insert the target element. * * Because elements tend to sift down close to the leaves, * this uses fewer compares than doing two per level * on the way down. (A bit more than half as many on * average, 3/4 worst-case.) */ for (b = a; c = 2*b + size, (d = c + size) < n;) b = do_cmp(base + c, base + d, cmp_func, priv) > 0 ? 
c : d; if (d == n) /* Special case last leaf with no sibling */ b = c; /* Now backtrack from "b" to the correct location for "a" */ while (b != a && do_cmp(base + a, base + b, cmp_func, priv) >= 0) b = parent(b, lsbit, size); c = b; /* Where "a" belongs */ while (b != a) { /* Shift it into place */ b = parent(b, lsbit, size); do_swap(base + b, base + c, size, swap_func, priv); } } n -= size; do_swap(base, base + n, size, swap_func, priv); if (n == size * 2 && do_cmp(base, base + size, cmp_func, priv) > 0) do_swap(base, base + size, size, swap_func, priv); } EXPORT_SYMBOL(sort_r); void sort(void *base, size_t num, size_t size, cmp_func_t cmp_func, swap_func_t swap_func) { struct wrapper w = { .cmp = cmp_func, .swap = swap_func, }; return sort_r(base, num, size, _CMP_WRAPPER, SWAP_WRAPPER, &w); } EXPORT_SYMBOL(sort);
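/*
 * Typical in-kernel use of the sort() exported above: sorting a small array
 * with a plain comparison callback.  The element type and comparison
 * function are illustrative; only sort() and the cmp_func_t convention
 * (negative / zero / positive, as with memcmp) come from this file.
 */
#include <linux/sort.h>
#include <linux/types.h>

struct demo_entry {
	u32 key;
	void *payload;
};

static int demo_entry_cmp(const void *a, const void *b)
{
	const struct demo_entry *ea = a, *eb = b;

	/* Avoid "ea->key - eb->key": it can wrap for u32 keys. */
	if (ea->key < eb->key)
		return -1;
	if (ea->key > eb->key)
		return 1;
	return 0;
}

static void demo_sort_entries(struct demo_entry *entries, size_t nr)
{
	/*
	 * A NULL swap_func lets sort_r() pick the built-in word-wide swap
	 * (SWAP_WORDS_64/32/BYTES above) based on is_aligned().
	 */
	sort(entries, nr, sizeof(*entries), demo_entry_cmp, NULL);
}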
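/*
 * Standalone demonstration (plain userspace C, not kernel code) of the
 * branch-free step used in parent() above: for byte offsets i that are
 * multiples of size, "i -= size & -(i & lsbit)" subtracts size exactly when
 * i/size is odd, i.e. it is equivalent to "if (i & lsbit) i -= size".
 */
#include <assert.h>
#include <stddef.h>
#include <stdio.h>

int main(void)
{
	size_t size = 24;			/* any element size */
	size_t lsbit = size & -size;		/* lowest set bit of size */
	size_t j;

	for (j = 0; j < 1000; j++) {
		size_t i = j * size;
		size_t branchy = i, branchless = i;

		if (branchy & lsbit)		/* odd multiple of size */
			branchy -= size;
		branchless -= size & -(branchless & lsbit);

		assert(branchy == branchless);
	}
	printf("branch-free parent step matches for size=%zu\n", size);
	return 0;
}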
// SPDX-License-Identifier: GPL-2.0 /* * HugeTLB Vmemmap Optimization (HVO) * * Copyright (c) 2020, ByteDance. All rights reserved. * * Author: Muchun Song <songmuchun@bytedance.com> */ #ifndef _LINUX_HUGETLB_VMEMMAP_H #define _LINUX_HUGETLB_VMEMMAP_H #include <linux/hugetlb.h> /* * Reserve one vmemmap page, all vmemmap addresses are mapped to it. See * Documentation/mm/vmemmap_dedup.rst. */ #define HUGETLB_VMEMMAP_RESERVE_SIZE PAGE_SIZE #define HUGETLB_VMEMMAP_RESERVE_PAGES (HUGETLB_VMEMMAP_RESERVE_SIZE / sizeof(struct page)) #ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP int hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio); long hugetlb_vmemmap_restore_folios(const struct hstate *h, struct list_head *folio_list, struct list_head *non_hvo_folios); void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio); void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list); static inline unsigned int hugetlb_vmemmap_size(const struct hstate *h) { return pages_per_huge_page(h) * sizeof(struct page); } /* * Return how much of the vmemmap associated with a HugeTLB page can be * optimized away and freed back to the buddy allocator. */ static inline unsigned int hugetlb_vmemmap_optimizable_size(const struct hstate *h) { int size = hugetlb_vmemmap_size(h) - HUGETLB_VMEMMAP_RESERVE_SIZE; if (!is_power_of_2(sizeof(struct page))) return 0; return size > 0 ? size : 0; } #else static inline int hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio) { return 0; } static long hugetlb_vmemmap_restore_folios(const struct hstate *h, struct list_head *folio_list, struct list_head *non_hvo_folios) { list_splice_init(folio_list, non_hvo_folios); return 0; } static inline void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio) { } static inline void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list) { } static inline unsigned int hugetlb_vmemmap_optimizable_size(const struct hstate *h) { return 0; } #endif /* CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP */ static inline bool hugetlb_vmemmap_optimizable(const struct hstate *h) { return hugetlb_vmemmap_optimizable_size(h) != 0; } #endif /* _LINUX_HUGETLB_VMEMMAP_H */
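/*
 * Worked example (plain userspace C, with assumed x86-64-like constants) of
 * what hugetlb_vmemmap_optimizable_size() above computes: for a 2 MiB huge
 * page built from 4 KiB base pages with a 64-byte struct page, the vmemmap
 * covering the huge page is 512 * 64 = 32 KiB; HVO keeps one PAGE_SIZE of it
 * mapped (HUGETLB_VMEMMAP_RESERVE_SIZE), so 28 KiB (7 pages) can go back to
 * the buddy allocator.  The sizes below are assumptions for the example.
 */
#include <stdio.h>

int main(void)
{
	const unsigned int page_size = 4096;		/* assumed PAGE_SIZE */
	const unsigned int struct_page_size = 64;	/* assumed sizeof(struct page) */
	const unsigned int pages_per_huge_page = 512;	/* 2 MiB / 4 KiB */

	unsigned int vmemmap = pages_per_huge_page * struct_page_size;
	unsigned int optimizable = vmemmap - page_size;	/* one page stays mapped */

	printf("vmemmap per huge page: %u bytes, optimizable: %u bytes (%u pages)\n",
	       vmemmap, optimizable, optimizable / page_size);
	return 0;
}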
// SPDX-License-Identifier: GPL-2.0-only /* * (C) 2007 Patrick McHardy <kaber@trash.net> */ #include <linux/module.h> #include <linux/skbuff.h> #include <linux/gen_stats.h> #include <linux/jhash.h> #include <linux/rtnetlink.h> #include <linux/random.h> #include <linux/slab.h> #include <net/gen_stats.h> #include <net/netlink.h> #include <net/netns/generic.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_RATEEST.h> #include <net/netfilter/xt_rateest.h> #define RATEEST_HSIZE 16 struct xt_rateest_net { struct mutex hash_lock; struct hlist_head hash[RATEEST_HSIZE]; }; static unsigned int xt_rateest_id; static unsigned int jhash_rnd __read_mostly; static unsigned int xt_rateest_hash(const char *name) { return jhash(name, sizeof_field(struct xt_rateest, name), jhash_rnd) & (RATEEST_HSIZE - 1); } static void xt_rateest_hash_insert(struct xt_rateest_net *xn, struct xt_rateest *est) { unsigned int h; h = xt_rateest_hash(est->name); hlist_add_head(&est->list, &xn->hash[h]); } static struct xt_rateest *__xt_rateest_lookup(struct xt_rateest_net *xn, const char *name) { struct xt_rateest *est; unsigned int h; h = xt_rateest_hash(name); hlist_for_each_entry(est, &xn->hash[h], list) { if (strcmp(est->name, name) == 0) { est->refcnt++; return est; } } return NULL; } struct xt_rateest *xt_rateest_lookup(struct net *net, const char *name) { struct xt_rateest_net *xn = net_generic(net, xt_rateest_id); struct xt_rateest *est; mutex_lock(&xn->hash_lock); est = __xt_rateest_lookup(xn, name); mutex_unlock(&xn->hash_lock); return est; } EXPORT_SYMBOL_GPL(xt_rateest_lookup); void xt_rateest_put(struct net *net, struct xt_rateest *est) { struct xt_rateest_net *xn = net_generic(net, xt_rateest_id); mutex_lock(&xn->hash_lock); if (--est->refcnt == 0) { hlist_del(&est->list); gen_kill_estimator(&est->rate_est); /* * gen_estimator est_timer() might access est->lock or bstats, * wait a RCU grace period before freeing 'est' */ kfree_rcu(est, rcu); } mutex_unlock(&xn->hash_lock); } EXPORT_SYMBOL_GPL(xt_rateest_put); static unsigned int xt_rateest_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_rateest_target_info *info = par->targinfo; struct gnet_stats_basic_sync *stats = &info->est->bstats; spin_lock_bh(&info->est->lock); u64_stats_add(&stats->bytes, skb->len); u64_stats_inc(&stats->packets); spin_unlock_bh(&info->est->lock); return XT_CONTINUE; } static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par) { struct xt_rateest_net *xn = net_generic(par->net, xt_rateest_id); struct xt_rateest_target_info *info = par->targinfo; struct xt_rateest *est; struct {
struct nlattr opt; struct gnet_estimator est; } cfg; int ret; if (strnlen(info->name, sizeof(est->name)) >= sizeof(est->name)) return -ENAMETOOLONG; net_get_random_once(&jhash_rnd, sizeof(jhash_rnd)); mutex_lock(&xn->hash_lock); est = __xt_rateest_lookup(xn, info->name); if (est) { mutex_unlock(&xn->hash_lock); /* * If estimator parameters are specified, they must match the * existing estimator. */ if ((!info->interval && !info->ewma_log) || (info->interval != est->params.interval || info->ewma_log != est->params.ewma_log)) { xt_rateest_put(par->net, est); return -EINVAL; } info->est = est; return 0; } ret = -ENOMEM; est = kzalloc(sizeof(*est), GFP_KERNEL); if (!est) goto err1; gnet_stats_basic_sync_init(&est->bstats); strscpy(est->name, info->name, sizeof(est->name)); spin_lock_init(&est->lock); est->refcnt = 1; est->params.interval = info->interval; est->params.ewma_log = info->ewma_log; cfg.opt.nla_len = nla_attr_size(sizeof(cfg.est)); cfg.opt.nla_type = TCA_STATS_RATE_EST; cfg.est.interval = info->interval; cfg.est.ewma_log = info->ewma_log; ret = gen_new_estimator(&est->bstats, NULL, &est->rate_est, &est->lock, NULL, &cfg.opt); if (ret < 0) goto err2; info->est = est; xt_rateest_hash_insert(xn, est); mutex_unlock(&xn->hash_lock); return 0; err2: kfree(est); err1: mutex_unlock(&xn->hash_lock); return ret; } static void xt_rateest_tg_destroy(const struct xt_tgdtor_param *par) { struct xt_rateest_target_info *info = par->targinfo; xt_rateest_put(par->net, info->est); } static struct xt_target xt_rateest_tg_reg[] __read_mostly = { { .name = "RATEEST", .revision = 0, .family = NFPROTO_IPV4, .target = xt_rateest_tg, .checkentry = xt_rateest_tg_checkentry, .destroy = xt_rateest_tg_destroy, .targetsize = sizeof(struct xt_rateest_target_info), .usersize = offsetof(struct xt_rateest_target_info, est), .me = THIS_MODULE, }, #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) { .name = "RATEEST", .revision = 0, .family = NFPROTO_IPV6, .target = xt_rateest_tg, .checkentry = xt_rateest_tg_checkentry, .destroy = xt_rateest_tg_destroy, .targetsize = sizeof(struct xt_rateest_target_info), .usersize = offsetof(struct xt_rateest_target_info, est), .me = THIS_MODULE, }, #endif }; static __net_init int xt_rateest_net_init(struct net *net) { struct xt_rateest_net *xn = net_generic(net, xt_rateest_id); int i; mutex_init(&xn->hash_lock); for (i = 0; i < ARRAY_SIZE(xn->hash); i++) INIT_HLIST_HEAD(&xn->hash[i]); return 0; } static struct pernet_operations xt_rateest_net_ops = { .init = xt_rateest_net_init, .id = &xt_rateest_id, .size = sizeof(struct xt_rateest_net), }; static int __init xt_rateest_tg_init(void) { int err = register_pernet_subsys(&xt_rateest_net_ops); if (err) return err; return xt_register_targets(xt_rateest_tg_reg, ARRAY_SIZE(xt_rateest_tg_reg)); } static void __exit xt_rateest_tg_fini(void) { xt_unregister_targets(xt_rateest_tg_reg, ARRAY_SIZE(xt_rateest_tg_reg)); unregister_pernet_subsys(&xt_rateest_net_ops); } MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Xtables: packet rate estimator"); MODULE_ALIAS("ipt_RATEEST"); MODULE_ALIAS("ip6t_RATEEST"); module_init(xt_rateest_tg_init); module_exit(xt_rateest_tg_fini);
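/*
 * Sketch of how another netfilter extension could consume an estimator
 * created by the RATEEST target above, using only the exported
 * xt_rateest_lookup()/xt_rateest_put() helpers (roughly what the companion
 * xt_rateest match module does).  The gen_estimator_read() call and the
 * fields read from the sample are assumptions for the example.
 */
static int demo_read_rateest(struct net *net, const char *name)
{
	struct gnet_stats_rate_est64 sample;
	struct xt_rateest *est;

	est = xt_rateest_lookup(net, name);	/* takes a reference */
	if (!est)
		return -ENOENT;

	if (gen_estimator_read(&est->rate_est, &sample))
		pr_info("%s: %llu bps, %llu pps\n", name,
			(unsigned long long)sample.bps,
			(unsigned long long)sample.pps);

	xt_rateest_put(net, est);		/* drop the reference */
	return 0;
}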
// SPDX-License-Identifier: GPL-2.0-or-later /* * IPVS: Locality-Based Least-Connection with Replication scheduler * * Authors: Wensong Zhang <wensong@gnuchina.org> * * Changes: * Julian Anastasov : Added the missing (dest->weight>0) * condition in the ip_vs_dest_set_max.
*/ /* * The lblc/r algorithm is as follows (pseudo code): * * if serverSet[dest_ip] is null then * n, serverSet[dest_ip] <- {weighted least-conn node}; * else * n <- {least-conn (alive) node in serverSet[dest_ip]}; * if (n is null) OR * (n.conns>n.weight AND * there is a node m with m.conns<m.weight/2) then * n <- {weighted least-conn node}; * add n to serverSet[dest_ip]; * if |serverSet[dest_ip]| > 1 AND * now - serverSet[dest_ip].lastMod > T then * m <- {most conn node in serverSet[dest_ip]}; * remove m from serverSet[dest_ip]; * if serverSet[dest_ip] changed then * serverSet[dest_ip].lastMod <- now; * * return n; * */ #define KMSG_COMPONENT "IPVS" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt #include <linux/ip.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/jiffies.h> #include <linux/list.h> #include <linux/slab.h> #include <linux/hash.h> /* for sysctl */ #include <linux/fs.h> #include <linux/sysctl.h> #include <net/net_namespace.h> #include <net/ip_vs.h> /* * It is for garbage collection of stale IPVS lblcr entries, * when the table is full. */ #define CHECK_EXPIRE_INTERVAL (60*HZ) #define ENTRY_TIMEOUT (6*60*HZ) #define DEFAULT_EXPIRATION (24*60*60*HZ) /* * It is for full expiration check. * When there is no partial expiration check (garbage collection) * in a half hour, do a full expiration check to collect stale * entries that haven't been touched for a day. */ #define COUNT_FOR_FULL_EXPIRATION 30 /* * for IPVS lblcr entry hash table */ #ifndef CONFIG_IP_VS_LBLCR_TAB_BITS #define CONFIG_IP_VS_LBLCR_TAB_BITS 10 #endif #define IP_VS_LBLCR_TAB_BITS CONFIG_IP_VS_LBLCR_TAB_BITS #define IP_VS_LBLCR_TAB_SIZE (1 << IP_VS_LBLCR_TAB_BITS) #define IP_VS_LBLCR_TAB_MASK (IP_VS_LBLCR_TAB_SIZE - 1) /* * IPVS destination set structure and operations */ struct ip_vs_dest_set_elem { struct list_head list; /* list link */ struct ip_vs_dest *dest; /* destination server */ struct rcu_head rcu_head; }; struct ip_vs_dest_set { atomic_t size; /* set size */ unsigned long lastmod; /* last modified time */ struct list_head list; /* destination list */ }; static void ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest, bool check) { struct ip_vs_dest_set_elem *e; if (check) { list_for_each_entry(e, &set->list, list) { if (e->dest == dest) return; } } e = kmalloc(sizeof(*e), GFP_ATOMIC); if (e == NULL) return; ip_vs_dest_hold(dest); e->dest = dest; list_add_rcu(&e->list, &set->list); atomic_inc(&set->size); set->lastmod = jiffies; } static void ip_vs_lblcr_elem_rcu_free(struct rcu_head *head) { struct ip_vs_dest_set_elem *e; e = container_of(head, struct ip_vs_dest_set_elem, rcu_head); ip_vs_dest_put_and_free(e->dest); kfree(e); } static void ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest) { struct ip_vs_dest_set_elem *e; list_for_each_entry(e, &set->list, list) { if (e->dest == dest) { /* HIT */ atomic_dec(&set->size); set->lastmod = jiffies; list_del_rcu(&e->list); call_rcu(&e->rcu_head, ip_vs_lblcr_elem_rcu_free); break; } } } static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set) { struct ip_vs_dest_set_elem *e, *ep; list_for_each_entry_safe(e, ep, &set->list, list) { list_del_rcu(&e->list); call_rcu(&e->rcu_head, ip_vs_lblcr_elem_rcu_free); } } /* get weighted least-connection node in the destination set */ static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set) { struct ip_vs_dest_set_elem *e; struct ip_vs_dest *dest, *least; int loh, doh; /* select the first destination 
server, whose weight > 0 */ list_for_each_entry_rcu(e, &set->list, list) { least = e->dest; if (least->flags & IP_VS_DEST_F_OVERLOAD) continue; if ((atomic_read(&least->weight) > 0) && (least->flags & IP_VS_DEST_F_AVAILABLE)) { loh = ip_vs_dest_conn_overhead(least); goto nextstage; } } return NULL; /* find the destination with the weighted least load */ nextstage: list_for_each_entry_continue_rcu(e, &set->list, list) { dest = e->dest; if (dest->flags & IP_VS_DEST_F_OVERLOAD) continue; doh = ip_vs_dest_conn_overhead(dest); if (((__s64)loh * atomic_read(&dest->weight) > (__s64)doh * atomic_read(&least->weight)) && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { least = dest; loh = doh; } } IP_VS_DBG_BUF(6, "%s(): server %s:%d " "activeconns %d refcnt %d weight %d overhead %d\n", __func__, IP_VS_DBG_ADDR(least->af, &least->addr), ntohs(least->port), atomic_read(&least->activeconns), refcount_read(&least->refcnt), atomic_read(&least->weight), loh); return least; } /* get weighted most-connection node in the destination set */ static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set) { struct ip_vs_dest_set_elem *e; struct ip_vs_dest *dest, *most; int moh, doh; if (set == NULL) return NULL; /* select the first destination server, whose weight > 0 */ list_for_each_entry(e, &set->list, list) { most = e->dest; if (atomic_read(&most->weight) > 0) { moh = ip_vs_dest_conn_overhead(most); goto nextstage; } } return NULL; /* find the destination with the weighted most load */ nextstage: list_for_each_entry_continue(e, &set->list, list) { dest = e->dest; doh = ip_vs_dest_conn_overhead(dest); /* moh/mw < doh/dw ==> moh*dw < doh*mw, where mw,dw>0 */ if (((__s64)moh * atomic_read(&dest->weight) < (__s64)doh * atomic_read(&most->weight)) && (atomic_read(&dest->weight) > 0)) { most = dest; moh = doh; } } IP_VS_DBG_BUF(6, "%s(): server %s:%d " "activeconns %d refcnt %d weight %d overhead %d\n", __func__, IP_VS_DBG_ADDR(most->af, &most->addr), ntohs(most->port), atomic_read(&most->activeconns), refcount_read(&most->refcnt), atomic_read(&most->weight), moh); return most; } /* * IPVS lblcr entry represents an association between destination * IP address and its destination server set */ struct ip_vs_lblcr_entry { struct hlist_node list; int af; /* address family */ union nf_inet_addr addr; /* destination IP address */ struct ip_vs_dest_set set; /* destination server set */ unsigned long lastuse; /* last used time */ struct rcu_head rcu_head; }; /* * IPVS lblcr hash table */ struct ip_vs_lblcr_table { struct rcu_head rcu_head; struct hlist_head bucket[IP_VS_LBLCR_TAB_SIZE]; /* hash bucket */ atomic_t entries; /* number of entries */ int max_size; /* maximum size of entries */ struct timer_list periodic_timer; /* collect stale entries */ struct ip_vs_service *svc; /* pointer back to service */ int rover; /* rover for expire check */ int counter; /* counter for no expire */ bool dead; }; #ifdef CONFIG_SYSCTL /* * IPVS LBLCR sysctl table */ static struct ctl_table vs_vars_table[] = { { .procname = "lblcr_expiration", .data = NULL, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, }; #endif static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en) { hlist_del_rcu(&en->list); ip_vs_dest_set_eraseall(&en->set); kfree_rcu(en, rcu_head); } /* * Returns hash value for IPVS LBLCR entry */ static inline unsigned int ip_vs_lblcr_hashkey(int af, const union nf_inet_addr *addr) { __be32 addr_fold = addr->ip; #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) addr_fold = 
addr->ip6[0]^addr->ip6[1]^ addr->ip6[2]^addr->ip6[3]; #endif return hash_32(ntohl(addr_fold), IP_VS_LBLCR_TAB_BITS); } /* * Hash an entry in the ip_vs_lblcr_table. * returns bool success. */ static void ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en) { unsigned int hash = ip_vs_lblcr_hashkey(en->af, &en->addr); hlist_add_head_rcu(&en->list, &tbl->bucket[hash]); atomic_inc(&tbl->entries); } /* Get ip_vs_lblcr_entry associated with supplied parameters. */ static inline struct ip_vs_lblcr_entry * ip_vs_lblcr_get(int af, struct ip_vs_lblcr_table *tbl, const union nf_inet_addr *addr) { unsigned int hash = ip_vs_lblcr_hashkey(af, addr); struct ip_vs_lblcr_entry *en; hlist_for_each_entry_rcu(en, &tbl->bucket[hash], list) if (ip_vs_addr_equal(af, &en->addr, addr)) return en; return NULL; } /* * Create or update an ip_vs_lblcr_entry, which is a mapping of a destination * IP address to a server. Called under spin lock. */ static inline struct ip_vs_lblcr_entry * ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl, const union nf_inet_addr *daddr, u16 af, struct ip_vs_dest *dest) { struct ip_vs_lblcr_entry *en; en = ip_vs_lblcr_get(af, tbl, daddr); if (!en) { en = kmalloc(sizeof(*en), GFP_ATOMIC); if (!en) return NULL; en->af = af; ip_vs_addr_copy(af, &en->addr, daddr); en->lastuse = jiffies; /* initialize its dest set */ atomic_set(&(en->set.size), 0); INIT_LIST_HEAD(&en->set.list); ip_vs_dest_set_insert(&en->set, dest, false); ip_vs_lblcr_hash(tbl, en); return en; } ip_vs_dest_set_insert(&en->set, dest, true); return en; } /* * Flush all the entries of the specified table. */ static void ip_vs_lblcr_flush(struct ip_vs_service *svc) { struct ip_vs_lblcr_table *tbl = svc->sched_data; int i; struct ip_vs_lblcr_entry *en; struct hlist_node *next; spin_lock_bh(&svc->sched_lock); tbl->dead = true; for (i = 0; i < IP_VS_LBLCR_TAB_SIZE; i++) { hlist_for_each_entry_safe(en, next, &tbl->bucket[i], list) { ip_vs_lblcr_free(en); } } spin_unlock_bh(&svc->sched_lock); } static int sysctl_lblcr_expiration(struct ip_vs_service *svc) { #ifdef CONFIG_SYSCTL return svc->ipvs->sysctl_lblcr_expiration; #else return DEFAULT_EXPIRATION; #endif } static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc) { struct ip_vs_lblcr_table *tbl = svc->sched_data; unsigned long now = jiffies; int i, j; struct ip_vs_lblcr_entry *en; struct hlist_node *next; for (i = 0, j = tbl->rover; i < IP_VS_LBLCR_TAB_SIZE; i++) { j = (j + 1) & IP_VS_LBLCR_TAB_MASK; spin_lock(&svc->sched_lock); hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) { if (time_after(en->lastuse + sysctl_lblcr_expiration(svc), now)) continue; ip_vs_lblcr_free(en); atomic_dec(&tbl->entries); } spin_unlock(&svc->sched_lock); } tbl->rover = j; } /* * Periodical timer handler for IPVS lblcr table * It is used to collect stale entries when the number of entries * exceeds the maximum size of the table. * * Fixme: we probably need more complicated algorithm to collect * entries that have not been used for a long time even * if the number of entries doesn't exceed the maximum size * of the table. * The full expiration check is for this purpose now. 
*/ static void ip_vs_lblcr_check_expire(struct timer_list *t) { struct ip_vs_lblcr_table *tbl = from_timer(tbl, t, periodic_timer); struct ip_vs_service *svc = tbl->svc; unsigned long now = jiffies; int goal; int i, j; struct ip_vs_lblcr_entry *en; struct hlist_node *next; if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) { /* do full expiration check */ ip_vs_lblcr_full_check(svc); tbl->counter = 1; goto out; } if (atomic_read(&tbl->entries) <= tbl->max_size) { tbl->counter++; goto out; } goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3; if (goal > tbl->max_size/2) goal = tbl->max_size/2; for (i = 0, j = tbl->rover; i < IP_VS_LBLCR_TAB_SIZE; i++) { j = (j + 1) & IP_VS_LBLCR_TAB_MASK; spin_lock(&svc->sched_lock); hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) { if (time_before(now, en->lastuse+ENTRY_TIMEOUT)) continue; ip_vs_lblcr_free(en); atomic_dec(&tbl->entries); goal--; } spin_unlock(&svc->sched_lock); if (goal <= 0) break; } tbl->rover = j; out: mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL); } static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc) { int i; struct ip_vs_lblcr_table *tbl; /* * Allocate the ip_vs_lblcr_table for this service */ tbl = kmalloc(sizeof(*tbl), GFP_KERNEL); if (tbl == NULL) return -ENOMEM; svc->sched_data = tbl; IP_VS_DBG(6, "LBLCR hash table (memory=%zdbytes) allocated for " "current service\n", sizeof(*tbl)); /* * Initialize the hash buckets */ for (i = 0; i < IP_VS_LBLCR_TAB_SIZE; i++) { INIT_HLIST_HEAD(&tbl->bucket[i]); } tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16; tbl->rover = 0; tbl->counter = 1; tbl->dead = false; tbl->svc = svc; atomic_set(&tbl->entries, 0); /* * Hook periodic timer for garbage collection */ timer_setup(&tbl->periodic_timer, ip_vs_lblcr_check_expire, 0); mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL); return 0; } static void ip_vs_lblcr_done_svc(struct ip_vs_service *svc) { struct ip_vs_lblcr_table *tbl = svc->sched_data; /* remove periodic timer */ timer_shutdown_sync(&tbl->periodic_timer); /* got to clean up table entries here */ ip_vs_lblcr_flush(svc); /* release the table itself */ kfree_rcu(tbl, rcu_head); IP_VS_DBG(6, "LBLCR hash table (memory=%zdbytes) released\n", sizeof(*tbl)); } static inline struct ip_vs_dest * __ip_vs_lblcr_schedule(struct ip_vs_service *svc) { struct ip_vs_dest *dest, *least; int loh, doh; /* * We use the following formula to estimate the load: * (dest overhead) / dest->weight * * Remember -- no floats in kernel mode!!! * The comparison of h1*w2 > h2*w1 is equivalent to that of * h1/w1 > h2/w2 * if every weight is larger than zero. * * The server with weight=0 is quiesced and will not receive any * new connection. */ list_for_each_entry_rcu(dest, &svc->destinations, n_list) { if (dest->flags & IP_VS_DEST_F_OVERLOAD) continue; if (atomic_read(&dest->weight) > 0) { least = dest; loh = ip_vs_dest_conn_overhead(least); goto nextstage; } } return NULL; /* * Find the destination with the least load. 
*/ nextstage: list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) { if (dest->flags & IP_VS_DEST_F_OVERLOAD) continue; doh = ip_vs_dest_conn_overhead(dest); if ((__s64)loh * atomic_read(&dest->weight) > (__s64)doh * atomic_read(&least->weight)) { least = dest; loh = doh; } } IP_VS_DBG_BUF(6, "LBLCR: server %s:%d " "activeconns %d refcnt %d weight %d overhead %d\n", IP_VS_DBG_ADDR(least->af, &least->addr), ntohs(least->port), atomic_read(&least->activeconns), refcount_read(&least->refcnt), atomic_read(&least->weight), loh); return least; } /* * If this destination server is overloaded and there is a less loaded * server, then return true. */ static inline int is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc) { if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) { struct ip_vs_dest *d; list_for_each_entry_rcu(d, &svc->destinations, n_list) { if (atomic_read(&d->activeconns)*2 < atomic_read(&d->weight)) { return 1; } } } return 0; } /* * Locality-Based (weighted) Least-Connection scheduling */ static struct ip_vs_dest * ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, struct ip_vs_iphdr *iph) { struct ip_vs_lblcr_table *tbl = svc->sched_data; struct ip_vs_dest *dest; struct ip_vs_lblcr_entry *en; IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); /* First look in our cache */ en = ip_vs_lblcr_get(svc->af, tbl, &iph->daddr); if (en) { en->lastuse = jiffies; /* Get the least loaded destination */ dest = ip_vs_dest_set_min(&en->set); /* More than one destination + enough time passed by, cleanup */ if (atomic_read(&en->set.size) > 1 && time_after(jiffies, en->set.lastmod + sysctl_lblcr_expiration(svc))) { spin_lock_bh(&svc->sched_lock); if (atomic_read(&en->set.size) > 1) { struct ip_vs_dest *m; m = ip_vs_dest_set_max(&en->set); if (m) ip_vs_dest_set_erase(&en->set, m); } spin_unlock_bh(&svc->sched_lock); } /* If the destination is not overloaded, use it */ if (dest && !is_overloaded(dest, svc)) goto out; /* The cache entry is invalid, time to schedule */ dest = __ip_vs_lblcr_schedule(svc); if (!dest) { ip_vs_scheduler_err(svc, "no destination available"); return NULL; } /* Update our cache entry */ spin_lock_bh(&svc->sched_lock); if (!tbl->dead) ip_vs_dest_set_insert(&en->set, dest, true); spin_unlock_bh(&svc->sched_lock); goto out; } /* No cache entry, time to schedule */ dest = __ip_vs_lblcr_schedule(svc); if (!dest) { IP_VS_DBG(1, "no destination available\n"); return NULL; } /* If we fail to create a cache entry, we'll just use the valid dest */ spin_lock_bh(&svc->sched_lock); if (!tbl->dead) ip_vs_lblcr_new(tbl, &iph->daddr, svc->af, dest); spin_unlock_bh(&svc->sched_lock); out: IP_VS_DBG_BUF(6, "LBLCR: destination IP address %s --> server %s:%d\n", IP_VS_DBG_ADDR(svc->af, &iph->daddr), IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port)); return dest; } /* * IPVS LBLCR Scheduler structure */ static struct ip_vs_scheduler ip_vs_lblcr_scheduler = { .name = "lblcr", .refcnt = ATOMIC_INIT(0), .module = THIS_MODULE, .n_list = LIST_HEAD_INIT(ip_vs_lblcr_scheduler.n_list), .init_service = ip_vs_lblcr_init_svc, .done_service = ip_vs_lblcr_done_svc, .schedule = ip_vs_lblcr_schedule, }; /* * per netns init. 
*/ #ifdef CONFIG_SYSCTL static int __net_init __ip_vs_lblcr_init(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); size_t vars_table_size = ARRAY_SIZE(vs_vars_table); if (!ipvs) return -ENOENT; if (!net_eq(net, &init_net)) { ipvs->lblcr_ctl_table = kmemdup(vs_vars_table, sizeof(vs_vars_table), GFP_KERNEL); if (ipvs->lblcr_ctl_table == NULL) return -ENOMEM; /* Don't export sysctls to unprivileged users */ if (net->user_ns != &init_user_ns) vars_table_size = 0; } else ipvs->lblcr_ctl_table = vs_vars_table; ipvs->sysctl_lblcr_expiration = DEFAULT_EXPIRATION; ipvs->lblcr_ctl_table[0].data = &ipvs->sysctl_lblcr_expiration; ipvs->lblcr_ctl_header = register_net_sysctl_sz(net, "net/ipv4/vs", ipvs->lblcr_ctl_table, vars_table_size); if (!ipvs->lblcr_ctl_header) { if (!net_eq(net, &init_net)) kfree(ipvs->lblcr_ctl_table); return -ENOMEM; } return 0; } static void __net_exit __ip_vs_lblcr_exit(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); unregister_net_sysctl_table(ipvs->lblcr_ctl_header); if (!net_eq(net, &init_net)) kfree(ipvs->lblcr_ctl_table); } #else static int __net_init __ip_vs_lblcr_init(struct net *net) { return 0; } static void __net_exit __ip_vs_lblcr_exit(struct net *net) { } #endif static struct pernet_operations ip_vs_lblcr_ops = { .init = __ip_vs_lblcr_init, .exit = __ip_vs_lblcr_exit, }; static int __init ip_vs_lblcr_init(void) { int ret; ret = register_pernet_subsys(&ip_vs_lblcr_ops); if (ret) return ret; ret = register_ip_vs_scheduler(&ip_vs_lblcr_scheduler); if (ret) unregister_pernet_subsys(&ip_vs_lblcr_ops); return ret; } static void __exit ip_vs_lblcr_cleanup(void) { unregister_ip_vs_scheduler(&ip_vs_lblcr_scheduler); unregister_pernet_subsys(&ip_vs_lblcr_ops); rcu_barrier(); } module_init(ip_vs_lblcr_init); module_exit(ip_vs_lblcr_cleanup); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("ipvs locality-based least-connection with replication scheduler");
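/*
 * Standalone illustration (plain userspace C) of the "no floats in kernel
 * mode" trick used by __ip_vs_lblcr_schedule() and ip_vs_dest_set_min()/max()
 * above: instead of comparing overhead/weight ratios, compare the 64-bit
 * cross products, which is equivalent as long as every weight is positive.
 * The demo_dest type and sample values are made up for the example; the real
 * code additionally skips overloaded and zero-weight destinations.
 */
#include <stdint.h>
#include <stdio.h>

struct demo_dest {
	const char *name;
	int overhead;	/* e.g. connection-count based load */
	int weight;	/* configured weight, > 0 */
};

static const struct demo_dest *pick_least(const struct demo_dest *d, int n)
{
	const struct demo_dest *least = &d[0];
	int i;

	for (i = 1; i < n; i++) {
		/* least->overhead/least->weight > d[i].overhead/d[i].weight ? */
		if ((int64_t)least->overhead * d[i].weight >
		    (int64_t)d[i].overhead * least->weight)
			least = &d[i];
	}
	return least;
}

int main(void)
{
	struct demo_dest dests[] = {
		{ "rs1", 300, 100 },	/* load 3.0  */
		{ "rs2", 500, 200 },	/* load 2.5  <-- least */
		{ "rs3", 280,  90 },	/* load ~3.1 */
	};

	printf("least loaded: %s\n", pick_least(dests, 3)->name);
	return 0;
}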
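/*
 * Standalone sketch (plain userspace C) of ip_vs_lblcr_hashkey() above for
 * IPv6: the four 32-bit words of the destination address are XOR-folded and
 * the result is hashed into one of the IP_VS_LBLCR_TAB_SIZE buckets.  The
 * hash_32() reimplementation assumes the generic multiplicative hash from
 * include/linux/hash.h (val * GOLDEN_RATIO_32 >> (32 - bits)).
 */
#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define TAB_BITS 10	/* CONFIG_IP_VS_LBLCR_TAB_BITS default */

static uint32_t demo_hash_32(uint32_t val, unsigned int bits)
{
	return (val * 0x61C88647u) >> (32 - bits);	/* assumed GOLDEN_RATIO_32 */
}

int main(void)
{
	struct in6_addr a;
	uint32_t w[4], fold;

	inet_pton(AF_INET6, "2001:db8::1", &a);
	memcpy(w, &a, sizeof(w));
	/* same fold as addr->ip6[0] ^ addr->ip6[1] ^ addr->ip6[2] ^ addr->ip6[3] */
	fold = w[0] ^ w[1] ^ w[2] ^ w[3];

	printf("bucket %u of %u\n",
	       demo_hash_32(ntohl(fold), TAB_BITS), 1u << TAB_BITS);
	return 0;
}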
// SPDX-License-Identifier: GPL-2.0-or-later /* * Linux Socket Filter - Kernel level socket filtering * * Based on the design of the Berkeley Packet Filter. The new * internal format has been designed by PLUMgrid: * * Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com * * Authors: * * Jay Schulist <jschlst@samba.org> * Alexei Starovoitov <ast@plumgrid.com> * Daniel Borkmann <dborkman@redhat.com> * * Andi Kleen - Fix a few bad bugs and races. * Kris Katterjohn - Added many additional checks in bpf_check_classic() */ #include <uapi/linux/btf.h> #include <linux/filter.h> #include <linux/skbuff.h> #include <linux/vmalloc.h> #include <linux/prandom.h> #include <linux/bpf.h> #include <linux/btf.h> #include <linux/objtool.h> #include <linux/overflow.h> #include <linux/rbtree_latch.h> #include <linux/kallsyms.h> #include <linux/rcupdate.h> #include <linux/perf_event.h> #include <linux/extable.h> #include <linux/log2.h> #include <linux/bpf_verifier.h> #include <linux/nodemask.h> #include <linux/nospec.h> #include <linux/bpf_mem_alloc.h> #include <linux/memcontrol.h> #include <linux/execmem.h> #include <asm/barrier.h> #include <linux/unaligned.h> /* Registers */ #define BPF_R0 regs[BPF_REG_0] #define BPF_R1 regs[BPF_REG_1] #define BPF_R2 regs[BPF_REG_2] #define BPF_R3 regs[BPF_REG_3] #define BPF_R4 regs[BPF_REG_4] #define BPF_R5 regs[BPF_REG_5] #define BPF_R6 regs[BPF_REG_6] #define BPF_R7 regs[BPF_REG_7] #define BPF_R8 regs[BPF_REG_8] #define BPF_R9 regs[BPF_REG_9] #define BPF_R10 regs[BPF_REG_10] /* Named registers */ #define DST regs[insn->dst_reg] #define SRC regs[insn->src_reg] #define FP regs[BPF_REG_FP] #define AX regs[BPF_REG_AX] #define ARG1 regs[BPF_REG_ARG1] #define CTX regs[BPF_REG_CTX] #define OFF insn->off #define IMM insn->imm struct bpf_mem_alloc bpf_global_ma; bool bpf_global_ma_set; /* No hurry in this branch * * Exported for the bpf jit load helper.
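 *
 * For illustration only (not a real caller in this file): with the classic
 * BPF negative-offset convention, a lookup such as
 *
 *	bpf_internal_load_pointer_neg_helper(skb, SKF_NET_OFF + 12, 4)
 *
 * would return a pointer to the four bytes at offset 12 into the network
 * header (the IPv4 source address), or NULL if that range does not lie
 * within the linear skb data.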
*/ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, unsigned int size) { u8 *ptr = NULL; if (k >= SKF_NET_OFF) { ptr = skb_network_header(skb) + k - SKF_NET_OFF; } else if (k >= SKF_LL_OFF) { if (unlikely(!skb_mac_header_was_set(skb))) return NULL; ptr = skb_mac_header(skb) + k - SKF_LL_OFF; } if (ptr >= skb->head && ptr + size <= skb_tail_pointer(skb)) return ptr; return NULL; } /* tell bpf programs that include vmlinux.h kernel's PAGE_SIZE */ enum page_size_enum { __PAGE_SIZE = PAGE_SIZE }; struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags) { gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags); struct bpf_prog_aux *aux; struct bpf_prog *fp; size = round_up(size, __PAGE_SIZE); fp = __vmalloc(size, gfp_flags); if (fp == NULL) return NULL; aux = kzalloc(sizeof(*aux), bpf_memcg_flags(GFP_KERNEL | gfp_extra_flags)); if (aux == NULL) { vfree(fp); return NULL; } fp->active = alloc_percpu_gfp(int, bpf_memcg_flags(GFP_KERNEL | gfp_extra_flags)); if (!fp->active) { vfree(fp); kfree(aux); return NULL; } fp->pages = size / PAGE_SIZE; fp->aux = aux; fp->aux->prog = fp; fp->jit_requested = ebpf_jit_enabled(); fp->blinding_requested = bpf_jit_blinding_enabled(fp); #ifdef CONFIG_CGROUP_BPF aux->cgroup_atype = CGROUP_BPF_ATTACH_TYPE_INVALID; #endif INIT_LIST_HEAD_RCU(&fp->aux->ksym.lnode); #ifdef CONFIG_FINEIBT INIT_LIST_HEAD_RCU(&fp->aux->ksym_prefix.lnode); #endif mutex_init(&fp->aux->used_maps_mutex); mutex_init(&fp->aux->ext_mutex); mutex_init(&fp->aux->dst_mutex); return fp; } struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) { gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags); struct bpf_prog *prog; int cpu; prog = bpf_prog_alloc_no_stats(size, gfp_extra_flags); if (!prog) return NULL; prog->stats = alloc_percpu_gfp(struct bpf_prog_stats, gfp_flags); if (!prog->stats) { free_percpu(prog->active); kfree(prog->aux); vfree(prog); return NULL; } for_each_possible_cpu(cpu) { struct bpf_prog_stats *pstats; pstats = per_cpu_ptr(prog->stats, cpu); u64_stats_init(&pstats->syncp); } return prog; } EXPORT_SYMBOL_GPL(bpf_prog_alloc); int bpf_prog_alloc_jited_linfo(struct bpf_prog *prog) { if (!prog->aux->nr_linfo || !prog->jit_requested) return 0; prog->aux->jited_linfo = kvcalloc(prog->aux->nr_linfo, sizeof(*prog->aux->jited_linfo), bpf_memcg_flags(GFP_KERNEL | __GFP_NOWARN)); if (!prog->aux->jited_linfo) return -ENOMEM; return 0; } void bpf_prog_jit_attempt_done(struct bpf_prog *prog) { if (prog->aux->jited_linfo && (!prog->jited || !prog->aux->jited_linfo[0])) { kvfree(prog->aux->jited_linfo); prog->aux->jited_linfo = NULL; } kfree(prog->aux->kfunc_tab); prog->aux->kfunc_tab = NULL; } /* The jit engine is responsible to provide an array * for insn_off to the jited_off mapping (insn_to_jit_off). * * The idx to this array is the insn_off. Hence, the insn_off * here is relative to the prog itself instead of the main prog. * This array has one entry for each xlated bpf insn. * * jited_off is the byte off to the end of the jited insn. * * Hence, with * insn_start: * The first bpf insn off of the prog. The insn off * here is relative to the main prog. * e.g. 
if prog is a subprog, insn_start > 0 * linfo_idx: * The prog's idx to prog->aux->linfo and jited_linfo * * jited_linfo[linfo_idx] = prog->bpf_func * * For i > linfo_idx, * * jited_linfo[i] = prog->bpf_func + * insn_to_jit_off[linfo[i].insn_off - insn_start - 1] */ void bpf_prog_fill_jited_linfo(struct bpf_prog *prog, const u32 *insn_to_jit_off) { u32 linfo_idx, insn_start, insn_end, nr_linfo, i; const struct bpf_line_info *linfo; void **jited_linfo; if (!prog->aux->jited_linfo || prog->aux->func_idx > prog->aux->func_cnt) /* Userspace did not provide linfo */ return; linfo_idx = prog->aux->linfo_idx; linfo = &prog->aux->linfo[linfo_idx]; insn_start = linfo[0].insn_off; insn_end = insn_start + prog->len; jited_linfo = &prog->aux->jited_linfo[linfo_idx]; jited_linfo[0] = prog->bpf_func; nr_linfo = prog->aux->nr_linfo - linfo_idx; for (i = 1; i < nr_linfo && linfo[i].insn_off < insn_end; i++) /* The verifier ensures that linfo[i].insn_off is * strictly increasing */ jited_linfo[i] = prog->bpf_func + insn_to_jit_off[linfo[i].insn_off - insn_start - 1]; } struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, gfp_t gfp_extra_flags) { gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags); struct bpf_prog *fp; u32 pages; size = round_up(size, PAGE_SIZE); pages = size / PAGE_SIZE; if (pages <= fp_old->pages) return fp_old; fp = __vmalloc(size, gfp_flags); if (fp) { memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE); fp->pages = pages; fp->aux->prog = fp; /* We keep fp->aux from fp_old around in the new * reallocated structure. */ fp_old->aux = NULL; fp_old->stats = NULL; fp_old->active = NULL; __bpf_prog_free(fp_old); } return fp; } void __bpf_prog_free(struct bpf_prog *fp) { if (fp->aux) { mutex_destroy(&fp->aux->used_maps_mutex); mutex_destroy(&fp->aux->dst_mutex); kfree(fp->aux->poke_tab); kfree(fp->aux); } free_percpu(fp->stats); free_percpu(fp->active); vfree(fp); } int bpf_prog_calc_tag(struct bpf_prog *fp) { const u32 bits_offset = SHA1_BLOCK_SIZE - sizeof(__be64); u32 raw_size = bpf_prog_tag_scratch_size(fp); u32 digest[SHA1_DIGEST_WORDS]; u32 ws[SHA1_WORKSPACE_WORDS]; u32 i, bsize, psize, blocks; struct bpf_insn *dst; bool was_ld_map; u8 *raw, *todo; __be32 *result; __be64 *bits; raw = vmalloc(raw_size); if (!raw) return -ENOMEM; sha1_init(digest); memset(ws, 0, sizeof(ws)); /* We need to take out the map fd for the digest calculation * since they are unstable from user space side. 
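 *
 * For example, loading the same map through two different file descriptors
 * (say fd 5 in one ld_imm64 and fd 7 in another; the numbers are purely
 * illustrative) must not change the program tag, so the imm field of every
 * BPF_PSEUDO_MAP_FD/BPF_PSEUDO_MAP_VALUE ld_imm64 and of its second half
 * is zeroed before hashing.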
*/ dst = (void *)raw; for (i = 0, was_ld_map = false; i < fp->len; i++) { dst[i] = fp->insnsi[i]; if (!was_ld_map && dst[i].code == (BPF_LD | BPF_IMM | BPF_DW) && (dst[i].src_reg == BPF_PSEUDO_MAP_FD || dst[i].src_reg == BPF_PSEUDO_MAP_VALUE)) { was_ld_map = true; dst[i].imm = 0; } else if (was_ld_map && dst[i].code == 0 && dst[i].dst_reg == 0 && dst[i].src_reg == 0 && dst[i].off == 0) { was_ld_map = false; dst[i].imm = 0; } else { was_ld_map = false; } } psize = bpf_prog_insn_size(fp); memset(&raw[psize], 0, raw_size - psize); raw[psize++] = 0x80; bsize = round_up(psize, SHA1_BLOCK_SIZE); blocks = bsize / SHA1_BLOCK_SIZE; todo = raw; if (bsize - psize >= sizeof(__be64)) { bits = (__be64 *)(todo + bsize - sizeof(__be64)); } else { bits = (__be64 *)(todo + bsize + bits_offset); blocks++; } *bits = cpu_to_be64((psize - 1) << 3); while (blocks--) { sha1_transform(digest, todo, ws); todo += SHA1_BLOCK_SIZE; } result = (__force __be32 *)digest; for (i = 0; i < SHA1_DIGEST_WORDS; i++) result[i] = cpu_to_be32(digest[i]); memcpy(fp->tag, result, sizeof(fp->tag)); vfree(raw); return 0; } static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, s32 end_old, s32 end_new, s32 curr, const bool probe_pass) { const s64 imm_min = S32_MIN, imm_max = S32_MAX; s32 delta = end_new - end_old; s64 imm = insn->imm; if (curr < pos && curr + imm + 1 >= end_old) imm += delta; else if (curr >= end_new && curr + imm + 1 < end_new) imm -= delta; if (imm < imm_min || imm > imm_max) return -ERANGE; if (!probe_pass) insn->imm = imm; return 0; } static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, s32 end_old, s32 end_new, s32 curr, const bool probe_pass) { s64 off_min, off_max, off; s32 delta = end_new - end_old; if (insn->code == (BPF_JMP32 | BPF_JA)) { off = insn->imm; off_min = S32_MIN; off_max = S32_MAX; } else { off = insn->off; off_min = S16_MIN; off_max = S16_MAX; } if (curr < pos && curr + off + 1 >= end_old) off += delta; else if (curr >= end_new && curr + off + 1 < end_new) off -= delta; if (off < off_min || off > off_max) return -ERANGE; if (!probe_pass) { if (insn->code == (BPF_JMP32 | BPF_JA)) insn->imm = off; else insn->off = off; } return 0; } static int bpf_adj_branches(struct bpf_prog *prog, u32 pos, s32 end_old, s32 end_new, const bool probe_pass) { u32 i, insn_cnt = prog->len + (probe_pass ? end_new - end_old : 0); struct bpf_insn *insn = prog->insnsi; int ret = 0; for (i = 0; i < insn_cnt; i++, insn++) { u8 code; /* In the probing pass we still operate on the original, * unpatched image in order to check overflows before we * do any other adjustments. Therefore skip the patchlet. */ if (probe_pass && i == pos) { i = end_new; insn = prog->insnsi + end_old; } if (bpf_pseudo_func(insn)) { ret = bpf_adj_delta_to_imm(insn, pos, end_old, end_new, i, probe_pass); if (ret) return ret; continue; } code = insn->code; if ((BPF_CLASS(code) != BPF_JMP && BPF_CLASS(code) != BPF_JMP32) || BPF_OP(code) == BPF_EXIT) continue; /* Adjust offset of jmps if we cross patch boundaries. 
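 *
 * Rough worked example: if the single insn at index 5 is replaced by three
 * insns (end_old = 6, end_new = 8, delta = 2), a branch at index 2 with
 * off = 4 originally targets index 7; that target slides to index 9 after
 * patching, so the branch is rewritten with off = 6.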
*/ if (BPF_OP(code) == BPF_CALL) { if (insn->src_reg != BPF_PSEUDO_CALL) continue; ret = bpf_adj_delta_to_imm(insn, pos, end_old, end_new, i, probe_pass); } else { ret = bpf_adj_delta_to_off(insn, pos, end_old, end_new, i, probe_pass); } if (ret) break; } return ret; } static void bpf_adj_linfo(struct bpf_prog *prog, u32 off, u32 delta) { struct bpf_line_info *linfo; u32 i, nr_linfo; nr_linfo = prog->aux->nr_linfo; if (!nr_linfo || !delta) return; linfo = prog->aux->linfo; for (i = 0; i < nr_linfo; i++) if (off < linfo[i].insn_off) break; /* Push all off < linfo[i].insn_off by delta */ for (; i < nr_linfo; i++) linfo[i].insn_off += delta; } struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, const struct bpf_insn *patch, u32 len) { u32 insn_adj_cnt, insn_rest, insn_delta = len - 1; const u32 cnt_max = S16_MAX; struct bpf_prog *prog_adj; int err; /* Since our patchlet doesn't expand the image, we're done. */ if (insn_delta == 0) { memcpy(prog->insnsi + off, patch, sizeof(*patch)); return prog; } insn_adj_cnt = prog->len + insn_delta; /* Reject anything that would potentially let the insn->off * target overflow when we have excessive program expansions. * We need to probe here before we do any reallocation where * we afterwards may not fail anymore. */ if (insn_adj_cnt > cnt_max && (err = bpf_adj_branches(prog, off, off + 1, off + len, true))) return ERR_PTR(err); /* Several new instructions need to be inserted. Make room * for them. Likely, there's no need for a new allocation as * last page could have large enough tailroom. */ prog_adj = bpf_prog_realloc(prog, bpf_prog_size(insn_adj_cnt), GFP_USER); if (!prog_adj) return ERR_PTR(-ENOMEM); prog_adj->len = insn_adj_cnt; /* Patching happens in 3 steps: * * 1) Move over tail of insnsi from next instruction onwards, * so we can patch the single target insn with one or more * new ones (patching is always from 1 to n insns, n > 0). * 2) Inject new instructions at the target location. * 3) Adjust branch offsets if necessary. */ insn_rest = insn_adj_cnt - off - len; memmove(prog_adj->insnsi + off + len, prog_adj->insnsi + off + 1, sizeof(*patch) * insn_rest); memcpy(prog_adj->insnsi + off, patch, sizeof(*patch) * len); /* We are guaranteed to not fail at this point, otherwise * the ship has sailed to reverse to the original state. An * overflow cannot happen at this point. */ BUG_ON(bpf_adj_branches(prog_adj, off, off + 1, off + len, false)); bpf_adj_linfo(prog_adj, off, insn_delta); return prog_adj; } int bpf_remove_insns(struct bpf_prog *prog, u32 off, u32 cnt) { int err; /* Branch offsets can't overflow when program is shrinking, no need * to call bpf_adj_branches(..., true) here */ memmove(prog->insnsi + off, prog->insnsi + off + cnt, sizeof(struct bpf_insn) * (prog->len - off - cnt)); prog->len -= cnt; err = bpf_adj_branches(prog, off, off + cnt, off, false); WARN_ON_ONCE(err); return err; } static void bpf_prog_kallsyms_del_subprogs(struct bpf_prog *fp) { int i; for (i = 0; i < fp->aux->real_func_cnt; i++) bpf_prog_kallsyms_del(fp->aux->func[i]); } void bpf_prog_kallsyms_del_all(struct bpf_prog *fp) { bpf_prog_kallsyms_del_subprogs(fp); bpf_prog_kallsyms_del(fp); } #ifdef CONFIG_BPF_JIT /* All BPF JIT sysctl knobs here. 
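 *
 * These variables back the net.core.* sysctls; as a usage sketch:
 *
 *	echo 1 > /proc/sys/net/core/bpf_jit_enable
 *	echo 2 > /proc/sys/net/core/bpf_jit_harden	# blind constants for all programs
 *	echo 1 > /proc/sys/net/core/bpf_jit_kallsyms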
*/ int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON); int bpf_jit_kallsyms __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON); int bpf_jit_harden __read_mostly; long bpf_jit_limit __read_mostly; long bpf_jit_limit_max __read_mostly; static void bpf_prog_ksym_set_addr(struct bpf_prog *prog) { WARN_ON_ONCE(!bpf_prog_ebpf_jited(prog)); prog->aux->ksym.start = (unsigned long) prog->bpf_func; prog->aux->ksym.end = prog->aux->ksym.start + prog->jited_len; } static void bpf_prog_ksym_set_name(struct bpf_prog *prog) { char *sym = prog->aux->ksym.name; const char *end = sym + KSYM_NAME_LEN; const struct btf_type *type; const char *func_name; BUILD_BUG_ON(sizeof("bpf_prog_") + sizeof(prog->tag) * 2 + /* name has been null terminated. * We should need +1 for the '_' preceding * the name. However, the null character * is double counted between the name and the * sizeof("bpf_prog_") above, so we omit * the +1 here. */ sizeof(prog->aux->name) > KSYM_NAME_LEN); sym += snprintf(sym, KSYM_NAME_LEN, "bpf_prog_"); sym = bin2hex(sym, prog->tag, sizeof(prog->tag)); /* prog->aux->name will be ignored if full btf name is available */ if (prog->aux->func_info_cnt && prog->aux->func_idx < prog->aux->func_info_cnt) { type = btf_type_by_id(prog->aux->btf, prog->aux->func_info[prog->aux->func_idx].type_id); func_name = btf_name_by_offset(prog->aux->btf, type->name_off); snprintf(sym, (size_t)(end - sym), "_%s", func_name); return; } if (prog->aux->name[0]) snprintf(sym, (size_t)(end - sym), "_%s", prog->aux->name); else *sym = 0; } static unsigned long bpf_get_ksym_start(struct latch_tree_node *n) { return container_of(n, struct bpf_ksym, tnode)->start; } static __always_inline bool bpf_tree_less(struct latch_tree_node *a, struct latch_tree_node *b) { return bpf_get_ksym_start(a) < bpf_get_ksym_start(b); } static __always_inline int bpf_tree_comp(void *key, struct latch_tree_node *n) { unsigned long val = (unsigned long)key; const struct bpf_ksym *ksym; ksym = container_of(n, struct bpf_ksym, tnode); if (val < ksym->start) return -1; /* Ensure that we detect return addresses as part of the program, when * the final instruction is a call for a program part of the stack * trace. Therefore, do val > ksym->end instead of val >= ksym->end. 
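 *
 * Example: if a program's last JITed instruction is a call, the return
 * address pushed for that call equals ksym->start + jited_len, i.e.
 * ksym->end; treating that address as still "inside" the program lets a
 * stack trace attribute the frame to this program.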
*/ if (val > ksym->end) return 1; return 0; } static const struct latch_tree_ops bpf_tree_ops = { .less = bpf_tree_less, .comp = bpf_tree_comp, }; static DEFINE_SPINLOCK(bpf_lock); static LIST_HEAD(bpf_kallsyms); static struct latch_tree_root bpf_tree __cacheline_aligned; void bpf_ksym_add(struct bpf_ksym *ksym) { spin_lock_bh(&bpf_lock); WARN_ON_ONCE(!list_empty(&ksym->lnode)); list_add_tail_rcu(&ksym->lnode, &bpf_kallsyms); latch_tree_insert(&ksym->tnode, &bpf_tree, &bpf_tree_ops); spin_unlock_bh(&bpf_lock); } static void __bpf_ksym_del(struct bpf_ksym *ksym) { if (list_empty(&ksym->lnode)) return; latch_tree_erase(&ksym->tnode, &bpf_tree, &bpf_tree_ops); list_del_rcu(&ksym->lnode); } void bpf_ksym_del(struct bpf_ksym *ksym) { spin_lock_bh(&bpf_lock); __bpf_ksym_del(ksym); spin_unlock_bh(&bpf_lock); } static bool bpf_prog_kallsyms_candidate(const struct bpf_prog *fp) { return fp->jited && !bpf_prog_was_classic(fp); } void bpf_prog_kallsyms_add(struct bpf_prog *fp) { if (!bpf_prog_kallsyms_candidate(fp) || !bpf_token_capable(fp->aux->token, CAP_BPF)) return; bpf_prog_ksym_set_addr(fp); bpf_prog_ksym_set_name(fp); fp->aux->ksym.prog = true; bpf_ksym_add(&fp->aux->ksym); #ifdef CONFIG_FINEIBT /* * When FineIBT, code in the __cfi_foo() symbols can get executed * and hence unwinder needs help. */ if (cfi_mode != CFI_FINEIBT) return; snprintf(fp->aux->ksym_prefix.name, KSYM_NAME_LEN, "__cfi_%s", fp->aux->ksym.name); fp->aux->ksym_prefix.start = (unsigned long) fp->bpf_func - 16; fp->aux->ksym_prefix.end = (unsigned long) fp->bpf_func; bpf_ksym_add(&fp->aux->ksym_prefix); #endif } void bpf_prog_kallsyms_del(struct bpf_prog *fp) { if (!bpf_prog_kallsyms_candidate(fp)) return; bpf_ksym_del(&fp->aux->ksym); #ifdef CONFIG_FINEIBT if (cfi_mode != CFI_FINEIBT) return; bpf_ksym_del(&fp->aux->ksym_prefix); #endif } static struct bpf_ksym *bpf_ksym_find(unsigned long addr) { struct latch_tree_node *n; n = latch_tree_find((void *)addr, &bpf_tree, &bpf_tree_ops); return n ? container_of(n, struct bpf_ksym, tnode) : NULL; } int __bpf_address_lookup(unsigned long addr, unsigned long *size, unsigned long *off, char *sym) { struct bpf_ksym *ksym; int ret = 0; rcu_read_lock(); ksym = bpf_ksym_find(addr); if (ksym) { unsigned long symbol_start = ksym->start; unsigned long symbol_end = ksym->end; ret = strscpy(sym, ksym->name, KSYM_NAME_LEN); if (size) *size = symbol_end - symbol_start; if (off) *off = addr - symbol_start; } rcu_read_unlock(); return ret; } bool is_bpf_text_address(unsigned long addr) { bool ret; rcu_read_lock(); ret = bpf_ksym_find(addr) != NULL; rcu_read_unlock(); return ret; } struct bpf_prog *bpf_prog_ksym_find(unsigned long addr) { struct bpf_ksym *ksym = bpf_ksym_find(addr); return ksym && ksym->prog ? 
container_of(ksym, struct bpf_prog_aux, ksym)->prog : NULL; } const struct exception_table_entry *search_bpf_extables(unsigned long addr) { const struct exception_table_entry *e = NULL; struct bpf_prog *prog; rcu_read_lock(); prog = bpf_prog_ksym_find(addr); if (!prog) goto out; if (!prog->aux->num_exentries) goto out; e = search_extable(prog->aux->extable, prog->aux->num_exentries, addr); out: rcu_read_unlock(); return e; } int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type, char *sym) { struct bpf_ksym *ksym; unsigned int it = 0; int ret = -ERANGE; if (!bpf_jit_kallsyms_enabled()) return ret; rcu_read_lock(); list_for_each_entry_rcu(ksym, &bpf_kallsyms, lnode) { if (it++ != symnum) continue; strscpy(sym, ksym->name, KSYM_NAME_LEN); *value = ksym->start; *type = BPF_SYM_ELF_TYPE; ret = 0; break; } rcu_read_unlock(); return ret; } int bpf_jit_add_poke_descriptor(struct bpf_prog *prog, struct bpf_jit_poke_descriptor *poke) { struct bpf_jit_poke_descriptor *tab = prog->aux->poke_tab; static const u32 poke_tab_max = 1024; u32 slot = prog->aux->size_poke_tab; u32 size = slot + 1; if (size > poke_tab_max) return -ENOSPC; if (poke->tailcall_target || poke->tailcall_target_stable || poke->tailcall_bypass || poke->adj_off || poke->bypass_addr) return -EINVAL; switch (poke->reason) { case BPF_POKE_REASON_TAIL_CALL: if (!poke->tail_call.map) return -EINVAL; break; default: return -EINVAL; } tab = krealloc_array(tab, size, sizeof(*poke), GFP_KERNEL); if (!tab) return -ENOMEM; memcpy(&tab[slot], poke, sizeof(*poke)); prog->aux->size_poke_tab = size; prog->aux->poke_tab = tab; return slot; } /* * BPF program pack allocator. * * Most BPF programs are pretty small. Allocating a hole page for each * program is sometime a waste. Many small bpf program also adds pressure * to instruction TLB. To solve this issue, we introduce a BPF program pack * allocator. The prog_pack allocator uses HPAGE_PMD_SIZE page (2MB on x86) * to host BPF programs. */ #define BPF_PROG_CHUNK_SHIFT 6 #define BPF_PROG_CHUNK_SIZE (1 << BPF_PROG_CHUNK_SHIFT) #define BPF_PROG_CHUNK_MASK (~(BPF_PROG_CHUNK_SIZE - 1)) struct bpf_prog_pack { struct list_head list; void *ptr; unsigned long bitmap[]; }; void bpf_jit_fill_hole_with_zero(void *area, unsigned int size) { memset(area, 0, size); } #define BPF_PROG_SIZE_TO_NBITS(size) (round_up(size, BPF_PROG_CHUNK_SIZE) / BPF_PROG_CHUNK_SIZE) static DEFINE_MUTEX(pack_mutex); static LIST_HEAD(pack_list); /* PMD_SIZE is not available in some special config, e.g. ARCH=arm with * CONFIG_MMU=n. Use PAGE_SIZE in these cases. */ #ifdef PMD_SIZE /* PMD_SIZE is really big for some archs. It doesn't make sense to * reserve too much memory in one allocation. Hardcode BPF_PROG_PACK_SIZE to * 2MiB * num_possible_nodes(). On most architectures PMD_SIZE will be * greater than or equal to 2MB. 
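 *
 * Size sketch, assuming a single-node machine where the pack ends up being
 * 2 MiB: with BPF_PROG_CHUNK_SIZE = 64, a 300-byte image needs
 * BPF_PROG_SIZE_TO_NBITS(300) = round_up(300, 64) / 64 = 5 chunks
 * (320 bytes), and one pack holds 2 MiB / 64 = 32768 such chunks.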
*/ #define BPF_PROG_PACK_SIZE (SZ_2M * num_possible_nodes()) #else #define BPF_PROG_PACK_SIZE PAGE_SIZE #endif #define BPF_PROG_CHUNK_COUNT (BPF_PROG_PACK_SIZE / BPF_PROG_CHUNK_SIZE) static struct bpf_prog_pack *alloc_new_pack(bpf_jit_fill_hole_t bpf_fill_ill_insns) { struct bpf_prog_pack *pack; int err; pack = kzalloc(struct_size(pack, bitmap, BITS_TO_LONGS(BPF_PROG_CHUNK_COUNT)), GFP_KERNEL); if (!pack) return NULL; pack->ptr = bpf_jit_alloc_exec(BPF_PROG_PACK_SIZE); if (!pack->ptr) goto out; bpf_fill_ill_insns(pack->ptr, BPF_PROG_PACK_SIZE); bitmap_zero(pack->bitmap, BPF_PROG_PACK_SIZE / BPF_PROG_CHUNK_SIZE); set_vm_flush_reset_perms(pack->ptr); err = set_memory_rox((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE); if (err) goto out; list_add_tail(&pack->list, &pack_list); return pack; out: bpf_jit_free_exec(pack->ptr); kfree(pack); return NULL; } void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns) { unsigned int nbits = BPF_PROG_SIZE_TO_NBITS(size); struct bpf_prog_pack *pack; unsigned long pos; void *ptr = NULL; mutex_lock(&pack_mutex); if (size > BPF_PROG_PACK_SIZE) { size = round_up(size, PAGE_SIZE); ptr = bpf_jit_alloc_exec(size); if (ptr) { int err; bpf_fill_ill_insns(ptr, size); set_vm_flush_reset_perms(ptr); err = set_memory_rox((unsigned long)ptr, size / PAGE_SIZE); if (err) { bpf_jit_free_exec(ptr); ptr = NULL; } } goto out; } list_for_each_entry(pack, &pack_list, list) { pos = bitmap_find_next_zero_area(pack->bitmap, BPF_PROG_CHUNK_COUNT, 0, nbits, 0); if (pos < BPF_PROG_CHUNK_COUNT) goto found_free_area; } pack = alloc_new_pack(bpf_fill_ill_insns); if (!pack) goto out; pos = 0; found_free_area: bitmap_set(pack->bitmap, pos, nbits); ptr = (void *)(pack->ptr) + (pos << BPF_PROG_CHUNK_SHIFT); out: mutex_unlock(&pack_mutex); return ptr; } void bpf_prog_pack_free(void *ptr, u32 size) { struct bpf_prog_pack *pack = NULL, *tmp; unsigned int nbits; unsigned long pos; mutex_lock(&pack_mutex); if (size > BPF_PROG_PACK_SIZE) { bpf_jit_free_exec(ptr); goto out; } list_for_each_entry(tmp, &pack_list, list) { if (ptr >= tmp->ptr && (tmp->ptr + BPF_PROG_PACK_SIZE) > ptr) { pack = tmp; break; } } if (WARN_ONCE(!pack, "bpf_prog_pack bug\n")) goto out; nbits = BPF_PROG_SIZE_TO_NBITS(size); pos = ((unsigned long)ptr - (unsigned long)pack->ptr) >> BPF_PROG_CHUNK_SHIFT; WARN_ONCE(bpf_arch_text_invalidate(ptr, size), "bpf_prog_pack bug: missing bpf_arch_text_invalidate?\n"); bitmap_clear(pack->bitmap, pos, nbits); if (bitmap_find_next_zero_area(pack->bitmap, BPF_PROG_CHUNK_COUNT, 0, BPF_PROG_CHUNK_COUNT, 0) == 0) { list_del(&pack->list); bpf_jit_free_exec(pack->ptr); kfree(pack); } out: mutex_unlock(&pack_mutex); } static atomic_long_t bpf_jit_current; /* Can be overridden by an arch's JIT compiler if it has a custom, * dedicated BPF backend memory area, or if neither of the two * below apply. */ u64 __weak bpf_jit_alloc_exec_limit(void) { #if defined(MODULES_VADDR) return MODULES_END - MODULES_VADDR; #else return VMALLOC_END - VMALLOC_START; #endif } static int __init bpf_jit_charge_init(void) { /* Only used as heuristic here to derive limit. 
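 *
 * In other words, the default charge limit is half of the arch's executable
 * allocation region, rounded up to a page. Hypothetical example: with a
 * 1 GiB region, bpf_jit_limit defaults to 512 MiB; a privileged user can
 * raise it via the net.core.bpf_jit_limit sysctl, which is capped at
 * bpf_jit_limit_max.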
*/ bpf_jit_limit_max = bpf_jit_alloc_exec_limit(); bpf_jit_limit = min_t(u64, round_up(bpf_jit_limit_max >> 1, PAGE_SIZE), LONG_MAX); return 0; } pure_initcall(bpf_jit_charge_init); int bpf_jit_charge_modmem(u32 size) { if (atomic_long_add_return(size, &bpf_jit_current) > READ_ONCE(bpf_jit_limit)) { if (!bpf_capable()) { atomic_long_sub(size, &bpf_jit_current); return -EPERM; } } return 0; } void bpf_jit_uncharge_modmem(u32 size) { atomic_long_sub(size, &bpf_jit_current); } void *__weak bpf_jit_alloc_exec(unsigned long size) { return execmem_alloc(EXECMEM_BPF, size); } void __weak bpf_jit_free_exec(void *addr) { execmem_free(addr); } struct bpf_binary_header * bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, unsigned int alignment, bpf_jit_fill_hole_t bpf_fill_ill_insns) { struct bpf_binary_header *hdr; u32 size, hole, start; WARN_ON_ONCE(!is_power_of_2(alignment) || alignment > BPF_IMAGE_ALIGNMENT); /* Most of BPF filters are really small, but if some of them * fill a page, allow at least 128 extra bytes to insert a * random section of illegal instructions. */ size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE); if (bpf_jit_charge_modmem(size)) return NULL; hdr = bpf_jit_alloc_exec(size); if (!hdr) { bpf_jit_uncharge_modmem(size); return NULL; } /* Fill space with illegal/arch-dep instructions. */ bpf_fill_ill_insns(hdr, size); hdr->size = size; hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)), PAGE_SIZE - sizeof(*hdr)); start = get_random_u32_below(hole) & ~(alignment - 1); /* Leave a random number of instructions before BPF code. */ *image_ptr = &hdr->image[start]; return hdr; } void bpf_jit_binary_free(struct bpf_binary_header *hdr) { u32 size = hdr->size; bpf_jit_free_exec(hdr); bpf_jit_uncharge_modmem(size); } /* Allocate jit binary from bpf_prog_pack allocator. * Since the allocated memory is RO+X, the JIT engine cannot write directly * to the memory. To solve this problem, a RW buffer is also allocated at * as the same time. The JIT engine should calculate offsets based on the * RO memory address, but write JITed program to the RW buffer. Once the * JIT engine finishes, it calls bpf_jit_binary_pack_finalize, which copies * the JITed program to the RO memory. */ struct bpf_binary_header * bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr, unsigned int alignment, struct bpf_binary_header **rw_header, u8 **rw_image, bpf_jit_fill_hole_t bpf_fill_ill_insns) { struct bpf_binary_header *ro_header; u32 size, hole, start; WARN_ON_ONCE(!is_power_of_2(alignment) || alignment > BPF_IMAGE_ALIGNMENT); /* add 16 bytes for a random section of illegal instructions */ size = round_up(proglen + sizeof(*ro_header) + 16, BPF_PROG_CHUNK_SIZE); if (bpf_jit_charge_modmem(size)) return NULL; ro_header = bpf_prog_pack_alloc(size, bpf_fill_ill_insns); if (!ro_header) { bpf_jit_uncharge_modmem(size); return NULL; } *rw_header = kvmalloc(size, GFP_KERNEL); if (!*rw_header) { bpf_prog_pack_free(ro_header, size); bpf_jit_uncharge_modmem(size); return NULL; } /* Fill space with illegal/arch-dep instructions. */ bpf_fill_ill_insns(*rw_header, size); (*rw_header)->size = size; hole = min_t(unsigned int, size - (proglen + sizeof(*ro_header)), BPF_PROG_CHUNK_SIZE - sizeof(*ro_header)); start = get_random_u32_below(hole) & ~(alignment - 1); *image_ptr = &ro_header->image[start]; *rw_image = &(*rw_header)->image[start]; return ro_header; } /* Copy JITed text from rw_header to its final location, the ro_header. 
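 *
 * Sketch of the flow an arch JIT is expected to follow (the local variable
 * names are illustrative only):
 *
 *	ro_header = bpf_jit_binary_pack_alloc(proglen, &image, align,
 *					      &rw_header, &rw_image, fill_insns);
 *	... emit instructions into rw_image, computing offsets against image ...
 *	if (!bpf_jit_binary_pack_finalize(ro_header, rw_header))
 *		prog->bpf_func = (void *)image;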
*/ int bpf_jit_binary_pack_finalize(struct bpf_binary_header *ro_header, struct bpf_binary_header *rw_header) { void *ptr; ptr = bpf_arch_text_copy(ro_header, rw_header, rw_header->size); kvfree(rw_header); if (IS_ERR(ptr)) { bpf_prog_pack_free(ro_header, ro_header->size); return PTR_ERR(ptr); } return 0; } /* bpf_jit_binary_pack_free is called in two different scenarios: * 1) when the program is freed after; * 2) when the JIT engine fails (before bpf_jit_binary_pack_finalize). * For case 2), we need to free both the RO memory and the RW buffer. * * bpf_jit_binary_pack_free requires proper ro_header->size. However, * bpf_jit_binary_pack_alloc does not set it. Therefore, ro_header->size * must be set with either bpf_jit_binary_pack_finalize (normal path) or * bpf_arch_text_copy (when jit fails). */ void bpf_jit_binary_pack_free(struct bpf_binary_header *ro_header, struct bpf_binary_header *rw_header) { u32 size = ro_header->size; bpf_prog_pack_free(ro_header, size); kvfree(rw_header); bpf_jit_uncharge_modmem(size); } struct bpf_binary_header * bpf_jit_binary_pack_hdr(const struct bpf_prog *fp) { unsigned long real_start = (unsigned long)fp->bpf_func; unsigned long addr; addr = real_start & BPF_PROG_CHUNK_MASK; return (void *)addr; } static inline struct bpf_binary_header * bpf_jit_binary_hdr(const struct bpf_prog *fp) { unsigned long real_start = (unsigned long)fp->bpf_func; unsigned long addr; addr = real_start & PAGE_MASK; return (void *)addr; } /* This symbol is only overridden by archs that have different * requirements than the usual eBPF JITs, f.e. when they only * implement cBPF JIT, do not set images read-only, etc. */ void __weak bpf_jit_free(struct bpf_prog *fp) { if (fp->jited) { struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp); bpf_jit_binary_free(hdr); WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp)); } bpf_prog_unlock_free(fp); } int bpf_jit_get_func_addr(const struct bpf_prog *prog, const struct bpf_insn *insn, bool extra_pass, u64 *func_addr, bool *func_addr_fixed) { s16 off = insn->off; s32 imm = insn->imm; u8 *addr; int err; *func_addr_fixed = insn->src_reg != BPF_PSEUDO_CALL; if (!*func_addr_fixed) { /* Place-holder address till the last pass has collected * all addresses for JITed subprograms in which case we * can pick them up from prog->aux. */ if (!extra_pass) addr = NULL; else if (prog->aux->func && off >= 0 && off < prog->aux->real_func_cnt) addr = (u8 *)prog->aux->func[off]->bpf_func; else return -EINVAL; } else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL && bpf_jit_supports_far_kfunc_call()) { err = bpf_get_kfunc_addr(prog, insn->imm, insn->off, &addr); if (err) return err; } else { /* Address of a BPF helper call. Since part of the core * kernel, it's always at a fixed location. __bpf_call_base * and the helper with imm relative to it are both in core * kernel. */ addr = (u8 *)__bpf_call_base + imm; } *func_addr = (unsigned long)addr; return 0; } static int bpf_jit_blind_insn(const struct bpf_insn *from, const struct bpf_insn *aux, struct bpf_insn *to_buff, bool emit_zext) { struct bpf_insn *to = to_buff; u32 imm_rnd = get_random_u32(); s16 off; BUILD_BUG_ON(BPF_REG_AX + 1 != MAX_BPF_JIT_REG); BUILD_BUG_ON(MAX_BPF_REG + 1 != MAX_BPF_JIT_REG); /* Constraints on AX register: * * AX register is inaccessible from user space. It is mapped in * all JITs, and used here for constant blinding rewrites. It is * typically "stateless" meaning its contents are only valid within * the executed instruction, but not across several instructions. 
* There are a few exceptions however which are further detailed * below. * * Constant blinding is only used by JITs, not in the interpreter. * The interpreter uses AX in some occasions as a local temporary * register e.g. in DIV or MOD instructions. * * In restricted circumstances, the verifier can also use the AX * register for rewrites as long as they do not interfere with * the above cases! */ if (from->dst_reg == BPF_REG_AX || from->src_reg == BPF_REG_AX) goto out; if (from->imm == 0 && (from->code == (BPF_ALU | BPF_MOV | BPF_K) || from->code == (BPF_ALU64 | BPF_MOV | BPF_K))) { *to++ = BPF_ALU64_REG(BPF_XOR, from->dst_reg, from->dst_reg); goto out; } switch (from->code) { case BPF_ALU | BPF_ADD | BPF_K: case BPF_ALU | BPF_SUB | BPF_K: case BPF_ALU | BPF_AND | BPF_K: case BPF_ALU | BPF_OR | BPF_K: case BPF_ALU | BPF_XOR | BPF_K: case BPF_ALU | BPF_MUL | BPF_K: case BPF_ALU | BPF_MOV | BPF_K: case BPF_ALU | BPF_DIV | BPF_K: case BPF_ALU | BPF_MOD | BPF_K: *to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); *to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); *to++ = BPF_ALU32_REG_OFF(from->code, from->dst_reg, BPF_REG_AX, from->off); break; case BPF_ALU64 | BPF_ADD | BPF_K: case BPF_ALU64 | BPF_SUB | BPF_K: case BPF_ALU64 | BPF_AND | BPF_K: case BPF_ALU64 | BPF_OR | BPF_K: case BPF_ALU64 | BPF_XOR | BPF_K: case BPF_ALU64 | BPF_MUL | BPF_K: case BPF_ALU64 | BPF_MOV | BPF_K: case BPF_ALU64 | BPF_DIV | BPF_K: case BPF_ALU64 | BPF_MOD | BPF_K: *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); *to++ = BPF_ALU64_REG_OFF(from->code, from->dst_reg, BPF_REG_AX, from->off); break; case BPF_JMP | BPF_JEQ | BPF_K: case BPF_JMP | BPF_JNE | BPF_K: case BPF_JMP | BPF_JGT | BPF_K: case BPF_JMP | BPF_JLT | BPF_K: case BPF_JMP | BPF_JGE | BPF_K: case BPF_JMP | BPF_JLE | BPF_K: case BPF_JMP | BPF_JSGT | BPF_K: case BPF_JMP | BPF_JSLT | BPF_K: case BPF_JMP | BPF_JSGE | BPF_K: case BPF_JMP | BPF_JSLE | BPF_K: case BPF_JMP | BPF_JSET | BPF_K: /* Accommodate for extra offset in case of a backjump. */ off = from->off; if (off < 0) off -= 2; *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); *to++ = BPF_JMP_REG(from->code, from->dst_reg, BPF_REG_AX, off); break; case BPF_JMP32 | BPF_JEQ | BPF_K: case BPF_JMP32 | BPF_JNE | BPF_K: case BPF_JMP32 | BPF_JGT | BPF_K: case BPF_JMP32 | BPF_JLT | BPF_K: case BPF_JMP32 | BPF_JGE | BPF_K: case BPF_JMP32 | BPF_JLE | BPF_K: case BPF_JMP32 | BPF_JSGT | BPF_K: case BPF_JMP32 | BPF_JSLT | BPF_K: case BPF_JMP32 | BPF_JSGE | BPF_K: case BPF_JMP32 | BPF_JSLE | BPF_K: case BPF_JMP32 | BPF_JSET | BPF_K: /* Accommodate for extra offset in case of a backjump. */ off = from->off; if (off < 0) off -= 2; *to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); *to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); *to++ = BPF_JMP32_REG(from->code, from->dst_reg, BPF_REG_AX, off); break; case BPF_LD | BPF_IMM | BPF_DW: *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[1].imm); *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); *to++ = BPF_ALU64_IMM(BPF_LSH, BPF_REG_AX, 32); *to++ = BPF_ALU64_REG(BPF_MOV, aux[0].dst_reg, BPF_REG_AX); break; case 0: /* Part 2 of BPF_LD | BPF_IMM | BPF_DW. 
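 * Here aux[0].imm, i.e. the low 32 bits of the original 64-bit constant,
 * is blinded and then OR-ed into the destination register, which already
 * holds the high half (from the BPF_LD | BPF_IMM | BPF_DW case above)
 * shifted left by 32.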
*/ *to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[0].imm); *to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); if (emit_zext) *to++ = BPF_ZEXT_REG(BPF_REG_AX); *to++ = BPF_ALU64_REG(BPF_OR, aux[0].dst_reg, BPF_REG_AX); break; case BPF_ST | BPF_MEM | BPF_DW: case BPF_ST | BPF_MEM | BPF_W: case BPF_ST | BPF_MEM | BPF_H: case BPF_ST | BPF_MEM | BPF_B: *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); *to++ = BPF_STX_MEM(from->code, from->dst_reg, BPF_REG_AX, from->off); break; } out: return to - to_buff; } static struct bpf_prog *bpf_prog_clone_create(struct bpf_prog *fp_other, gfp_t gfp_extra_flags) { gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags; struct bpf_prog *fp; fp = __vmalloc(fp_other->pages * PAGE_SIZE, gfp_flags); if (fp != NULL) { /* aux->prog still points to the fp_other one, so * when promoting the clone to the real program, * this still needs to be adapted. */ memcpy(fp, fp_other, fp_other->pages * PAGE_SIZE); } return fp; } static void bpf_prog_clone_free(struct bpf_prog *fp) { /* aux was stolen by the other clone, so we cannot free * it from this path! It will be freed eventually by the * other program on release. * * At this point, we don't need a deferred release since * clone is guaranteed to not be locked. */ fp->aux = NULL; fp->stats = NULL; fp->active = NULL; __bpf_prog_free(fp); } void bpf_jit_prog_release_other(struct bpf_prog *fp, struct bpf_prog *fp_other) { /* We have to repoint aux->prog to self, as we don't * know whether fp here is the clone or the original. */ fp->aux->prog = fp; bpf_prog_clone_free(fp_other); } struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog) { struct bpf_insn insn_buff[16], aux[2]; struct bpf_prog *clone, *tmp; int insn_delta, insn_cnt; struct bpf_insn *insn; int i, rewritten; if (!prog->blinding_requested || prog->blinded) return prog; clone = bpf_prog_clone_create(prog, GFP_USER); if (!clone) return ERR_PTR(-ENOMEM); insn_cnt = clone->len; insn = clone->insnsi; for (i = 0; i < insn_cnt; i++, insn++) { if (bpf_pseudo_func(insn)) { /* ld_imm64 with an address of bpf subprog is not * a user controlled constant. Don't randomize it, * since it will conflict with jit_subprogs() logic. */ insn++; i++; continue; } /* We temporarily need to hold the original ld64 insn * so that we can still access the first part in the * second blinding run. */ if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW) && insn[1].code == 0) memcpy(aux, insn, sizeof(aux)); rewritten = bpf_jit_blind_insn(insn, aux, insn_buff, clone->aux->verifier_zext); if (!rewritten) continue; tmp = bpf_patch_insn_single(clone, i, insn_buff, rewritten); if (IS_ERR(tmp)) { /* Patching may have repointed aux->prog during * realloc from the original one, so we need to * fix it up here on error. */ bpf_jit_prog_release_other(prog, clone); return tmp; } clone = tmp; insn_delta = rewritten - 1; /* Walk new program and skip insns we just inserted. */ insn = clone->insnsi + i + insn_delta; insn_cnt += insn_delta; i += insn_delta; } clone->blinded = 1; return clone; } #endif /* CONFIG_BPF_JIT */ /* Base function for offset calculation. Needs to go into .text section, * therefore keeping it non-static as well; will also be used by JITs * anyway later on, so do not let the compiler omit it. This also needs * to go into kallsyms for correlation from e.g. bpftool, so naming * must not change. 
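 *
 * For example, a call to a BPF helper is encoded with insn->imm equal to
 * the helper's address minus __bpf_call_base, and both
 * bpf_jit_get_func_addr() and the interpreter's JMP_CALL handler recover
 * the address as __bpf_call_base + insn->imm.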
*/ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) { return 0; } EXPORT_SYMBOL_GPL(__bpf_call_base); /* All UAPI available opcodes. */ #define BPF_INSN_MAP(INSN_2, INSN_3) \ /* 32 bit ALU operations. */ \ /* Register based. */ \ INSN_3(ALU, ADD, X), \ INSN_3(ALU, SUB, X), \ INSN_3(ALU, AND, X), \ INSN_3(ALU, OR, X), \ INSN_3(ALU, LSH, X), \ INSN_3(ALU, RSH, X), \ INSN_3(ALU, XOR, X), \ INSN_3(ALU, MUL, X), \ INSN_3(ALU, MOV, X), \ INSN_3(ALU, ARSH, X), \ INSN_3(ALU, DIV, X), \ INSN_3(ALU, MOD, X), \ INSN_2(ALU, NEG), \ INSN_3(ALU, END, TO_BE), \ INSN_3(ALU, END, TO_LE), \ /* Immediate based. */ \ INSN_3(ALU, ADD, K), \ INSN_3(ALU, SUB, K), \ INSN_3(ALU, AND, K), \ INSN_3(ALU, OR, K), \ INSN_3(ALU, LSH, K), \ INSN_3(ALU, RSH, K), \ INSN_3(ALU, XOR, K), \ INSN_3(ALU, MUL, K), \ INSN_3(ALU, MOV, K), \ INSN_3(ALU, ARSH, K), \ INSN_3(ALU, DIV, K), \ INSN_3(ALU, MOD, K), \ /* 64 bit ALU operations. */ \ /* Register based. */ \ INSN_3(ALU64, ADD, X), \ INSN_3(ALU64, SUB, X), \ INSN_3(ALU64, AND, X), \ INSN_3(ALU64, OR, X), \ INSN_3(ALU64, LSH, X), \ INSN_3(ALU64, RSH, X), \ INSN_3(ALU64, XOR, X), \ INSN_3(ALU64, MUL, X), \ INSN_3(ALU64, MOV, X), \ INSN_3(ALU64, ARSH, X), \ INSN_3(ALU64, DIV, X), \ INSN_3(ALU64, MOD, X), \ INSN_2(ALU64, NEG), \ INSN_3(ALU64, END, TO_LE), \ /* Immediate based. */ \ INSN_3(ALU64, ADD, K), \ INSN_3(ALU64, SUB, K), \ INSN_3(ALU64, AND, K), \ INSN_3(ALU64, OR, K), \ INSN_3(ALU64, LSH, K), \ INSN_3(ALU64, RSH, K), \ INSN_3(ALU64, XOR, K), \ INSN_3(ALU64, MUL, K), \ INSN_3(ALU64, MOV, K), \ INSN_3(ALU64, ARSH, K), \ INSN_3(ALU64, DIV, K), \ INSN_3(ALU64, MOD, K), \ /* Call instruction. */ \ INSN_2(JMP, CALL), \ /* Exit instruction. */ \ INSN_2(JMP, EXIT), \ /* 32-bit Jump instructions. */ \ /* Register based. */ \ INSN_3(JMP32, JEQ, X), \ INSN_3(JMP32, JNE, X), \ INSN_3(JMP32, JGT, X), \ INSN_3(JMP32, JLT, X), \ INSN_3(JMP32, JGE, X), \ INSN_3(JMP32, JLE, X), \ INSN_3(JMP32, JSGT, X), \ INSN_3(JMP32, JSLT, X), \ INSN_3(JMP32, JSGE, X), \ INSN_3(JMP32, JSLE, X), \ INSN_3(JMP32, JSET, X), \ /* Immediate based. */ \ INSN_3(JMP32, JEQ, K), \ INSN_3(JMP32, JNE, K), \ INSN_3(JMP32, JGT, K), \ INSN_3(JMP32, JLT, K), \ INSN_3(JMP32, JGE, K), \ INSN_3(JMP32, JLE, K), \ INSN_3(JMP32, JSGT, K), \ INSN_3(JMP32, JSLT, K), \ INSN_3(JMP32, JSGE, K), \ INSN_3(JMP32, JSLE, K), \ INSN_3(JMP32, JSET, K), \ /* Jump instructions. */ \ /* Register based. */ \ INSN_3(JMP, JEQ, X), \ INSN_3(JMP, JNE, X), \ INSN_3(JMP, JGT, X), \ INSN_3(JMP, JLT, X), \ INSN_3(JMP, JGE, X), \ INSN_3(JMP, JLE, X), \ INSN_3(JMP, JSGT, X), \ INSN_3(JMP, JSLT, X), \ INSN_3(JMP, JSGE, X), \ INSN_3(JMP, JSLE, X), \ INSN_3(JMP, JSET, X), \ /* Immediate based. */ \ INSN_3(JMP, JEQ, K), \ INSN_3(JMP, JNE, K), \ INSN_3(JMP, JGT, K), \ INSN_3(JMP, JLT, K), \ INSN_3(JMP, JGE, K), \ INSN_3(JMP, JLE, K), \ INSN_3(JMP, JSGT, K), \ INSN_3(JMP, JSLT, K), \ INSN_3(JMP, JSGE, K), \ INSN_3(JMP, JSLE, K), \ INSN_3(JMP, JSET, K), \ INSN_2(JMP, JA), \ INSN_2(JMP32, JA), \ /* Store instructions. */ \ /* Register based. */ \ INSN_3(STX, MEM, B), \ INSN_3(STX, MEM, H), \ INSN_3(STX, MEM, W), \ INSN_3(STX, MEM, DW), \ INSN_3(STX, ATOMIC, W), \ INSN_3(STX, ATOMIC, DW), \ /* Immediate based. */ \ INSN_3(ST, MEM, B), \ INSN_3(ST, MEM, H), \ INSN_3(ST, MEM, W), \ INSN_3(ST, MEM, DW), \ /* Load instructions. */ \ /* Register based. */ \ INSN_3(LDX, MEM, B), \ INSN_3(LDX, MEM, H), \ INSN_3(LDX, MEM, W), \ INSN_3(LDX, MEM, DW), \ INSN_3(LDX, MEMSX, B), \ INSN_3(LDX, MEMSX, H), \ INSN_3(LDX, MEMSX, W), \ /* Immediate based. 
*/ \ INSN_3(LD, IMM, DW) bool bpf_opcode_in_insntable(u8 code) { #define BPF_INSN_2_TBL(x, y) [BPF_##x | BPF_##y] = true #define BPF_INSN_3_TBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = true static const bool public_insntable[256] = { [0 ... 255] = false, /* Now overwrite non-defaults ... */ BPF_INSN_MAP(BPF_INSN_2_TBL, BPF_INSN_3_TBL), /* UAPI exposed, but rewritten opcodes. cBPF carry-over. */ [BPF_LD | BPF_ABS | BPF_B] = true, [BPF_LD | BPF_ABS | BPF_H] = true, [BPF_LD | BPF_ABS | BPF_W] = true, [BPF_LD | BPF_IND | BPF_B] = true, [BPF_LD | BPF_IND | BPF_H] = true, [BPF_LD | BPF_IND | BPF_W] = true, [BPF_JMP | BPF_JCOND] = true, }; #undef BPF_INSN_3_TBL #undef BPF_INSN_2_TBL return public_insntable[code]; } #ifndef CONFIG_BPF_JIT_ALWAYS_ON /** * ___bpf_prog_run - run eBPF program on a given context * @regs: is the array of MAX_BPF_EXT_REG eBPF pseudo-registers * @insn: is the array of eBPF instructions * * Decode and execute eBPF instructions. * * Return: whatever value is in %BPF_R0 at program exit */ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn) { #define BPF_INSN_2_LBL(x, y) [BPF_##x | BPF_##y] = &&x##_##y #define BPF_INSN_3_LBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = &&x##_##y##_##z static const void * const jumptable[256] __annotate_jump_table = { [0 ... 255] = &&default_label, /* Now overwrite non-defaults ... */ BPF_INSN_MAP(BPF_INSN_2_LBL, BPF_INSN_3_LBL), /* Non-UAPI available opcodes. */ [BPF_JMP | BPF_CALL_ARGS] = &&JMP_CALL_ARGS, [BPF_JMP | BPF_TAIL_CALL] = &&JMP_TAIL_CALL, [BPF_ST | BPF_NOSPEC] = &&ST_NOSPEC, [BPF_LDX | BPF_PROBE_MEM | BPF_B] = &&LDX_PROBE_MEM_B, [BPF_LDX | BPF_PROBE_MEM | BPF_H] = &&LDX_PROBE_MEM_H, [BPF_LDX | BPF_PROBE_MEM | BPF_W] = &&LDX_PROBE_MEM_W, [BPF_LDX | BPF_PROBE_MEM | BPF_DW] = &&LDX_PROBE_MEM_DW, [BPF_LDX | BPF_PROBE_MEMSX | BPF_B] = &&LDX_PROBE_MEMSX_B, [BPF_LDX | BPF_PROBE_MEMSX | BPF_H] = &&LDX_PROBE_MEMSX_H, [BPF_LDX | BPF_PROBE_MEMSX | BPF_W] = &&LDX_PROBE_MEMSX_W, }; #undef BPF_INSN_3_LBL #undef BPF_INSN_2_LBL u32 tail_call_cnt = 0; #define CONT ({ insn++; goto select_insn; }) #define CONT_JMP ({ insn++; goto select_insn; }) select_insn: goto *jumptable[insn->code]; /* Explicitly mask the register-based shift amounts with 63 or 31 * to avoid undefined behavior. Normally this won't affect the * generated code, for example, in case of native 64 bit archs such * as x86-64 or arm64, the compiler is optimizing the AND away for * the interpreter. In case of JITs, each of the JIT backends compiles * the BPF shift operations to machine instructions which produce * implementation-defined results in such a case; the resulting * contents of the register may be arbitrary, but program behaviour * as a whole remains defined. In other words, in case of JIT backends, * the AND must /not/ be added to the emitted LSH/RSH/ARSH translation. 
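 *
 * Concretely: for ALU64_LSH_X with a shift amount of 64 in SRC, the
 * interpreter computes DST << (64 & 63) == DST << 0, while a JIT may leave
 * the destination register with an arbitrary value; both are permitted and
 * the program's overall behaviour stays defined.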
*/ /* ALU (shifts) */ #define SHT(OPCODE, OP) \ ALU64_##OPCODE##_X: \ DST = DST OP (SRC & 63); \ CONT; \ ALU_##OPCODE##_X: \ DST = (u32) DST OP ((u32) SRC & 31); \ CONT; \ ALU64_##OPCODE##_K: \ DST = DST OP IMM; \ CONT; \ ALU_##OPCODE##_K: \ DST = (u32) DST OP (u32) IMM; \ CONT; /* ALU (rest) */ #define ALU(OPCODE, OP) \ ALU64_##OPCODE##_X: \ DST = DST OP SRC; \ CONT; \ ALU_##OPCODE##_X: \ DST = (u32) DST OP (u32) SRC; \ CONT; \ ALU64_##OPCODE##_K: \ DST = DST OP IMM; \ CONT; \ ALU_##OPCODE##_K: \ DST = (u32) DST OP (u32) IMM; \ CONT; ALU(ADD, +) ALU(SUB, -) ALU(AND, &) ALU(OR, |) ALU(XOR, ^) ALU(MUL, *) SHT(LSH, <<) SHT(RSH, >>) #undef SHT #undef ALU ALU_NEG: DST = (u32) -DST; CONT; ALU64_NEG: DST = -DST; CONT; ALU_MOV_X: switch (OFF) { case 0: DST = (u32) SRC; break; case 8: DST = (u32)(s8) SRC; break; case 16: DST = (u32)(s16) SRC; break; } CONT; ALU_MOV_K: DST = (u32) IMM; CONT; ALU64_MOV_X: switch (OFF) { case 0: DST = SRC; break; case 8: DST = (s8) SRC; break; case 16: DST = (s16) SRC; break; case 32: DST = (s32) SRC; break; } CONT; ALU64_MOV_K: DST = IMM; CONT; LD_IMM_DW: DST = (u64) (u32) insn[0].imm | ((u64) (u32) insn[1].imm) << 32; insn++; CONT; ALU_ARSH_X: DST = (u64) (u32) (((s32) DST) >> (SRC & 31)); CONT; ALU_ARSH_K: DST = (u64) (u32) (((s32) DST) >> IMM); CONT; ALU64_ARSH_X: (*(s64 *) &DST) >>= (SRC & 63); CONT; ALU64_ARSH_K: (*(s64 *) &DST) >>= IMM; CONT; ALU64_MOD_X: switch (OFF) { case 0: div64_u64_rem(DST, SRC, &AX); DST = AX; break; case 1: AX = div64_s64(DST, SRC); DST = DST - AX * SRC; break; } CONT; ALU_MOD_X: switch (OFF) { case 0: AX = (u32) DST; DST = do_div(AX, (u32) SRC); break; case 1: AX = abs((s32)DST); AX = do_div(AX, abs((s32)SRC)); if ((s32)DST < 0) DST = (u32)-AX; else DST = (u32)AX; break; } CONT; ALU64_MOD_K: switch (OFF) { case 0: div64_u64_rem(DST, IMM, &AX); DST = AX; break; case 1: AX = div64_s64(DST, IMM); DST = DST - AX * IMM; break; } CONT; ALU_MOD_K: switch (OFF) { case 0: AX = (u32) DST; DST = do_div(AX, (u32) IMM); break; case 1: AX = abs((s32)DST); AX = do_div(AX, abs((s32)IMM)); if ((s32)DST < 0) DST = (u32)-AX; else DST = (u32)AX; break; } CONT; ALU64_DIV_X: switch (OFF) { case 0: DST = div64_u64(DST, SRC); break; case 1: DST = div64_s64(DST, SRC); break; } CONT; ALU_DIV_X: switch (OFF) { case 0: AX = (u32) DST; do_div(AX, (u32) SRC); DST = (u32) AX; break; case 1: AX = abs((s32)DST); do_div(AX, abs((s32)SRC)); if (((s32)DST < 0) == ((s32)SRC < 0)) DST = (u32)AX; else DST = (u32)-AX; break; } CONT; ALU64_DIV_K: switch (OFF) { case 0: DST = div64_u64(DST, IMM); break; case 1: DST = div64_s64(DST, IMM); break; } CONT; ALU_DIV_K: switch (OFF) { case 0: AX = (u32) DST; do_div(AX, (u32) IMM); DST = (u32) AX; break; case 1: AX = abs((s32)DST); do_div(AX, abs((s32)IMM)); if (((s32)DST < 0) == ((s32)IMM < 0)) DST = (u32)AX; else DST = (u32)-AX; break; } CONT; ALU_END_TO_BE: switch (IMM) { case 16: DST = (__force u16) cpu_to_be16(DST); break; case 32: DST = (__force u32) cpu_to_be32(DST); break; case 64: DST = (__force u64) cpu_to_be64(DST); break; } CONT; ALU_END_TO_LE: switch (IMM) { case 16: DST = (__force u16) cpu_to_le16(DST); break; case 32: DST = (__force u32) cpu_to_le32(DST); break; case 64: DST = (__force u64) cpu_to_le64(DST); break; } CONT; ALU64_END_TO_LE: switch (IMM) { case 16: DST = (__force u16) __swab16(DST); break; case 32: DST = (__force u32) __swab32(DST); break; case 64: DST = (__force u64) __swab64(DST); break; } CONT; /* CALL */ JMP_CALL: /* Function call scratches BPF_R1-BPF_R5 registers, * preserves BPF_R6-BPF_R9, and 
stores return value * into BPF_R0. */ BPF_R0 = (__bpf_call_base + insn->imm)(BPF_R1, BPF_R2, BPF_R3, BPF_R4, BPF_R5); CONT; JMP_CALL_ARGS: BPF_R0 = (__bpf_call_base_args + insn->imm)(BPF_R1, BPF_R2, BPF_R3, BPF_R4, BPF_R5, insn + insn->off + 1); CONT; JMP_TAIL_CALL: { struct bpf_map *map = (struct bpf_map *) (unsigned long) BPF_R2; struct bpf_array *array = container_of(map, struct bpf_array, map); struct bpf_prog *prog; u32 index = BPF_R3; if (unlikely(index >= array->map.max_entries)) goto out; if (unlikely(tail_call_cnt >= MAX_TAIL_CALL_CNT)) goto out; tail_call_cnt++; prog = READ_ONCE(array->ptrs[index]); if (!prog) goto out; /* ARG1 at this point is guaranteed to point to CTX from * the verifier side due to the fact that the tail call is * handled like a helper, that is, bpf_tail_call_proto, * where arg1_type is ARG_PTR_TO_CTX. */ insn = prog->insnsi; goto select_insn; out: CONT; } JMP_JA: insn += insn->off; CONT; JMP32_JA: insn += insn->imm; CONT; JMP_EXIT: return BPF_R0; /* JMP */ #define COND_JMP(SIGN, OPCODE, CMP_OP) \ JMP_##OPCODE##_X: \ if ((SIGN##64) DST CMP_OP (SIGN##64) SRC) { \ insn += insn->off; \ CONT_JMP; \ } \ CONT; \ JMP32_##OPCODE##_X: \ if ((SIGN##32) DST CMP_OP (SIGN##32) SRC) { \ insn += insn->off; \ CONT_JMP; \ } \ CONT; \ JMP_##OPCODE##_K: \ if ((SIGN##64) DST CMP_OP (SIGN##64) IMM) { \ insn += insn->off; \ CONT_JMP; \ } \ CONT; \ JMP32_##OPCODE##_K: \ if ((SIGN##32) DST CMP_OP (SIGN##32) IMM) { \ insn += insn->off; \ CONT_JMP; \ } \ CONT; COND_JMP(u, JEQ, ==) COND_JMP(u, JNE, !=) COND_JMP(u, JGT, >) COND_JMP(u, JLT, <) COND_JMP(u, JGE, >=) COND_JMP(u, JLE, <=) COND_JMP(u, JSET, &) COND_JMP(s, JSGT, >) COND_JMP(s, JSLT, <) COND_JMP(s, JSGE, >=) COND_JMP(s, JSLE, <=) #undef COND_JMP /* ST, STX and LDX*/ ST_NOSPEC: /* Speculation barrier for mitigating Speculative Store Bypass. * In case of arm64, we rely on the firmware mitigation as * controlled via the ssbd kernel parameter. Whenever the * mitigation is enabled, it works for all of the kernel code * with no need to provide any additional instructions here. * In case of x86, we use 'lfence' insn for mitigation. We * reuse preexisting logic from Spectre v1 mitigation that * happens to produce the required code on x86 for v4 as well. 
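 *
 * The BPF_ST | BPF_NOSPEC instruction itself is not part of the UAPI
 * instruction set (see the "Non-UAPI available opcodes" entries in the
 * jumptable above); it is emitted by the verifier in front of stores that
 * could otherwise be bypassed speculatively.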
*/ barrier_nospec(); CONT; #define LDST(SIZEOP, SIZE) \ STX_MEM_##SIZEOP: \ *(SIZE *)(unsigned long) (DST + insn->off) = SRC; \ CONT; \ ST_MEM_##SIZEOP: \ *(SIZE *)(unsigned long) (DST + insn->off) = IMM; \ CONT; \ LDX_MEM_##SIZEOP: \ DST = *(SIZE *)(unsigned long) (SRC + insn->off); \ CONT; \ LDX_PROBE_MEM_##SIZEOP: \ bpf_probe_read_kernel_common(&DST, sizeof(SIZE), \ (const void *)(long) (SRC + insn->off)); \ DST = *((SIZE *)&DST); \ CONT; LDST(B, u8) LDST(H, u16) LDST(W, u32) LDST(DW, u64) #undef LDST #define LDSX(SIZEOP, SIZE) \ LDX_MEMSX_##SIZEOP: \ DST = *(SIZE *)(unsigned long) (SRC + insn->off); \ CONT; \ LDX_PROBE_MEMSX_##SIZEOP: \ bpf_probe_read_kernel_common(&DST, sizeof(SIZE), \ (const void *)(long) (SRC + insn->off)); \ DST = *((SIZE *)&DST); \ CONT; LDSX(B, s8) LDSX(H, s16) LDSX(W, s32) #undef LDSX #define ATOMIC_ALU_OP(BOP, KOP) \ case BOP: \ if (BPF_SIZE(insn->code) == BPF_W) \ atomic_##KOP((u32) SRC, (atomic_t *)(unsigned long) \ (DST + insn->off)); \ else \ atomic64_##KOP((u64) SRC, (atomic64_t *)(unsigned long) \ (DST + insn->off)); \ break; \ case BOP | BPF_FETCH: \ if (BPF_SIZE(insn->code) == BPF_W) \ SRC = (u32) atomic_fetch_##KOP( \ (u32) SRC, \ (atomic_t *)(unsigned long) (DST + insn->off)); \ else \ SRC = (u64) atomic64_fetch_##KOP( \ (u64) SRC, \ (atomic64_t *)(unsigned long) (DST + insn->off)); \ break; STX_ATOMIC_DW: STX_ATOMIC_W: switch (IMM) { ATOMIC_ALU_OP(BPF_ADD, add) ATOMIC_ALU_OP(BPF_AND, and) ATOMIC_ALU_OP(BPF_OR, or) ATOMIC_ALU_OP(BPF_XOR, xor) #undef ATOMIC_ALU_OP case BPF_XCHG: if (BPF_SIZE(insn->code) == BPF_W) SRC = (u32) atomic_xchg( (atomic_t *)(unsigned long) (DST + insn->off), (u32) SRC); else SRC = (u64) atomic64_xchg( (atomic64_t *)(unsigned long) (DST + insn->off), (u64) SRC); break; case BPF_CMPXCHG: if (BPF_SIZE(insn->code) == BPF_W) BPF_R0 = (u32) atomic_cmpxchg( (atomic_t *)(unsigned long) (DST + insn->off), (u32) BPF_R0, (u32) SRC); else BPF_R0 = (u64) atomic64_cmpxchg( (atomic64_t *)(unsigned long) (DST + insn->off), (u64) BPF_R0, (u64) SRC); break; default: goto default_label; } CONT; default_label: /* If we ever reach this, we have a bug somewhere. Die hard here * instead of just returning 0; we could be somewhere in a subprog, * so execution could continue otherwise which we do /not/ want. * * Note, verifier whitelists all opcodes in bpf_opcode_in_insntable(). */ pr_warn("BPF interpreter: unknown opcode %02x (imm: 0x%x)\n", insn->code, insn->imm); BUG_ON(1); return 0; } #define PROG_NAME(stack_size) __bpf_prog_run##stack_size #define DEFINE_BPF_PROG_RUN(stack_size) \ static unsigned int PROG_NAME(stack_size)(const void *ctx, const struct bpf_insn *insn) \ { \ u64 stack[stack_size / sizeof(u64)]; \ u64 regs[MAX_BPF_EXT_REG] = {}; \ \ kmsan_unpoison_memory(stack, sizeof(stack)); \ FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \ ARG1 = (u64) (unsigned long) ctx; \ return ___bpf_prog_run(regs, insn); \ } #define PROG_NAME_ARGS(stack_size) __bpf_prog_run_args##stack_size #define DEFINE_BPF_PROG_RUN_ARGS(stack_size) \ static u64 PROG_NAME_ARGS(stack_size)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5, \ const struct bpf_insn *insn) \ { \ u64 stack[stack_size / sizeof(u64)]; \ u64 regs[MAX_BPF_EXT_REG]; \ \ kmsan_unpoison_memory(stack, sizeof(stack)); \ FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \ BPF_R1 = r1; \ BPF_R2 = r2; \ BPF_R3 = r3; \ BPF_R4 = r4; \ BPF_R5 = r5; \ return ___bpf_prog_run(regs, insn); \ } #define EVAL1(FN, X) FN(X) #define EVAL2(FN, X, Y...) FN(X) EVAL1(FN, Y) #define EVAL3(FN, X, Y...) 
FN(X) EVAL2(FN, Y) #define EVAL4(FN, X, Y...) FN(X) EVAL3(FN, Y) #define EVAL5(FN, X, Y...) FN(X) EVAL4(FN, Y) #define EVAL6(FN, X, Y...) FN(X) EVAL5(FN, Y) EVAL6(DEFINE_BPF_PROG_RUN, 32, 64, 96, 128, 160, 192); EVAL6(DEFINE_BPF_PROG_RUN, 224, 256, 288, 320, 352, 384); EVAL4(DEFINE_BPF_PROG_RUN, 416, 448, 480, 512); EVAL6(DEFINE_BPF_PROG_RUN_ARGS, 32, 64, 96, 128, 160, 192); EVAL6(DEFINE_BPF_PROG_RUN_ARGS, 224, 256, 288, 320, 352, 384); EVAL4(DEFINE_BPF_PROG_RUN_ARGS, 416, 448, 480, 512); #define PROG_NAME_LIST(stack_size) PROG_NAME(stack_size), static unsigned int (*interpreters[])(const void *ctx, const struct bpf_insn *insn) = { EVAL6(PROG_NAME_LIST, 32, 64, 96, 128, 160, 192) EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384) EVAL4(PROG_NAME_LIST, 416, 448, 480, 512) }; #undef PROG_NAME_LIST #define PROG_NAME_LIST(stack_size) PROG_NAME_ARGS(stack_size), static __maybe_unused u64 (*interpreters_args[])(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5, const struct bpf_insn *insn) = { EVAL6(PROG_NAME_LIST, 32, 64, 96, 128, 160, 192) EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384) EVAL4(PROG_NAME_LIST, 416, 448, 480, 512) }; #undef PROG_NAME_LIST #ifdef CONFIG_BPF_SYSCALL void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth) { stack_depth = max_t(u32, stack_depth, 1); insn->off = (s16) insn->imm; insn->imm = interpreters_args[(round_up(stack_depth, 32) / 32) - 1] - __bpf_call_base_args; insn->code = BPF_JMP | BPF_CALL_ARGS; } #endif #else static unsigned int __bpf_prog_ret0_warn(const void *ctx, const struct bpf_insn *insn) { /* If this handler ever gets executed, then BPF_JIT_ALWAYS_ON * is not working properly, so warn about it! */ WARN_ON_ONCE(1); return 0; } #endif bool bpf_prog_map_compatible(struct bpf_map *map, const struct bpf_prog *fp) { enum bpf_prog_type prog_type = resolve_prog_type(fp); bool ret; struct bpf_prog_aux *aux = fp->aux; if (fp->kprobe_override) return false; /* XDP programs inserted into maps are not guaranteed to run on * a particular netdev (and can run outside driver context entirely * in the case of devmap and cpumap). Until device checks * are implemented, prohibit adding dev-bound programs to program maps. */ if (bpf_prog_is_dev_bound(aux)) return false; spin_lock(&map->owner.lock); if (!map->owner.type) { /* There's no owner yet where we could check for * compatibility. 
*/ map->owner.type = prog_type; map->owner.jited = fp->jited; map->owner.xdp_has_frags = aux->xdp_has_frags; map->owner.attach_func_proto = aux->attach_func_proto; ret = true; } else { ret = map->owner.type == prog_type && map->owner.jited == fp->jited && map->owner.xdp_has_frags == aux->xdp_has_frags; if (ret && map->owner.attach_func_proto != aux->attach_func_proto) { switch (prog_type) { case BPF_PROG_TYPE_TRACING: case BPF_PROG_TYPE_LSM: case BPF_PROG_TYPE_EXT: case BPF_PROG_TYPE_STRUCT_OPS: ret = false; break; default: break; } } } spin_unlock(&map->owner.lock); return ret; } static int bpf_check_tail_call(const struct bpf_prog *fp) { struct bpf_prog_aux *aux = fp->aux; int i, ret = 0; mutex_lock(&aux->used_maps_mutex); for (i = 0; i < aux->used_map_cnt; i++) { struct bpf_map *map = aux->used_maps[i]; if (!map_type_contains_progs(map)) continue; if (!bpf_prog_map_compatible(map, fp)) { ret = -EINVAL; goto out; } } out: mutex_unlock(&aux->used_maps_mutex); return ret; } static void bpf_prog_select_func(struct bpf_prog *fp) { #ifndef CONFIG_BPF_JIT_ALWAYS_ON u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1); fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1]; #else fp->bpf_func = __bpf_prog_ret0_warn; #endif } /** * bpf_prog_select_runtime - select exec runtime for BPF program * @fp: bpf_prog populated with BPF program * @err: pointer to error variable * * Try to JIT eBPF program, if JIT is not available, use interpreter. * The BPF program will be executed via bpf_prog_run() function. * * Return: the &fp argument along with &err set to 0 for success or * a negative errno code on failure */ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) { /* In case of BPF to BPF calls, verifier did all the prep * work with regards to JITing, etc. */ bool jit_needed = false; if (fp->bpf_func) goto finalize; if (IS_ENABLED(CONFIG_BPF_JIT_ALWAYS_ON) || bpf_prog_has_kfunc_call(fp)) jit_needed = true; bpf_prog_select_func(fp); /* eBPF JITs can rewrite the program in case constant * blinding is active. However, in case of error during * blinding, bpf_int_jit_compile() must always return a * valid program, which in this case would simply not * be JITed, but falls back to the interpreter. */ if (!bpf_prog_is_offloaded(fp->aux)) { *err = bpf_prog_alloc_jited_linfo(fp); if (*err) return fp; fp = bpf_int_jit_compile(fp); bpf_prog_jit_attempt_done(fp); if (!fp->jited && jit_needed) { *err = -ENOTSUPP; return fp; } } else { *err = bpf_prog_offload_compile(fp); if (*err) return fp; } finalize: *err = bpf_prog_lock_ro(fp); if (*err) return fp; /* The tail call compatibility check can only be done at * this late stage as we need to determine, if we deal * with JITed or non JITed program concatenations and not * all eBPF JITs might immediately support all features. 
*/ *err = bpf_check_tail_call(fp); return fp; } EXPORT_SYMBOL_GPL(bpf_prog_select_runtime); static unsigned int __bpf_prog_ret1(const void *ctx, const struct bpf_insn *insn) { return 1; } static struct bpf_prog_dummy { struct bpf_prog prog; } dummy_bpf_prog = { .prog = { .bpf_func = __bpf_prog_ret1, }, }; struct bpf_empty_prog_array bpf_empty_prog_array = { .null_prog = NULL, }; EXPORT_SYMBOL(bpf_empty_prog_array); struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags) { struct bpf_prog_array *p; if (prog_cnt) p = kzalloc(struct_size(p, items, prog_cnt + 1), flags); else p = &bpf_empty_prog_array.hdr; return p; } void bpf_prog_array_free(struct bpf_prog_array *progs) { if (!progs || progs == &bpf_empty_prog_array.hdr) return; kfree_rcu(progs, rcu); } static void __bpf_prog_array_free_sleepable_cb(struct rcu_head *rcu) { struct bpf_prog_array *progs; /* If RCU Tasks Trace grace period implies RCU grace period, there is * no need to call kfree_rcu(), just call kfree() directly. */ progs = container_of(rcu, struct bpf_prog_array, rcu); if (rcu_trace_implies_rcu_gp()) kfree(progs); else kfree_rcu(progs, rcu); } void bpf_prog_array_free_sleepable(struct bpf_prog_array *progs) { if (!progs || progs == &bpf_empty_prog_array.hdr) return; call_rcu_tasks_trace(&progs->rcu, __bpf_prog_array_free_sleepable_cb); } int bpf_prog_array_length(struct bpf_prog_array *array) { struct bpf_prog_array_item *item; u32 cnt = 0; for (item = array->items; item->prog; item++) if (item->prog != &dummy_bpf_prog.prog) cnt++; return cnt; } bool bpf_prog_array_is_empty(struct bpf_prog_array *array) { struct bpf_prog_array_item *item; for (item = array->items; item->prog; item++) if (item->prog != &dummy_bpf_prog.prog) return false; return true; } static bool bpf_prog_array_copy_core(struct bpf_prog_array *array, u32 *prog_ids, u32 request_cnt) { struct bpf_prog_array_item *item; int i = 0; for (item = array->items; item->prog; item++) { if (item->prog == &dummy_bpf_prog.prog) continue; prog_ids[i] = item->prog->aux->id; if (++i == request_cnt) { item++; break; } } return !!(item->prog); } int bpf_prog_array_copy_to_user(struct bpf_prog_array *array, __u32 __user *prog_ids, u32 cnt) { unsigned long err = 0; bool nospc; u32 *ids; /* users of this function are doing: * cnt = bpf_prog_array_length(); * if (cnt > 0) * bpf_prog_array_copy_to_user(..., cnt); * so below kcalloc doesn't need extra cnt > 0 check. */ ids = kcalloc(cnt, sizeof(u32), GFP_USER | __GFP_NOWARN); if (!ids) return -ENOMEM; nospc = bpf_prog_array_copy_core(array, ids, cnt); err = copy_to_user(prog_ids, ids, cnt * sizeof(u32)); kfree(ids); if (err) return -EFAULT; if (nospc) return -ENOSPC; return 0; } void bpf_prog_array_delete_safe(struct bpf_prog_array *array, struct bpf_prog *old_prog) { struct bpf_prog_array_item *item; for (item = array->items; item->prog; item++) if (item->prog == old_prog) { WRITE_ONCE(item->prog, &dummy_bpf_prog.prog); break; } } /** * bpf_prog_array_delete_safe_at() - Replaces the program at the given * index into the program array with * a dummy no-op program. * @array: a bpf_prog_array * @index: the index of the program to replace * * Skips over dummy programs, by not counting them, when calculating * the position of the program to replace. * * Return: * * 0 - Success * * -EINVAL - Invalid index value. Must be a non-negative integer. 
* * -ENOENT - Index out of range */ int bpf_prog_array_delete_safe_at(struct bpf_prog_array *array, int index) { return bpf_prog_array_update_at(array, index, &dummy_bpf_prog.prog); } /** * bpf_prog_array_update_at() - Updates the program at the given index * into the program array. * @array: a bpf_prog_array * @index: the index of the program to update * @prog: the program to insert into the array * * Skips over dummy programs, by not counting them, when calculating * the position of the program to update. * * Return: * * 0 - Success * * -EINVAL - Invalid index value. Must be a non-negative integer. * * -ENOENT - Index out of range */ int bpf_prog_array_update_at(struct bpf_prog_array *array, int index, struct bpf_prog *prog) { struct bpf_prog_array_item *item; if (unlikely(index < 0)) return -EINVAL; for (item = array->items; item->prog; item++) { if (item->prog == &dummy_bpf_prog.prog) continue; if (!index) { WRITE_ONCE(item->prog, prog); return 0; } index--; } return -ENOENT; } int bpf_prog_array_copy(struct bpf_prog_array *old_array, struct bpf_prog *exclude_prog, struct bpf_prog *include_prog, u64 bpf_cookie, struct bpf_prog_array **new_array) { int new_prog_cnt, carry_prog_cnt = 0; struct bpf_prog_array_item *existing, *new; struct bpf_prog_array *array; bool found_exclude = false; /* Figure out how many existing progs we need to carry over to * the new array. */ if (old_array) { existing = old_array->items; for (; existing->prog; existing++) { if (existing->prog == exclude_prog) { found_exclude = true; continue; } if (existing->prog != &dummy_bpf_prog.prog) carry_prog_cnt++; if (existing->prog == include_prog) return -EEXIST; } } if (exclude_prog && !found_exclude) return -ENOENT; /* How many progs (not NULL) will be in the new array? */ new_prog_cnt = carry_prog_cnt; if (include_prog) new_prog_cnt += 1; /* Do we have any prog (not NULL) in the new array? */ if (!new_prog_cnt) { *new_array = NULL; return 0; } /* +1 as the end of prog_array is marked with NULL */ array = bpf_prog_array_alloc(new_prog_cnt + 1, GFP_KERNEL); if (!array) return -ENOMEM; new = array->items; /* Fill in the new prog array */ if (carry_prog_cnt) { existing = old_array->items; for (; existing->prog; existing++) { if (existing->prog == exclude_prog || existing->prog == &dummy_bpf_prog.prog) continue; new->prog = existing->prog; new->bpf_cookie = existing->bpf_cookie; new++; } } if (include_prog) { new->prog = include_prog; new->bpf_cookie = bpf_cookie; new++; } new->prog = NULL; *new_array = array; return 0; } int bpf_prog_array_copy_info(struct bpf_prog_array *array, u32 *prog_ids, u32 request_cnt, u32 *prog_cnt) { u32 cnt = 0; if (array) cnt = bpf_prog_array_length(array); *prog_cnt = cnt; /* return early if user requested only program count or nothing to copy */ if (!request_cnt || !cnt) return 0; /* this function is called under trace/bpf_trace.c: bpf_event_mutex */ return bpf_prog_array_copy_core(array, prog_ids, request_cnt) ? 
-ENOSPC : 0; } void __bpf_free_used_maps(struct bpf_prog_aux *aux, struct bpf_map **used_maps, u32 len) { struct bpf_map *map; bool sleepable; u32 i; sleepable = aux->prog->sleepable; for (i = 0; i < len; i++) { map = used_maps[i]; if (map->ops->map_poke_untrack) map->ops->map_poke_untrack(map, aux); if (sleepable) atomic64_dec(&map->sleepable_refcnt); bpf_map_put(map); } } static void bpf_free_used_maps(struct bpf_prog_aux *aux) { __bpf_free_used_maps(aux, aux->used_maps, aux->used_map_cnt); kfree(aux->used_maps); } void __bpf_free_used_btfs(struct btf_mod_pair *used_btfs, u32 len) { #ifdef CONFIG_BPF_SYSCALL struct btf_mod_pair *btf_mod; u32 i; for (i = 0; i < len; i++) { btf_mod = &used_btfs[i]; if (btf_mod->module) module_put(btf_mod->module); btf_put(btf_mod->btf); } #endif } static void bpf_free_used_btfs(struct bpf_prog_aux *aux) { __bpf_free_used_btfs(aux->used_btfs, aux->used_btf_cnt); kfree(aux->used_btfs); } static void bpf_prog_free_deferred(struct work_struct *work) { struct bpf_prog_aux *aux; int i; aux = container_of(work, struct bpf_prog_aux, work); #ifdef CONFIG_BPF_SYSCALL bpf_free_kfunc_btf_tab(aux->kfunc_btf_tab); #endif #ifdef CONFIG_CGROUP_BPF if (aux->cgroup_atype != CGROUP_BPF_ATTACH_TYPE_INVALID) bpf_cgroup_atype_put(aux->cgroup_atype); #endif bpf_free_used_maps(aux); bpf_free_used_btfs(aux); if (bpf_prog_is_dev_bound(aux)) bpf_prog_dev_bound_destroy(aux->prog); #ifdef CONFIG_PERF_EVENTS if (aux->prog->has_callchain_buf) put_callchain_buffers(); #endif if (aux->dst_trampoline) bpf_trampoline_put(aux->dst_trampoline); for (i = 0; i < aux->real_func_cnt; i++) { /* We can just unlink the subprog poke descriptor table as * it was originally linked to the main program and is also * released along with it. */ aux->func[i]->aux->poke_tab = NULL; bpf_jit_free(aux->func[i]); } if (aux->real_func_cnt) { kfree(aux->func); bpf_prog_unlock_free(aux->prog); } else { bpf_jit_free(aux->prog); } } void bpf_prog_free(struct bpf_prog *fp) { struct bpf_prog_aux *aux = fp->aux; if (aux->dst_prog) bpf_prog_put(aux->dst_prog); bpf_token_put(aux->token); INIT_WORK(&aux->work, bpf_prog_free_deferred); schedule_work(&aux->work); } EXPORT_SYMBOL_GPL(bpf_prog_free); /* RNG for unprivileged user space with separated state from prandom_u32(). */ static DEFINE_PER_CPU(struct rnd_state, bpf_user_rnd_state); void bpf_user_rnd_init_once(void) { prandom_init_once(&bpf_user_rnd_state); } BPF_CALL_0(bpf_user_rnd_u32) { /* Should someone ever have the rather unwise idea to use some * of the registers passed into this function, then note that * this function is called from native eBPF and classic-to-eBPF * transformations. Register assignments from both sides are * different, f.e. classic always sets fn(ctx, A, X) here. */ struct rnd_state *state; u32 res; state = &get_cpu_var(bpf_user_rnd_state); res = prandom_u32_state(state); put_cpu_var(bpf_user_rnd_state); return res; } BPF_CALL_0(bpf_get_raw_cpu_id) { return raw_smp_processor_id(); } /* Weak definitions of helper functions in case we don't have bpf syscall. 
*/ const struct bpf_func_proto bpf_map_lookup_elem_proto __weak; const struct bpf_func_proto bpf_map_update_elem_proto __weak; const struct bpf_func_proto bpf_map_delete_elem_proto __weak; const struct bpf_func_proto bpf_map_push_elem_proto __weak; const struct bpf_func_proto bpf_map_pop_elem_proto __weak; const struct bpf_func_proto bpf_map_peek_elem_proto __weak; const struct bpf_func_proto bpf_map_lookup_percpu_elem_proto __weak; const struct bpf_func_proto bpf_spin_lock_proto __weak; const struct bpf_func_proto bpf_spin_unlock_proto __weak; const struct bpf_func_proto bpf_jiffies64_proto __weak; const struct bpf_func_proto bpf_get_prandom_u32_proto __weak; const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak; const struct bpf_func_proto bpf_get_numa_node_id_proto __weak; const struct bpf_func_proto bpf_ktime_get_ns_proto __weak; const struct bpf_func_proto bpf_ktime_get_boot_ns_proto __weak; const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto __weak; const struct bpf_func_proto bpf_ktime_get_tai_ns_proto __weak; const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak; const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak; const struct bpf_func_proto bpf_get_current_comm_proto __weak; const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak; const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto __weak; const struct bpf_func_proto bpf_get_local_storage_proto __weak; const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto __weak; const struct bpf_func_proto bpf_snprintf_btf_proto __weak; const struct bpf_func_proto bpf_seq_printf_btf_proto __weak; const struct bpf_func_proto bpf_set_retval_proto __weak; const struct bpf_func_proto bpf_get_retval_proto __weak; const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void) { return NULL; } const struct bpf_func_proto * __weak bpf_get_trace_vprintk_proto(void) { return NULL; } u64 __weak bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy) { return -ENOTSUPP; } EXPORT_SYMBOL_GPL(bpf_event_output); /* Always built-in helper functions. */ const struct bpf_func_proto bpf_tail_call_proto = { .func = NULL, .gpl_only = false, .ret_type = RET_VOID, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, }; /* Stub for JITs that only support cBPF. eBPF programs are interpreted. * It is encouraged to implement bpf_int_jit_compile() instead, so that * eBPF and implicitly also cBPF can get JITed! */ struct bpf_prog * __weak bpf_int_jit_compile(struct bpf_prog *prog) { return prog; } /* Stub for JITs that support eBPF. All cBPF code gets transformed into * eBPF by the kernel and is later compiled by bpf_int_jit_compile(). */ void __weak bpf_jit_compile(struct bpf_prog *prog) { } bool __weak bpf_helper_changes_pkt_data(enum bpf_func_id func_id) { return false; } /* Return TRUE if the JIT backend wants verifier to enable sub-register usage * analysis code and wants explicit zero extension inserted by verifier. * Otherwise, return FALSE. * * The verifier inserts an explicit zero extension after BPF_CMPXCHGs even if * you don't override this. JITs that don't want these extra insns can detect * them using insn_is_zext. */ bool __weak bpf_jit_needs_zext(void) { return false; } /* Return true if the JIT inlines the call to the helper corresponding to * the imm. * * The verifier will not patch the insn->imm for the call to the helper if * this returns true. 
*/ bool __weak bpf_jit_inlines_helper_call(s32 imm) { return false; } /* Return TRUE if the JIT backend supports mixing bpf2bpf and tailcalls. */ bool __weak bpf_jit_supports_subprog_tailcalls(void) { return false; } bool __weak bpf_jit_supports_percpu_insn(void) { return false; } bool __weak bpf_jit_supports_kfunc_call(void) { return false; } bool __weak bpf_jit_supports_far_kfunc_call(void) { return false; } bool __weak bpf_jit_supports_arena(void) { return false; } bool __weak bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena) { return false; } u64 __weak bpf_arch_uaddress_limit(void) { #if defined(CONFIG_64BIT) && defined(CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE) return TASK_SIZE; #else return 0; #endif } /* Return TRUE if the JIT backend satisfies the following two conditions: * 1) JIT backend supports atomic_xchg() on pointer-sized words. * 2) Under the specific arch, the implementation of xchg() is the same * as atomic_xchg() on pointer-sized words. */ bool __weak bpf_jit_supports_ptr_xchg(void) { return false; } /* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call * skb_copy_bits(), so provide a weak definition of it for NET-less config. */ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len) { return -EFAULT; } int __weak bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, void *addr1, void *addr2) { return -ENOTSUPP; } void * __weak bpf_arch_text_copy(void *dst, void *src, size_t len) { return ERR_PTR(-ENOTSUPP); } int __weak bpf_arch_text_invalidate(void *dst, size_t len) { return -ENOTSUPP; } bool __weak bpf_jit_supports_exceptions(void) { return false; } bool __weak bpf_jit_supports_private_stack(void) { return false; } void __weak arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie) { } /* for configs without MMU or 32-bit */ __weak const struct bpf_map_ops arena_map_ops; __weak u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena) { return 0; } __weak u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena) { return 0; } #ifdef CONFIG_BPF_SYSCALL static int __init bpf_global_ma_init(void) { int ret; ret = bpf_mem_alloc_init(&bpf_global_ma, 0, false); bpf_global_ma_set = !ret; return ret; } late_initcall(bpf_global_ma_init); #endif DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key); EXPORT_SYMBOL(bpf_stats_enabled_key); /* All definitions of tracepoints related to BPF. */ #define CREATE_TRACE_POINTS #include <linux/bpf_trace.h> EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception); EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_bulk_tx);
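/*
 * Illustrative sketch, not kernel code: the EVAL6()/EVAL4() lists above
 * generate one fixed-stack interpreter per 32-byte step up to 512 bytes,
 * and bpf_prog_select_func() / bpf_patch_call_args() pick a slot with
 * round_up(stack_depth, 32) / 32 - 1. The stand-alone program below uses a
 * hypothetical interp_slot() helper to mirror that index calculation so the
 * stack-depth-to-variant mapping can be checked in user space.
 */
#include <stdio.h>

static unsigned int interp_slot(unsigned int stack_depth)
{
	if (stack_depth == 0)
		stack_depth = 1;			/* max_t(u32, stack_depth, 1) */
	return ((stack_depth + 31) / 32) - 1;		/* round_up(stack_depth, 32) / 32 - 1 */
}

int main(void)
{
	unsigned int depths[] = { 0, 1, 32, 33, 64, 512 };
	unsigned int i;

	for (i = 0; i < sizeof(depths) / sizeof(depths[0]); i++)
		printf("stack_depth=%3u -> __bpf_prog_run%u()\n",
		       depths[i], (interp_slot(depths[i]) + 1) * 32);
	return 0;
}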
// SPDX-License-Identifier: GPL-2.0 // Copyright (c) 2010-2011 EIA Electronics, // Pieter Beyens <pieter.beyens@eia.be> // Copyright (c) 2010-2011 EIA Electronics, // Kurt Van Dijck <kurt.van.dijck@eia.be> // Copyright (c) 2018 Protonic, // Robin van der Gracht <robin@protonic.nl> // Copyright (c) 2017-2019 Pengutronix, // Marc Kleine-Budde <kernel@pengutronix.de> // Copyright (c) 2017-2019 Pengutronix, // Oleksij Rempel <kernel@pengutronix.de> /* Core of can-j1939 that links j1939 to CAN. */ #include <linux/can/can-ml.h> #include <linux/can/core.h> #include <linux/can/skb.h> #include <linux/if_arp.h> #include <linux/module.h> #include "j1939-priv.h" MODULE_DESCRIPTION("PF_CAN SAE J1939"); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("EIA Electronics (Kurt Van Dijck & Pieter Beyens)"); MODULE_ALIAS("can-proto-" __stringify(CAN_J1939)); /* LOWLEVEL CAN interface */ /* CAN_HDR: #bytes before can_frame data part */ #define J1939_CAN_HDR (offsetof(struct can_frame, data)) /* lowest layer */ static void j1939_can_recv(struct sk_buff *iskb, void *data) { struct j1939_priv *priv = data; struct sk_buff *skb; struct j1939_sk_buff_cb *skcb, *iskcb; struct can_frame *cf; /* make sure we only get Classical CAN frames */ if (!can_is_can_skb(iskb)) return; /* create a copy of the skb * j1939 only delivers the real data bytes, * the header goes into sockaddr.
* j1939 may not touch the incoming skb in such way */ skb = skb_clone(iskb, GFP_ATOMIC); if (!skb) return; j1939_priv_get(priv); can_skb_set_owner(skb, iskb->sk); /* get a pointer to the header of the skb * the skb payload (pointer) is moved, so that the next skb_data * returns the actual payload */ cf = (void *)skb->data; skb_pull(skb, J1939_CAN_HDR); /* fix length, set to dlc, with 8 maximum */ skb_trim(skb, min_t(uint8_t, cf->len, 8)); /* set addr */ skcb = j1939_skb_to_cb(skb); memset(skcb, 0, sizeof(*skcb)); iskcb = j1939_skb_to_cb(iskb); skcb->tskey = iskcb->tskey; skcb->priority = (cf->can_id >> 26) & 0x7; skcb->addr.sa = cf->can_id; skcb->addr.pgn = (cf->can_id >> 8) & J1939_PGN_MAX; /* set default message type */ skcb->addr.type = J1939_TP; if (!j1939_address_is_valid(skcb->addr.sa)) { netdev_err_once(priv->ndev, "%s: sa is broadcast address, ignoring!\n", __func__); goto done; } if (j1939_pgn_is_pdu1(skcb->addr.pgn)) { /* Type 1: with destination address */ skcb->addr.da = skcb->addr.pgn; /* normalize pgn: strip dst address */ skcb->addr.pgn &= 0x3ff00; } else { /* set broadcast address */ skcb->addr.da = J1939_NO_ADDR; } /* update localflags */ read_lock_bh(&priv->lock); if (j1939_address_is_unicast(skcb->addr.sa) && priv->ents[skcb->addr.sa].nusers) skcb->flags |= J1939_ECU_LOCAL_SRC; if (j1939_address_is_unicast(skcb->addr.da) && priv->ents[skcb->addr.da].nusers) skcb->flags |= J1939_ECU_LOCAL_DST; read_unlock_bh(&priv->lock); /* deliver into the j1939 stack ... */ j1939_ac_recv(priv, skb); if (j1939_tp_recv(priv, skb)) /* this means the transport layer processed the message */ goto done; j1939_simple_recv(priv, skb); j1939_sk_recv(priv, skb); done: j1939_priv_put(priv); kfree_skb(skb); } /* NETDEV MANAGEMENT */ /* values for can_rx_(un)register */ #define J1939_CAN_ID CAN_EFF_FLAG #define J1939_CAN_MASK (CAN_EFF_FLAG | CAN_RTR_FLAG) static DEFINE_MUTEX(j1939_netdev_lock); static struct j1939_priv *j1939_priv_create(struct net_device *ndev) { struct j1939_priv *priv; priv = kzalloc(sizeof(*priv), GFP_KERNEL); if (!priv) return NULL; rwlock_init(&priv->lock); INIT_LIST_HEAD(&priv->ecus); priv->ndev = ndev; kref_init(&priv->kref); kref_init(&priv->rx_kref); dev_hold(ndev); netdev_dbg(priv->ndev, "%s : 0x%p\n", __func__, priv); return priv; } static inline void j1939_priv_set(struct net_device *ndev, struct j1939_priv *priv) { struct can_ml_priv *can_ml = can_get_ml_priv(ndev); can_ml->j1939_priv = priv; } static void __j1939_priv_release(struct kref *kref) { struct j1939_priv *priv = container_of(kref, struct j1939_priv, kref); struct net_device *ndev = priv->ndev; netdev_dbg(priv->ndev, "%s: 0x%p\n", __func__, priv); WARN_ON_ONCE(!list_empty(&priv->active_session_list)); WARN_ON_ONCE(!list_empty(&priv->ecus)); WARN_ON_ONCE(!list_empty(&priv->j1939_socks)); dev_put(ndev); kfree(priv); } void j1939_priv_put(struct j1939_priv *priv) { kref_put(&priv->kref, __j1939_priv_release); } void j1939_priv_get(struct j1939_priv *priv) { kref_get(&priv->kref); } static int j1939_can_rx_register(struct j1939_priv *priv) { struct net_device *ndev = priv->ndev; int ret; j1939_priv_get(priv); ret = can_rx_register(dev_net(ndev), ndev, J1939_CAN_ID, J1939_CAN_MASK, j1939_can_recv, priv, "j1939", NULL); if (ret < 0) { j1939_priv_put(priv); return ret; } return 0; } static void j1939_can_rx_unregister(struct j1939_priv *priv) { struct net_device *ndev = priv->ndev; can_rx_unregister(dev_net(ndev), ndev, J1939_CAN_ID, J1939_CAN_MASK, j1939_can_recv, priv); /* The last reference of priv is dropped by 
the RCU deferred * j1939_sk_sock_destruct() of the last socket, so we can * safely drop this reference here. */ j1939_priv_put(priv); } static void __j1939_rx_release(struct kref *kref) __releases(&j1939_netdev_lock) { struct j1939_priv *priv = container_of(kref, struct j1939_priv, rx_kref); j1939_can_rx_unregister(priv); j1939_ecu_unmap_all(priv); j1939_priv_set(priv->ndev, NULL); mutex_unlock(&j1939_netdev_lock); } /* get pointer to priv without increasing ref counter */ static inline struct j1939_priv *j1939_ndev_to_priv(struct net_device *ndev) { struct can_ml_priv *can_ml = can_get_ml_priv(ndev); return can_ml->j1939_priv; } static struct j1939_priv *j1939_priv_get_by_ndev_locked(struct net_device *ndev) { struct j1939_priv *priv; lockdep_assert_held(&j1939_netdev_lock); priv = j1939_ndev_to_priv(ndev); if (priv) j1939_priv_get(priv); return priv; } static struct j1939_priv *j1939_priv_get_by_ndev(struct net_device *ndev) { struct j1939_priv *priv; mutex_lock(&j1939_netdev_lock); priv = j1939_priv_get_by_ndev_locked(ndev); mutex_unlock(&j1939_netdev_lock); return priv; } struct j1939_priv *j1939_netdev_start(struct net_device *ndev) { struct j1939_priv *priv, *priv_new; int ret; mutex_lock(&j1939_netdev_lock); priv = j1939_priv_get_by_ndev_locked(ndev); if (priv) { kref_get(&priv->rx_kref); mutex_unlock(&j1939_netdev_lock); return priv; } mutex_unlock(&j1939_netdev_lock); priv = j1939_priv_create(ndev); if (!priv) return ERR_PTR(-ENOMEM); j1939_tp_init(priv); rwlock_init(&priv->j1939_socks_lock); INIT_LIST_HEAD(&priv->j1939_socks); mutex_lock(&j1939_netdev_lock); priv_new = j1939_priv_get_by_ndev_locked(ndev); if (priv_new) { /* Someone was faster than us, use their priv and roll * back our's. */ kref_get(&priv_new->rx_kref); mutex_unlock(&j1939_netdev_lock); dev_put(ndev); kfree(priv); return priv_new; } j1939_priv_set(ndev, priv); ret = j1939_can_rx_register(priv); if (ret < 0) goto out_priv_put; mutex_unlock(&j1939_netdev_lock); return priv; out_priv_put: j1939_priv_set(ndev, NULL); mutex_unlock(&j1939_netdev_lock); dev_put(ndev); kfree(priv); return ERR_PTR(ret); } void j1939_netdev_stop(struct j1939_priv *priv) { kref_put_mutex(&priv->rx_kref, __j1939_rx_release, &j1939_netdev_lock); j1939_priv_put(priv); } int j1939_send_one(struct j1939_priv *priv, struct sk_buff *skb) { int ret, dlc; canid_t canid; struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb); struct can_frame *cf; /* apply sanity checks */ if (j1939_pgn_is_pdu1(skcb->addr.pgn)) skcb->addr.pgn &= J1939_PGN_PDU1_MAX; else skcb->addr.pgn &= J1939_PGN_MAX; if (skcb->priority > 7) skcb->priority = 6; ret = j1939_ac_fixup(priv, skb); if (unlikely(ret)) goto failed; dlc = skb->len; /* re-claim the CAN_HDR from the SKB */ cf = skb_push(skb, J1939_CAN_HDR); /* initialize header structure */ memset(cf, 0, J1939_CAN_HDR); /* make it a full can frame again */ skb_put_zero(skb, 8 - dlc); canid = CAN_EFF_FLAG | (skcb->priority << 26) | (skcb->addr.pgn << 8) | skcb->addr.sa; if (j1939_pgn_is_pdu1(skcb->addr.pgn)) canid |= skcb->addr.da << 8; cf->can_id = canid; cf->len = dlc; return can_send(skb, 1); failed: kfree_skb(skb); return ret; } static int j1939_netdev_notify(struct notifier_block *nb, unsigned long msg, void *data) { struct net_device *ndev = netdev_notifier_info_to_dev(data); struct can_ml_priv *can_ml = can_get_ml_priv(ndev); struct j1939_priv *priv; if (!can_ml) goto notify_done; priv = j1939_priv_get_by_ndev(ndev); if (!priv) goto notify_done; switch (msg) { case NETDEV_DOWN: j1939_cancel_active_session(priv, NULL); 
j1939_sk_netdev_event_netdown(priv); j1939_ecu_unmap_all(priv); break; } j1939_priv_put(priv); notify_done: return NOTIFY_DONE; } static struct notifier_block j1939_netdev_notifier = { .notifier_call = j1939_netdev_notify, }; /* MODULE interface */ static __init int j1939_module_init(void) { int ret; pr_info("can: SAE J1939\n"); ret = register_netdevice_notifier(&j1939_netdev_notifier); if (ret) goto fail_notifier; ret = can_proto_register(&j1939_can_proto); if (ret < 0) { pr_err("can: registration of j1939 protocol failed\n"); goto fail_sk; } return 0; fail_sk: unregister_netdevice_notifier(&j1939_netdev_notifier); fail_notifier: return ret; } static __exit void j1939_module_exit(void) { can_proto_unregister(&j1939_can_proto); unregister_netdevice_notifier(&j1939_netdev_notifier); } module_init(j1939_module_init); module_exit(j1939_module_exit);
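/*
 * Illustrative sketch, not kernel code: j1939_can_recv() and j1939_send_one()
 * above split a 29-bit extended CAN identifier into priority, PGN and source
 * address, and for PDU1-format PGNs treat the PS field as a destination
 * address. The stand-alone program below repeats that decomposition with a
 * local pgn_is_pdu1() helper and locally defined constants (assumed to match
 * the values used above); the example identifier is arbitrary.
 */
#include <stdint.h>
#include <stdio.h>

#define PGN_MAX		0x3ffffu	/* 18-bit PGN field */
#define PGN_PDU1_MAX	0x3ff00u	/* PGN with the PS/DA byte stripped */

static int pgn_is_pdu1(uint32_t pgn)
{
	/* PDU1 format: PF field (upper byte of the PGN) below 240 */
	return (pgn & 0xff00) < 0xf000;
}

int main(void)
{
	uint32_t can_id = 0x18eeff25;	/* e.g. Address Claimed, SA 0x25 */
	uint32_t prio = (can_id >> 26) & 0x7;
	uint32_t pgn = (can_id >> 8) & PGN_MAX;
	uint8_t sa = can_id & 0xff;
	uint8_t da = 0xff;		/* broadcast, like J1939_NO_ADDR */

	if (pgn_is_pdu1(pgn)) {
		da = pgn & 0xff;	/* destination lives in the PS field */
		pgn &= PGN_PDU1_MAX;	/* normalize: strip the destination */
	}
	printf("prio=%u pgn=0x%05x sa=0x%02x da=0x%02x\n", prio, pgn, sa, da);
	return 0;
}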
/* SPDX-License-Identifier: GPL-2.0 */ /* * Declarations of AX.25 type objects. * * Alan Cox (GW4PTS) 10/11/93 */ #ifndef _AX25_H #define _AX25_H #include <linux/ax25.h> #include <linux/spinlock.h> #include <linux/timer.h> #include <linux/list.h> #include <linux/slab.h> #include <linux/refcount.h> #include <net/neighbour.h> #include <net/sock.h> #include <linux/seq_file.h> #define AX25_T1CLAMPLO 1 #define AX25_T1CLAMPHI (30 * HZ) #define AX25_BPQ_HEADER_LEN 16 #define AX25_KISS_HEADER_LEN 1 #define AX25_HEADER_LEN 17 #define AX25_ADDR_LEN 7 #define AX25_DIGI_HEADER_LEN (AX25_MAX_DIGIS * AX25_ADDR_LEN) #define AX25_MAX_HEADER_LEN (AX25_HEADER_LEN + AX25_DIGI_HEADER_LEN) /* AX.25 Protocol IDs */ #define AX25_P_ROSE 0x01 #define AX25_P_VJCOMP 0x06 /* Compressed TCP/IP packet */ /* Van Jacobson (RFC 1144) */ #define AX25_P_VJUNCOMP 0x07 /* Uncompressed TCP/IP packet */ /* Van Jacobson (RFC 1144) */ #define AX25_P_SEGMENT 0x08 /* Segmentation fragment */ #define AX25_P_TEXNET 0xc3 /* TEXTNET datagram protocol */ #define AX25_P_LQ 0xc4 /* Link Quality Protocol */ #define AX25_P_ATALK 0xca /* Appletalk */ #define AX25_P_ATALK_ARP 0xcb /* Appletalk ARP */ #define AX25_P_IP 0xcc /* ARPA Internet Protocol */ #define AX25_P_ARP 0xcd /* ARPA Address Resolution */ #define AX25_P_FLEXNET 0xce /* FlexNet */ #define AX25_P_NETROM 0xcf /* NET/ROM */ #define AX25_P_TEXT 0xF0 /* No layer 3 protocol impl.
*/ /* AX.25 Segment control values */ #define AX25_SEG_REM 0x7F #define AX25_SEG_FIRST 0x80 #define AX25_CBIT 0x80 /* Command/Response bit */ #define AX25_EBIT 0x01 /* HDLC Address Extension bit */ #define AX25_HBIT 0x80 /* Has been repeated bit */ #define AX25_SSSID_SPARE 0x60 /* Unused bits in SSID for standard AX.25 */ #define AX25_ESSID_SPARE 0x20 /* Unused bits in SSID for extended AX.25 */ #define AX25_DAMA_FLAG 0x20 /* Well, it is *NOT* unused! (dl1bke 951121 */ #define AX25_COND_ACK_PENDING 0x01 #define AX25_COND_REJECT 0x02 #define AX25_COND_PEER_RX_BUSY 0x04 #define AX25_COND_OWN_RX_BUSY 0x08 #define AX25_COND_DAMA_MODE 0x10 #ifndef _LINUX_NETDEVICE_H #include <linux/netdevice.h> #endif /* Upper sub-layer (LAPB) definitions */ /* Control field templates */ #define AX25_I 0x00 /* Information frames */ #define AX25_S 0x01 /* Supervisory frames */ #define AX25_RR 0x01 /* Receiver ready */ #define AX25_RNR 0x05 /* Receiver not ready */ #define AX25_REJ 0x09 /* Reject */ #define AX25_U 0x03 /* Unnumbered frames */ #define AX25_SABM 0x2f /* Set Asynchronous Balanced Mode */ #define AX25_SABME 0x6f /* Set Asynchronous Balanced Mode Extended */ #define AX25_DISC 0x43 /* Disconnect */ #define AX25_DM 0x0f /* Disconnected mode */ #define AX25_UA 0x63 /* Unnumbered acknowledge */ #define AX25_FRMR 0x87 /* Frame reject */ #define AX25_UI 0x03 /* Unnumbered information */ #define AX25_XID 0xaf /* Exchange information */ #define AX25_TEST 0xe3 /* Test */ #define AX25_PF 0x10 /* Poll/final bit for standard AX.25 */ #define AX25_EPF 0x01 /* Poll/final bit for extended AX.25 */ #define AX25_ILLEGAL 0x100 /* Impossible to be a real frame type */ #define AX25_POLLOFF 0 #define AX25_POLLON 1 /* AX25 L2 C-bit */ #define AX25_COMMAND 1 #define AX25_RESPONSE 2 /* Define Link State constants. 
*/ enum { AX25_STATE_0, /* Listening */ AX25_STATE_1, /* SABM sent */ AX25_STATE_2, /* DISC sent */ AX25_STATE_3, /* Established */ AX25_STATE_4 /* Recovery */ }; #define AX25_MODULUS 8 /* Standard AX.25 modulus */ #define AX25_EMODULUS 128 /* Extended AX.25 modulus */ enum { AX25_PROTO_STD_SIMPLEX, AX25_PROTO_STD_DUPLEX, #ifdef CONFIG_AX25_DAMA_SLAVE AX25_PROTO_DAMA_SLAVE, #ifdef CONFIG_AX25_DAMA_MASTER AX25_PROTO_DAMA_MASTER, #define AX25_PROTO_MAX AX25_PROTO_DAMA_MASTER #endif #endif __AX25_PROTO_MAX, AX25_PROTO_MAX = __AX25_PROTO_MAX -1 }; enum { AX25_VALUES_IPDEFMODE, /* 0=DG 1=VC */ AX25_VALUES_AXDEFMODE, /* 0=Normal 1=Extended Seq Nos */ AX25_VALUES_BACKOFF, /* 0=None 1=Linear 2=Exponential */ AX25_VALUES_CONMODE, /* Allow connected modes - 0=No 1=no "PID text" 2=all PIDs */ AX25_VALUES_WINDOW, /* Default window size for standard AX.25 */ AX25_VALUES_EWINDOW, /* Default window size for extended AX.25 */ AX25_VALUES_T1, /* Default T1 timeout value */ AX25_VALUES_T2, /* Default T2 timeout value */ AX25_VALUES_T3, /* Default T3 timeout value */ AX25_VALUES_IDLE, /* Connected mode idle timer */ AX25_VALUES_N2, /* Default N2 value */ AX25_VALUES_PACLEN, /* AX.25 MTU */ AX25_VALUES_PROTOCOL, /* Std AX.25, DAMA Slave, DAMA Master */ #ifdef CONFIG_AX25_DAMA_SLAVE AX25_VALUES_DS_TIMEOUT, /* DAMA Slave timeout */ #endif AX25_MAX_VALUES /* THIS MUST REMAIN THE LAST ENTRY OF THIS LIST */ }; #define AX25_DEF_IPDEFMODE 0 /* Datagram */ #define AX25_DEF_AXDEFMODE 0 /* Normal */ #define AX25_DEF_BACKOFF 1 /* Linear backoff */ #define AX25_DEF_CONMODE 2 /* Connected mode allowed */ #define AX25_DEF_WINDOW 2 /* Window=2 */ #define AX25_DEF_EWINDOW 32 /* Module-128 Window=32 */ #define AX25_DEF_T1 10000 /* T1=10s */ #define AX25_DEF_T2 3000 /* T2=3s */ #define AX25_DEF_T3 300000 /* T3=300s */ #define AX25_DEF_N2 10 /* N2=10 */ #define AX25_DEF_IDLE 0 /* Idle=None */ #define AX25_DEF_PACLEN 256 /* Paclen=256 */ #define AX25_DEF_PROTOCOL AX25_PROTO_STD_SIMPLEX /* Standard AX.25 */ #define AX25_DEF_DS_TIMEOUT 180000 /* DAMA timeout 3 minutes */ typedef struct ax25_uid_assoc { struct hlist_node uid_node; refcount_t refcount; kuid_t uid; ax25_address call; } ax25_uid_assoc; #define ax25_uid_for_each(__ax25, list) \ hlist_for_each_entry(__ax25, list, uid_node) #define ax25_uid_hold(ax25) \ refcount_inc(&((ax25)->refcount)) static inline void ax25_uid_put(ax25_uid_assoc *assoc) { if (refcount_dec_and_test(&assoc->refcount)) { kfree(assoc); } } typedef struct { ax25_address calls[AX25_MAX_DIGIS]; unsigned char repeated[AX25_MAX_DIGIS]; unsigned char ndigi; signed char lastrepeat; } ax25_digi; typedef struct ax25_route { struct ax25_route *next; ax25_address callsign; struct net_device *dev; ax25_digi *digipeat; char ip_mode; } ax25_route; void __ax25_put_route(ax25_route *ax25_rt); extern rwlock_t ax25_route_lock; static inline void ax25_route_lock_use(void) { read_lock(&ax25_route_lock); } static inline void ax25_route_lock_unuse(void) { read_unlock(&ax25_route_lock); } typedef struct { char slave; /* slave_mode? */ struct timer_list slave_timer; /* timeout timer */ unsigned short slave_timeout; /* when? 
*/ } ax25_dama_info; struct ctl_table; typedef struct ax25_dev { struct list_head list; struct net_device *dev; netdevice_tracker dev_tracker; struct net_device *forward; struct ctl_table_header *sysheader; int values[AX25_MAX_VALUES]; #if defined(CONFIG_AX25_DAMA_SLAVE) || defined(CONFIG_AX25_DAMA_MASTER) ax25_dama_info dama; #endif refcount_t refcount; bool device_up; } ax25_dev; typedef struct ax25_cb { struct hlist_node ax25_node; ax25_address source_addr, dest_addr; ax25_digi *digipeat; ax25_dev *ax25_dev; netdevice_tracker dev_tracker; unsigned char iamdigi; unsigned char state, modulus, pidincl; unsigned short vs, vr, va; unsigned char condition, backoff; unsigned char n2, n2count; struct timer_list t1timer, t2timer, t3timer, idletimer; unsigned long t1, t2, t3, idle, rtt; unsigned short paclen, fragno, fraglen; struct sk_buff_head write_queue; struct sk_buff_head reseq_queue; struct sk_buff_head ack_queue; struct sk_buff_head frag_queue; unsigned char window; struct timer_list timer, dtimer; struct sock *sk; /* Backlink to socket */ refcount_t refcount; } ax25_cb; struct ax25_sock { struct sock sk; struct ax25_cb *cb; }; #define ax25_sk(ptr) container_of_const(ptr, struct ax25_sock, sk) static inline struct ax25_cb *sk_to_ax25(const struct sock *sk) { return ax25_sk(sk)->cb; } #define ax25_for_each(__ax25, list) \ hlist_for_each_entry(__ax25, list, ax25_node) #define ax25_cb_hold(__ax25) \ refcount_inc(&((__ax25)->refcount)) static __inline__ void ax25_cb_put(ax25_cb *ax25) { if (refcount_dec_and_test(&ax25->refcount)) { kfree(ax25->digipeat); kfree(ax25); } } static inline void ax25_dev_hold(ax25_dev *ax25_dev) { refcount_inc(&ax25_dev->refcount); } static inline void ax25_dev_put(ax25_dev *ax25_dev) { if (refcount_dec_and_test(&ax25_dev->refcount)) { kfree(ax25_dev); } } static inline __be16 ax25_type_trans(struct sk_buff *skb, struct net_device *dev) { skb->dev = dev; skb_reset_mac_header(skb); skb->pkt_type = PACKET_HOST; return htons(ETH_P_AX25); } /* af_ax25.c */ extern struct hlist_head ax25_list; extern spinlock_t ax25_list_lock; void ax25_cb_add(ax25_cb *); struct sock *ax25_find_listener(ax25_address *, int, struct net_device *, int); struct sock *ax25_get_socket(ax25_address *, ax25_address *, int); ax25_cb *ax25_find_cb(const ax25_address *, ax25_address *, ax25_digi *, struct net_device *); void ax25_send_to_raw(ax25_address *, struct sk_buff *, int); void ax25_destroy_socket(ax25_cb *); ax25_cb * __must_check ax25_create_cb(void); void ax25_fillin_cb(ax25_cb *, ax25_dev *); struct sock *ax25_make_new(struct sock *, struct ax25_dev *); /* ax25_addr.c */ extern const ax25_address ax25_bcast; extern const ax25_address ax25_defaddr; extern const ax25_address null_ax25_address; char *ax2asc(char *buf, const ax25_address *); void asc2ax(ax25_address *addr, const char *callsign); int ax25cmp(const ax25_address *, const ax25_address *); int ax25digicmp(const ax25_digi *, const ax25_digi *); const unsigned char *ax25_addr_parse(const unsigned char *, int, ax25_address *, ax25_address *, ax25_digi *, int *, int *); int ax25_addr_build(unsigned char *, const ax25_address *, const ax25_address *, const ax25_digi *, int, int); int ax25_addr_size(const ax25_digi *); void ax25_digi_invert(const ax25_digi *, ax25_digi *); /* ax25_dev.c */ extern spinlock_t ax25_dev_lock; #if IS_ENABLED(CONFIG_AX25) static inline ax25_dev *ax25_dev_ax25dev(struct net_device *dev) { return dev->ax25_ptr; } #endif ax25_dev *ax25_addr_ax25dev(ax25_address *); void ax25_dev_device_up(struct net_device *); 
void ax25_dev_device_down(struct net_device *); int ax25_fwd_ioctl(unsigned int, struct ax25_fwd_struct *); struct net_device *ax25_fwd_dev(struct net_device *); void ax25_dev_free(void); /* ax25_ds_in.c */ int ax25_ds_frame_in(ax25_cb *, struct sk_buff *, int); /* ax25_ds_subr.c */ void ax25_ds_nr_error_recovery(ax25_cb *); void ax25_ds_enquiry_response(ax25_cb *); void ax25_ds_establish_data_link(ax25_cb *); void ax25_dev_dama_off(ax25_dev *); void ax25_dama_on(ax25_cb *); void ax25_dama_off(ax25_cb *); /* ax25_ds_timer.c */ void ax25_ds_setup_timer(ax25_dev *); void ax25_ds_set_timer(ax25_dev *); void ax25_ds_del_timer(ax25_dev *); void ax25_ds_timer(ax25_cb *); void ax25_ds_t1_timeout(ax25_cb *); void ax25_ds_heartbeat_expiry(ax25_cb *); void ax25_ds_t3timer_expiry(ax25_cb *); void ax25_ds_idletimer_expiry(ax25_cb *); /* ax25_iface.c */ struct ax25_protocol { struct ax25_protocol *next; unsigned int pid; int (*func)(struct sk_buff *, ax25_cb *); }; void ax25_register_pid(struct ax25_protocol *ap); void ax25_protocol_release(unsigned int); struct ax25_linkfail { struct hlist_node lf_node; void (*func)(ax25_cb *, int); }; void ax25_linkfail_register(struct ax25_linkfail *lf); void ax25_linkfail_release(struct ax25_linkfail *lf); int __must_check ax25_listen_register(const ax25_address *, struct net_device *); void ax25_listen_release(const ax25_address *, struct net_device *); int(*ax25_protocol_function(unsigned int))(struct sk_buff *, ax25_cb *); int ax25_listen_mine(const ax25_address *, struct net_device *); void ax25_link_failed(ax25_cb *, int); int ax25_protocol_is_registered(unsigned int); /* ax25_in.c */ int ax25_rx_iframe(ax25_cb *, struct sk_buff *); int ax25_kiss_rcv(struct sk_buff *, struct net_device *, struct packet_type *, struct net_device *); /* ax25_ip.c */ netdev_tx_t ax25_ip_xmit(struct sk_buff *skb); extern const struct header_ops ax25_header_ops; /* ax25_out.c */ ax25_cb *ax25_send_frame(struct sk_buff *, int, const ax25_address *, ax25_address *, ax25_digi *, struct net_device *); void ax25_output(ax25_cb *, int, struct sk_buff *); void ax25_kick(ax25_cb *); void ax25_transmit_buffer(ax25_cb *, struct sk_buff *, int); void ax25_queue_xmit(struct sk_buff *skb, struct net_device *dev); int ax25_check_iframes_acked(ax25_cb *, unsigned short); /* ax25_route.c */ void ax25_rt_device_down(struct net_device *); int ax25_rt_ioctl(unsigned int, void __user *); extern const struct seq_operations ax25_rt_seqops; ax25_route *ax25_get_route(ax25_address *addr, struct net_device *dev); int ax25_rt_autobind(ax25_cb *, ax25_address *); struct sk_buff *ax25_rt_build_path(struct sk_buff *, ax25_address *, ax25_address *, ax25_digi *); void ax25_rt_free(void); /* ax25_std_in.c */ int ax25_std_frame_in(ax25_cb *, struct sk_buff *, int); /* ax25_std_subr.c */ void ax25_std_nr_error_recovery(ax25_cb *); void ax25_std_establish_data_link(ax25_cb *); void ax25_std_transmit_enquiry(ax25_cb *); void ax25_std_enquiry_response(ax25_cb *); void ax25_std_timeout_response(ax25_cb *); /* ax25_std_timer.c */ void ax25_std_heartbeat_expiry(ax25_cb *); void ax25_std_t1timer_expiry(ax25_cb *); void ax25_std_t2timer_expiry(ax25_cb *); void ax25_std_t3timer_expiry(ax25_cb *); void ax25_std_idletimer_expiry(ax25_cb *); /* ax25_subr.c */ void ax25_clear_queues(ax25_cb *); void ax25_frames_acked(ax25_cb *, unsigned short); void ax25_requeue_frames(ax25_cb *); int ax25_validate_nr(ax25_cb *, unsigned short); int ax25_decode(ax25_cb *, struct sk_buff *, int *, int *, int *); void ax25_send_control(ax25_cb 
*, int, int, int); void ax25_return_dm(struct net_device *, ax25_address *, ax25_address *, ax25_digi *); void ax25_calculate_t1(ax25_cb *); void ax25_calculate_rtt(ax25_cb *); void ax25_disconnect(ax25_cb *, int); /* ax25_timer.c */ void ax25_setup_timers(ax25_cb *); void ax25_start_heartbeat(ax25_cb *); void ax25_start_t1timer(ax25_cb *); void ax25_start_t2timer(ax25_cb *); void ax25_start_t3timer(ax25_cb *); void ax25_start_idletimer(ax25_cb *); void ax25_stop_heartbeat(ax25_cb *); void ax25_stop_t1timer(ax25_cb *); void ax25_stop_t2timer(ax25_cb *); void ax25_stop_t3timer(ax25_cb *); void ax25_stop_idletimer(ax25_cb *); int ax25_t1timer_running(ax25_cb *); unsigned long ax25_display_timer(struct timer_list *); /* ax25_uid.c */ extern int ax25_uid_policy; ax25_uid_assoc *ax25_findbyuid(kuid_t); int __must_check ax25_uid_ioctl(int, struct sockaddr_ax25 *); extern const struct seq_operations ax25_uid_seqops; void ax25_uid_free(void); /* sysctl_net_ax25.c */ #ifdef CONFIG_SYSCTL int ax25_register_dev_sysctl(ax25_dev *ax25_dev); void ax25_unregister_dev_sysctl(ax25_dev *ax25_dev); #else static inline int ax25_register_dev_sysctl(ax25_dev *ax25_dev) { return 0; } static inline void ax25_unregister_dev_sysctl(ax25_dev *ax25_dev) {} #endif /* CONFIG_SYSCTL */ #endif
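/*
 * Illustrative sketch, not kernel code: ax2asc()/asc2ax() above convert
 * between printable callsigns and the 7-byte AX.25 address format (six ASCII
 * characters each shifted left by one bit, space padded, followed by an SSID
 * byte whose spare bits correspond to AX25_SSSID_SPARE). pack_call() below is
 * a hypothetical helper, not the kernel's implementation, and leaves the
 * command/response and extension bits (AX25_CBIT/AX25_EBIT) clear.
 */
#include <stdio.h>
#include <string.h>

#define ADDR_LEN	7
#define SSID_SPARE	0x60	/* assumed to match AX25_SSSID_SPARE above */

static void pack_call(unsigned char out[ADDR_LEN], const char *call, int ssid)
{
	size_t len = strlen(call);
	int i;

	for (i = 0; i < ADDR_LEN - 1; i++)
		out[i] = (unsigned char)(((size_t)i < len ? call[i] : ' ') << 1);
	out[ADDR_LEN - 1] = (unsigned char)(((ssid & 0x0f) << 1) | SSID_SPARE);
}

int main(void)
{
	unsigned char buf[ADDR_LEN];
	int i;

	pack_call(buf, "GW4PTS", 0);	/* callsign taken from the header comment above */
	for (i = 0; i < ADDR_LEN; i++)
		printf("%02x%c", buf[i], i == ADDR_LEN - 1 ? '\n' : ' ');
	return 0;
}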
// SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * (C) Copyright IBM Corp. 2001, 2004 * Copyright (c) 1999-2000 Cisco, Inc. * Copyright (c) 1999-2001 Motorola, Inc. * * This file is part of the SCTP kernel implementation * * These functions handle output processing. * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * La Monte H.P. Yarroll <piggy@acm.org> * Karl Knutson <karl@athena.chicago.il.us> * Jon Grimm <jgrimm@austin.ibm.com> * Sridhar Samudrala <sri@us.ibm.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/types.h> #include <linux/kernel.h> #include <linux/wait.h> #include <linux/time.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/init.h> #include <linux/slab.h> #include <net/inet_ecn.h> #include <net/ip.h> #include <net/icmp.h> #include <net/net_namespace.h> #include <linux/socket.h> /* for sa_family_t */ #include <net/sock.h> #include <net/sctp/sctp.h> #include <net/sctp/sm.h> #include <net/sctp/checksum.h> /* Forward declarations for private helpers. */ static enum sctp_xmit __sctp_packet_append_chunk(struct sctp_packet *packet, struct sctp_chunk *chunk); static enum sctp_xmit sctp_packet_can_append_data(struct sctp_packet *packet, struct sctp_chunk *chunk); static void sctp_packet_append_data(struct sctp_packet *packet, struct sctp_chunk *chunk); static enum sctp_xmit sctp_packet_will_fit(struct sctp_packet *packet, struct sctp_chunk *chunk, u16 chunk_len); static void sctp_packet_reset(struct sctp_packet *packet) { /* sctp_packet_transmit() relies on this to reset size to the * current overhead after sending packets. */ packet->size = packet->overhead; packet->has_cookie_echo = 0; packet->has_sack = 0; packet->has_data = 0; packet->has_auth = 0; packet->ipfragok = 0; packet->auth = NULL; } /* Config a packet. * This appears to be a followup set of initializations. */ void sctp_packet_config(struct sctp_packet *packet, __u32 vtag, int ecn_capable) { struct sctp_transport *tp = packet->transport; struct sctp_association *asoc = tp->asoc; struct sctp_sock *sp = NULL; struct sock *sk; pr_debug("%s: packet:%p vtag:0x%x\n", __func__, packet, vtag); packet->vtag = vtag; /* do the following jobs only once for a flush schedule */ if (!sctp_packet_empty(packet)) return; /* set packet max_size with pathmtu, then calculate overhead */ packet->max_size = tp->pathmtu; if (asoc) { sk = asoc->base.sk; sp = sctp_sk(sk); } packet->overhead = sctp_mtu_payload(sp, 0, 0); packet->size = packet->overhead; if (!asoc) return; /* update dst or transport pathmtu if in need */ if (!sctp_transport_dst_check(tp)) { sctp_transport_route(tp, NULL, sp); if (asoc->param_flags & SPP_PMTUD_ENABLE) sctp_assoc_sync_pmtu(asoc); } else if (!sctp_transport_pl_enabled(tp) && asoc->param_flags & SPP_PMTUD_ENABLE) { if (!sctp_transport_pmtu_check(tp)) sctp_assoc_sync_pmtu(asoc); } if (asoc->pmtu_pending) { if (asoc->param_flags & SPP_PMTUD_ENABLE) sctp_assoc_sync_pmtu(asoc); asoc->pmtu_pending = 0; } /* If there is a prepend chunk, stick it on the list before * any other chunks get appended.
*/ if (ecn_capable) { struct sctp_chunk *chunk = sctp_get_ecne_prepend(asoc); if (chunk) sctp_packet_append_chunk(packet, chunk); } if (!tp->dst) return; /* set packet max_size with gso_max_size if gso is enabled*/ rcu_read_lock(); if (__sk_dst_get(sk) != tp->dst) { dst_hold(tp->dst); sk_setup_caps(sk, tp->dst); } packet->max_size = sk_can_gso(sk) ? min(READ_ONCE(tp->dst->dev->gso_max_size), GSO_LEGACY_MAX_SIZE) : asoc->pathmtu; rcu_read_unlock(); } /* Initialize the packet structure. */ void sctp_packet_init(struct sctp_packet *packet, struct sctp_transport *transport, __u16 sport, __u16 dport) { pr_debug("%s: packet:%p transport:%p\n", __func__, packet, transport); packet->transport = transport; packet->source_port = sport; packet->destination_port = dport; INIT_LIST_HEAD(&packet->chunk_list); /* The overhead will be calculated by sctp_packet_config() */ packet->overhead = 0; sctp_packet_reset(packet); packet->vtag = 0; } /* Free a packet. */ void sctp_packet_free(struct sctp_packet *packet) { struct sctp_chunk *chunk, *tmp; pr_debug("%s: packet:%p\n", __func__, packet); list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) { list_del_init(&chunk->list); sctp_chunk_free(chunk); } } /* This routine tries to append the chunk to the offered packet. If adding * the chunk causes the packet to exceed the path MTU and COOKIE_ECHO chunk * is not present in the packet, it transmits the input packet. * Data can be bundled with a packet containing a COOKIE_ECHO chunk as long * as it can fit in the packet, but any more data that does not fit in this * packet can be sent only after receiving the COOKIE_ACK. */ enum sctp_xmit sctp_packet_transmit_chunk(struct sctp_packet *packet, struct sctp_chunk *chunk, int one_packet, gfp_t gfp) { enum sctp_xmit retval; pr_debug("%s: packet:%p size:%zu chunk:%p size:%d\n", __func__, packet, packet->size, chunk, chunk->skb ? chunk->skb->len : -1); switch ((retval = (sctp_packet_append_chunk(packet, chunk)))) { case SCTP_XMIT_PMTU_FULL: if (!packet->has_cookie_echo) { int error = 0; error = sctp_packet_transmit(packet, gfp); if (error < 0) chunk->skb->sk->sk_err = -error; /* If we have an empty packet, then we can NOT ever * return PMTU_FULL. */ if (!one_packet) retval = sctp_packet_append_chunk(packet, chunk); } break; case SCTP_XMIT_RWND_FULL: case SCTP_XMIT_OK: case SCTP_XMIT_DELAY: break; } return retval; } /* Try to bundle a pad chunk into a packet with a heartbeat chunk for PLPMTUTD probe */ static enum sctp_xmit sctp_packet_bundle_pad(struct sctp_packet *pkt, struct sctp_chunk *chunk) { struct sctp_transport *t = pkt->transport; struct sctp_chunk *pad; int overhead = 0; if (!chunk->pmtu_probe) return SCTP_XMIT_OK; /* calculate the Padding Data size for the pad chunk */ overhead += sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr); overhead += sizeof(struct sctp_sender_hb_info) + sizeof(struct sctp_pad_chunk); pad = sctp_make_pad(t->asoc, t->pl.probe_size - overhead); if (!pad) return SCTP_XMIT_DELAY; list_add_tail(&pad->list, &pkt->chunk_list); pkt->size += SCTP_PAD4(ntohs(pad->chunk_hdr->length)); chunk->transport = t; return SCTP_XMIT_OK; } /* Try to bundle an auth chunk into the packet. 
*/ static enum sctp_xmit sctp_packet_bundle_auth(struct sctp_packet *pkt, struct sctp_chunk *chunk) { struct sctp_association *asoc = pkt->transport->asoc; enum sctp_xmit retval = SCTP_XMIT_OK; struct sctp_chunk *auth; /* if we don't have an association, we can't do authentication */ if (!asoc) return retval; /* See if this is an auth chunk we are bundling or if * auth is already bundled. */ if (chunk->chunk_hdr->type == SCTP_CID_AUTH || pkt->has_auth) return retval; /* if the peer did not request this chunk to be authenticated, * don't do it */ if (!chunk->auth) return retval; auth = sctp_make_auth(asoc, chunk->shkey->key_id); if (!auth) return retval; auth->shkey = chunk->shkey; sctp_auth_shkey_hold(auth->shkey); retval = __sctp_packet_append_chunk(pkt, auth); if (retval != SCTP_XMIT_OK) sctp_chunk_free(auth); return retval; } /* Try to bundle a SACK with the packet. */ static enum sctp_xmit sctp_packet_bundle_sack(struct sctp_packet *pkt, struct sctp_chunk *chunk) { enum sctp_xmit retval = SCTP_XMIT_OK; /* If sending DATA and haven't aleady bundled a SACK, try to * bundle one in to the packet. */ if (sctp_chunk_is_data(chunk) && !pkt->has_sack && !pkt->has_cookie_echo) { struct sctp_association *asoc; struct timer_list *timer; asoc = pkt->transport->asoc; timer = &asoc->timers[SCTP_EVENT_TIMEOUT_SACK]; /* If the SACK timer is running, we have a pending SACK */ if (timer_pending(timer)) { struct sctp_chunk *sack; if (pkt->transport->sack_generation != pkt->transport->asoc->peer.sack_generation) return retval; asoc->a_rwnd = asoc->rwnd; sack = sctp_make_sack(asoc); if (sack) { retval = __sctp_packet_append_chunk(pkt, sack); if (retval != SCTP_XMIT_OK) { sctp_chunk_free(sack); goto out; } SCTP_INC_STATS(asoc->base.net, SCTP_MIB_OUTCTRLCHUNKS); asoc->stats.octrlchunks++; asoc->peer.sack_needed = 0; if (del_timer(timer)) sctp_association_put(asoc); } } } out: return retval; } /* Append a chunk to the offered packet reporting back any inability to do * so. */ static enum sctp_xmit __sctp_packet_append_chunk(struct sctp_packet *packet, struct sctp_chunk *chunk) { __u16 chunk_len = SCTP_PAD4(ntohs(chunk->chunk_hdr->length)); enum sctp_xmit retval = SCTP_XMIT_OK; /* Check to see if this chunk will fit into the packet */ retval = sctp_packet_will_fit(packet, chunk, chunk_len); if (retval != SCTP_XMIT_OK) goto finish; /* We believe that this chunk is OK to add to the packet */ switch (chunk->chunk_hdr->type) { case SCTP_CID_DATA: case SCTP_CID_I_DATA: /* Account for the data being in the packet */ sctp_packet_append_data(packet, chunk); /* Disallow SACK bundling after DATA. */ packet->has_sack = 1; /* Disallow AUTH bundling after DATA */ packet->has_auth = 1; /* Let it be knows that packet has DATA in it */ packet->has_data = 1; /* timestamp the chunk for rtx purposes */ chunk->sent_at = jiffies; /* Mainly used for prsctp RTX policy */ chunk->sent_count++; break; case SCTP_CID_COOKIE_ECHO: packet->has_cookie_echo = 1; break; case SCTP_CID_SACK: packet->has_sack = 1; if (chunk->asoc) chunk->asoc->stats.osacks++; break; case SCTP_CID_AUTH: packet->has_auth = 1; packet->auth = chunk; break; } /* It is OK to send this chunk. */ list_add_tail(&chunk->list, &packet->chunk_list); packet->size += chunk_len; chunk->transport = packet->transport; finish: return retval; } /* Append a chunk to the offered packet reporting back any inability to do * so. 
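 * This is the public entry point: DATA chunks are first checked against
 * rwnd, cwnd and Nagle-style delaying, then AUTH, SACK and (for PLPMTUD
 * probes) PAD chunks are bundled before the chunk itself is queued by
 * __sctp_packet_append_chunk().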
*/ enum sctp_xmit sctp_packet_append_chunk(struct sctp_packet *packet, struct sctp_chunk *chunk) { enum sctp_xmit retval = SCTP_XMIT_OK; pr_debug("%s: packet:%p chunk:%p\n", __func__, packet, chunk); /* Data chunks are special. Before seeing what else we can * bundle into this packet, check to see if we are allowed to * send this DATA. */ if (sctp_chunk_is_data(chunk)) { retval = sctp_packet_can_append_data(packet, chunk); if (retval != SCTP_XMIT_OK) goto finish; } /* Try to bundle AUTH chunk */ retval = sctp_packet_bundle_auth(packet, chunk); if (retval != SCTP_XMIT_OK) goto finish; /* Try to bundle SACK chunk */ retval = sctp_packet_bundle_sack(packet, chunk); if (retval != SCTP_XMIT_OK) goto finish; retval = __sctp_packet_append_chunk(packet, chunk); if (retval != SCTP_XMIT_OK) goto finish; retval = sctp_packet_bundle_pad(packet, chunk); finish: return retval; } static void sctp_packet_gso_append(struct sk_buff *head, struct sk_buff *skb) { if (SCTP_OUTPUT_CB(head)->last == head) skb_shinfo(head)->frag_list = skb; else SCTP_OUTPUT_CB(head)->last->next = skb; SCTP_OUTPUT_CB(head)->last = skb; head->truesize += skb->truesize; head->data_len += skb->len; head->len += skb->len; refcount_add(skb->truesize, &head->sk->sk_wmem_alloc); __skb_header_release(skb); } static int sctp_packet_pack(struct sctp_packet *packet, struct sk_buff *head, int gso, gfp_t gfp) { struct sctp_transport *tp = packet->transport; struct sctp_auth_chunk *auth = NULL; struct sctp_chunk *chunk, *tmp; int pkt_count = 0, pkt_size; struct sock *sk = head->sk; struct sk_buff *nskb; int auth_len = 0; if (gso) { skb_shinfo(head)->gso_type = sk->sk_gso_type; SCTP_OUTPUT_CB(head)->last = head; } else { nskb = head; pkt_size = packet->size; goto merge; } do { /* calculate the pkt_size and alloc nskb */ pkt_size = packet->overhead; list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) { int padded = SCTP_PAD4(chunk->skb->len); if (chunk == packet->auth) auth_len = padded; else if (auth_len + padded + packet->overhead > tp->pathmtu) return 0; else if (pkt_size + padded > tp->pathmtu) break; pkt_size += padded; } nskb = alloc_skb(pkt_size + MAX_HEADER, gfp); if (!nskb) return 0; skb_reserve(nskb, packet->overhead + MAX_HEADER); merge: /* merge chunks into nskb and append nskb into head list */ pkt_size -= packet->overhead; list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) { int padding; list_del_init(&chunk->list); if (sctp_chunk_is_data(chunk)) { if (!sctp_chunk_retransmitted(chunk) && !tp->rto_pending) { chunk->rtt_in_progress = 1; tp->rto_pending = 1; } } padding = SCTP_PAD4(chunk->skb->len) - chunk->skb->len; if (padding) skb_put_zero(chunk->skb, padding); if (chunk == packet->auth) auth = (struct sctp_auth_chunk *) skb_tail_pointer(nskb); skb_put_data(nskb, chunk->skb->data, chunk->skb->len); pr_debug("*** Chunk:%p[%s] %s 0x%x, length:%d, chunk->skb->len:%d, rtt_in_progress:%d\n", chunk, sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)), chunk->has_tsn ? "TSN" : "No TSN", chunk->has_tsn ? 
ntohl(chunk->subh.data_hdr->tsn) : 0, ntohs(chunk->chunk_hdr->length), chunk->skb->len, chunk->rtt_in_progress); pkt_size -= SCTP_PAD4(chunk->skb->len); if (!sctp_chunk_is_data(chunk) && chunk != packet->auth) sctp_chunk_free(chunk); if (!pkt_size) break; } if (auth) { sctp_auth_calculate_hmac(tp->asoc, nskb, auth, packet->auth->shkey, gfp); /* free auth if no more chunks, or add it back */ if (list_empty(&packet->chunk_list)) sctp_chunk_free(packet->auth); else list_add(&packet->auth->list, &packet->chunk_list); } if (gso) sctp_packet_gso_append(head, nskb); pkt_count++; } while (!list_empty(&packet->chunk_list)); if (gso) { memset(head->cb, 0, max(sizeof(struct inet_skb_parm), sizeof(struct inet6_skb_parm))); skb_shinfo(head)->gso_segs = pkt_count; skb_shinfo(head)->gso_size = GSO_BY_FRAGS; goto chksum; } if (sctp_checksum_disable) return 1; if (!(tp->dst->dev->features & NETIF_F_SCTP_CRC) || dst_xfrm(tp->dst) || packet->ipfragok || tp->encap_port) { struct sctphdr *sh = (struct sctphdr *)skb_transport_header(head); sh->checksum = sctp_compute_cksum(head, 0); } else { chksum: head->ip_summed = CHECKSUM_PARTIAL; head->csum_not_inet = 1; head->csum_start = skb_transport_header(head) - head->head; head->csum_offset = offsetof(struct sctphdr, checksum); } return pkt_count; } /* All packets are sent to the network through this function from * sctp_outq_tail(). * * The return value is always 0 for now. */ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) { struct sctp_transport *tp = packet->transport; struct sctp_association *asoc = tp->asoc; struct sctp_chunk *chunk, *tmp; int pkt_count, gso = 0; struct sk_buff *head; struct sctphdr *sh; struct sock *sk; pr_debug("%s: packet:%p\n", __func__, packet); if (list_empty(&packet->chunk_list)) return 0; chunk = list_entry(packet->chunk_list.next, struct sctp_chunk, list); sk = chunk->skb->sk; if (packet->size > tp->pathmtu && !packet->ipfragok && !chunk->pmtu_probe) { if (tp->pl.state == SCTP_PL_ERROR) { /* do IP fragmentation if in Error state */ packet->ipfragok = 1; } else { if (!sk_can_gso(sk)) { /* check gso */ pr_err_once("Trying to GSO but underlying device doesn't support it."); goto out; } gso = 1; } } /* alloc head skb */ head = alloc_skb((gso ? 
packet->overhead : packet->size) + MAX_HEADER, gfp); if (!head) goto out; skb_reserve(head, packet->overhead + MAX_HEADER); skb_set_owner_w(head, sk); /* set sctp header */ sh = skb_push(head, sizeof(struct sctphdr)); skb_reset_transport_header(head); sh->source = htons(packet->source_port); sh->dest = htons(packet->destination_port); sh->vtag = htonl(packet->vtag); sh->checksum = 0; /* drop packet if no dst */ if (!tp->dst) { IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); kfree_skb(head); goto out; } /* pack up chunks */ pkt_count = sctp_packet_pack(packet, head, gso, gfp); if (!pkt_count) { kfree_skb(head); goto out; } pr_debug("***sctp_transmit_packet*** skb->len:%d\n", head->len); /* start autoclose timer */ if (packet->has_data && sctp_state(asoc, ESTABLISHED) && asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE]) { struct timer_list *timer = &asoc->timers[SCTP_EVENT_TIMEOUT_AUTOCLOSE]; unsigned long timeout = asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE]; if (!mod_timer(timer, jiffies + timeout)) sctp_association_hold(asoc); } /* sctp xmit */ tp->af_specific->ecn_capable(sk); if (asoc) { asoc->stats.opackets += pkt_count; if (asoc->peer.last_sent_to != tp) asoc->peer.last_sent_to = tp; } head->ignore_df = packet->ipfragok; if (tp->dst_pending_confirm) skb_set_dst_pending_confirm(head, 1); /* neighbour should be confirmed on successful transmission or * positive error */ if (tp->af_specific->sctp_xmit(head, tp) >= 0 && tp->dst_pending_confirm) tp->dst_pending_confirm = 0; out: list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) { list_del_init(&chunk->list); if (!sctp_chunk_is_data(chunk)) sctp_chunk_free(chunk); } sctp_packet_reset(packet); return 0; } /******************************************************************** * 2nd Level Abstractions ********************************************************************/ /* This private function check to see if a chunk can be added */ static enum sctp_xmit sctp_packet_can_append_data(struct sctp_packet *packet, struct sctp_chunk *chunk) { size_t datasize, rwnd, inflight, flight_size; struct sctp_transport *transport = packet->transport; struct sctp_association *asoc = transport->asoc; struct sctp_outq *q = &asoc->outqueue; /* RFC 2960 6.1 Transmission of DATA Chunks * * A) At any given time, the data sender MUST NOT transmit new data to * any destination transport address if its peer's rwnd indicates * that the peer has no buffer space (i.e. rwnd is 0, see Section * 6.2.1). However, regardless of the value of rwnd (including if it * is 0), the data sender can always have one DATA chunk in flight to * the receiver if allowed by cwnd (see rule B below). This rule * allows the sender to probe for a change in rwnd that the sender * missed due to the SACK having been lost in transit from the data * receiver to the data sender. */ rwnd = asoc->peer.rwnd; inflight = q->outstanding_bytes; flight_size = transport->flight_size; datasize = sctp_data_size(chunk); if (datasize > rwnd && inflight > 0) /* We have (at least) one data chunk in flight, * so we can't fall back to rule 6.1 B). */ return SCTP_XMIT_RWND_FULL; /* RFC 2960 6.1 Transmission of DATA Chunks * * B) At any given time, the sender MUST NOT transmit new data * to a given transport address if it has cwnd or more bytes * of data outstanding to that transport address. */ /* RFC 7.2.4 & the Implementers Guide 2.8. * * 3) ... * When a Fast Retransmit is being performed the sender SHOULD * ignore the value of cwnd and SHOULD NOT delay retransmission. 
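 * For example, with cwnd at 4380 bytes and flight_size already at 4380,
 * a chunk not marked SCTP_NEED_FRTX is held back (the check below
 * returns SCTP_XMIT_RWND_FULL) until SACKs shrink flight_size, while a
 * fast retransmission skips the cwnd check entirely.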
*/ if (chunk->fast_retransmit != SCTP_NEED_FRTX && flight_size >= transport->cwnd) return SCTP_XMIT_RWND_FULL; /* Nagle's algorithm to solve small-packet problem: * Inhibit the sending of new chunks when new outgoing data arrives * if any previously transmitted data on the connection remains * unacknowledged. */ if ((sctp_sk(asoc->base.sk)->nodelay || inflight == 0) && !asoc->force_delay) /* Nothing unacked */ return SCTP_XMIT_OK; if (!sctp_packet_empty(packet)) /* Append to packet */ return SCTP_XMIT_OK; if (!sctp_state(asoc, ESTABLISHED)) return SCTP_XMIT_OK; /* Check whether this chunk and all the rest of pending data will fit * or delay in hopes of bundling a full sized packet. */ if (chunk->skb->len + q->out_qlen > transport->pathmtu - packet->overhead - sctp_datachk_len(&chunk->asoc->stream) - 4) /* Enough data queued to fill a packet */ return SCTP_XMIT_OK; /* Don't delay large message writes that may have been fragmented */ if (!chunk->msg->can_delay) return SCTP_XMIT_OK; /* Defer until all data acked or packet full */ return SCTP_XMIT_DELAY; } /* This private function does management things when adding DATA chunk */ static void sctp_packet_append_data(struct sctp_packet *packet, struct sctp_chunk *chunk) { struct sctp_transport *transport = packet->transport; size_t datasize = sctp_data_size(chunk); struct sctp_association *asoc = transport->asoc; u32 rwnd = asoc->peer.rwnd; /* Keep track of how many bytes are in flight over this transport. */ transport->flight_size += datasize; /* Keep track of how many bytes are in flight to the receiver. */ asoc->outqueue.outstanding_bytes += datasize; /* Update our view of the receiver's rwnd. */ if (datasize < rwnd) rwnd -= datasize; else rwnd = 0; asoc->peer.rwnd = rwnd; sctp_chunk_assign_tsn(chunk); asoc->stream.si->assign_number(chunk); } static enum sctp_xmit sctp_packet_will_fit(struct sctp_packet *packet, struct sctp_chunk *chunk, u16 chunk_len) { enum sctp_xmit retval = SCTP_XMIT_OK; size_t psize, pmtu, maxsize; /* Don't bundle in this packet if this chunk's auth key doesn't * match other chunks already enqueued on this packet. Also, * don't bundle the chunk with auth key if other chunks in this * packet don't have auth key. */ if ((packet->auth && chunk->shkey != packet->auth->shkey) || (!packet->auth && chunk->shkey && chunk->chunk_hdr->type != SCTP_CID_AUTH)) return SCTP_XMIT_PMTU_FULL; psize = packet->size; if (packet->transport->asoc) pmtu = packet->transport->asoc->pathmtu; else pmtu = packet->transport->pathmtu; /* Decide if we need to fragment or resubmit later. */ if (psize + chunk_len > pmtu) { /* It's OK to fragment at IP level if any one of the following * is true: * 1. The packet is empty (meaning this chunk is greater * the MTU) * 2. The packet doesn't have any data in it yet and data * requires authentication. */ if (sctp_packet_empty(packet) || (!packet->has_data && chunk->auth)) { /* We no longer do re-fragmentation. * Just fragment at the IP layer, if we * actually hit this condition */ packet->ipfragok = 1; goto out; } /* Similarly, if this chunk was built before a PMTU * reduction, we have to fragment it at IP level now. So * if the packet already contains something, we need to * flush. 
*/ maxsize = pmtu - packet->overhead; if (packet->auth) maxsize -= SCTP_PAD4(packet->auth->skb->len); if (chunk_len > maxsize) retval = SCTP_XMIT_PMTU_FULL; /* It is also okay to fragment if the chunk we are * adding is a control chunk, but only if current packet * is not a GSO one otherwise it causes fragmentation of * a large frame. So in this case we allow the * fragmentation by forcing it to be in a new packet. */ if (!sctp_chunk_is_data(chunk) && packet->has_data) retval = SCTP_XMIT_PMTU_FULL; if (psize + chunk_len > packet->max_size) /* Hit GSO/PMTU limit, gotta flush */ retval = SCTP_XMIT_PMTU_FULL; if (!packet->transport->burst_limited && psize + chunk_len > (packet->transport->cwnd >> 1)) /* Do not allow a single GSO packet to use more * than half of cwnd. */ retval = SCTP_XMIT_PMTU_FULL; if (packet->transport->burst_limited && psize + chunk_len > (packet->transport->burst_limited >> 1)) /* Do not allow a single GSO packet to use more * than half of original cwnd. */ retval = SCTP_XMIT_PMTU_FULL; /* Otherwise it will fit in the GSO packet */ } out: return retval; }
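The bundling decisions above ultimately reduce to a little arithmetic: each chunk length is rounded up to a 4-byte boundary (SCTP_PAD4) and the running packet size plus that padded length is compared against the budget (path MTU, or packet->max_size for GSO). The stand-alone sketch below mirrors only that check so the math is easy to follow; pad4() and chunk_fits() are illustrative helpers written for this example, not kernel functions, and the 32-byte overhead is simply a minimal IPv4 header (20 bytes) plus the SCTP common header (12 bytes).

#include <stdbool.h>
#include <stdio.h>

/* Illustrative stand-in for the kernel's SCTP_PAD4(): round a chunk
 * length up to the next 4-byte boundary. */
static unsigned int pad4(unsigned int len)
{
        return (len + 3u) & ~3u;
}

/* Simplified form of the core test in sctp_packet_will_fit(): a chunk
 * may be bundled if the current packet size plus its padded length
 * stays within the budget. */
static bool chunk_fits(unsigned int pkt_size, unsigned int chunk_len,
                       unsigned int budget)
{
        return pkt_size + pad4(chunk_len) <= budget;
}

int main(void)
{
        unsigned int overhead = 32;     /* IPv4 header + SCTP common header */
        unsigned int pkt_size = overhead;
        unsigned int pmtu = 1500;
        unsigned int chunks[] = { 1452, 100, 13 };
        unsigned int i;

        for (i = 0; i < sizeof(chunks) / sizeof(chunks[0]); i++) {
                bool ok = chunk_fits(pkt_size, chunks[i], pmtu);

                printf("chunk %u (padded to %u): %s\n", chunks[i],
                       pad4(chunks[i]), ok ? "bundle" : "flush first");
                if (ok)
                        pkt_size += pad4(chunks[i]);
        }
        return 0;
}

With these numbers the 1452-byte chunk bundles (1488 <= 1500), while the next two would push the packet past the PMTU and so would force a flush in the real code.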
12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM smc #if !defined(_TRACE_SMC_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_SMC_H #include <linux/ipv6.h> #include <linux/tcp.h> #include <linux/tracepoint.h> #include <net/ipv6.h> #include "smc.h" #include "smc_core.h" TRACE_EVENT(smc_switch_to_fallback, TP_PROTO(const struct smc_sock *smc, int fallback_rsn), TP_ARGS(smc, fallback_rsn), TP_STRUCT__entry( __field(const void *, sk) __field(const void *, clcsk) __field(u64, net_cookie) __field(int, fallback_rsn) ), TP_fast_assign( const struct sock *sk = &smc->sk; const struct sock *clcsk = smc->clcsock->sk; __entry->sk = sk; __entry->clcsk = clcsk; __entry->net_cookie = sock_net(sk)->net_cookie; __entry->fallback_rsn = fallback_rsn; ), TP_printk("sk=%p clcsk=%p net=%llu fallback_rsn=%d", __entry->sk, __entry->clcsk, __entry->net_cookie, __entry->fallback_rsn) ); DECLARE_EVENT_CLASS(smc_msg_event, TP_PROTO(const struct smc_sock *smc, size_t len), TP_ARGS(smc, len), TP_STRUCT__entry( __field(const void *, smc) __field(u64, net_cookie) __field(size_t, len) __string(name, smc->conn.lnk->ibname) ), TP_fast_assign( const struct sock *sk = &smc->sk; __entry->smc = smc; __entry->net_cookie = sock_net(sk)->net_cookie; __entry->len = len; __assign_str(name); ), TP_printk("smc=%p net=%llu len=%zu dev=%s", __entry->smc, __entry->net_cookie, __entry->len, __get_str(name)) ); DEFINE_EVENT(smc_msg_event, smc_tx_sendmsg, TP_PROTO(const struct smc_sock *smc, size_t len), TP_ARGS(smc, len) ); DEFINE_EVENT(smc_msg_event, smc_rx_recvmsg, TP_PROTO(const struct smc_sock *smc, size_t len), TP_ARGS(smc, len) ); TRACE_EVENT(smcr_link_down, TP_PROTO(const struct smc_link *lnk, void *location), TP_ARGS(lnk, location), TP_STRUCT__entry( __field(const void *, lnk) __field(const void *, lgr) __field(u64, net_cookie) __field(int, state) __string(name, lnk->ibname) __field(void *, location) ), TP_fast_assign( const struct smc_link_group *lgr = lnk->lgr; __entry->lnk = lnk; __entry->lgr = lgr; __entry->net_cookie = lgr->net->net_cookie; __entry->state = lnk->state; __assign_str(name); __entry->location = location; ), TP_printk("lnk=%p lgr=%p net=%llu state=%d dev=%s location=%pS", __entry->lnk, __entry->lgr, __entry->net_cookie, __entry->state, __get_str(name), __entry->location) ); #endif /* _TRACE_SMC_H */ #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH . #undef TRACE_INCLUDE_FILE #define TRACE_INCLUDE_FILE smc_tracepoint #include <trace/define_trace.h>
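For context on how a header like this is consumed: each TRACE_EVENT()/DEFINE_EVENT() above generates a trace_<name>() inline that call sites elsewhere in net/smc invoke, and exactly one compilation unit defines CREATE_TRACE_POINTS before including the header so the tracepoints are instantiated. The fragment below is only a hypothetical call site sketched to show the shape of such a call; smc_example_sendmsg() is made up for illustration and the trace_..._enabled() guard is optional.

#define CREATE_TRACE_POINTS
#include "smc_tracepoint.h"

/* Hypothetical caller, not part of this header. */
static void smc_example_sendmsg(const struct smc_sock *smc, size_t len)
{
        /* trace_..._enabled() lets callers skip argument set-up when the
         * event is off; trace_smc_tx_sendmsg() matches TP_PROTO() above. */
        if (trace_smc_tx_sendmsg_enabled())
                trace_smc_tx_sendmsg(smc, len);
}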
1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 // SPDX-License-Identifier: GPL-2.0-or-later /* * vrf.c: device driver to encapsulate a VRF space * * Copyright (c) 2015 Cumulus Networks. All rights reserved. * Copyright (c) 2015 Shrijeet Mukherjee <shm@cumulusnetworks.com> * Copyright (c) 2015 David Ahern <dsa@cumulusnetworks.com> * * Based on dummy, team and ipvlan drivers */ #include <linux/ethtool.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/ip.h> #include <linux/init.h> #include <linux/moduleparam.h> #include <linux/netfilter.h> #include <linux/rtnetlink.h> #include <net/rtnetlink.h> #include <linux/u64_stats_sync.h> #include <linux/hashtable.h> #include <linux/spinlock_types.h> #include <linux/inetdevice.h> #include <net/arp.h> #include <net/ip.h> #include <net/ip_fib.h> #include <net/ip6_fib.h> #include <net/ip6_route.h> #include <net/route.h> #include <net/addrconf.h> #include <net/l3mdev.h> #include <net/fib_rules.h> #include <net/sch_generic.h> #include <net/netns/generic.h> #include <net/netfilter/nf_conntrack.h> #include <net/inet_dscp.h> #define DRV_NAME "vrf" #define DRV_VERSION "1.1" #define FIB_RULE_PREF 1000 /* default preference for FIB rules */ #define HT_MAP_BITS 4 #define HASH_INITVAL ((u32)0xcafef00d) struct vrf_map { DECLARE_HASHTABLE(ht, HT_MAP_BITS); spinlock_t vmap_lock; /* shared_tables: * count how many distinct tables do not comply with the strict mode * requirement. * shared_tables value must be 0 in order to enable the strict mode. 
* * example of the evolution of shared_tables: * | time * add vrf0 --> table 100 shared_tables = 0 | t0 * add vrf1 --> table 101 shared_tables = 0 | t1 * add vrf2 --> table 100 shared_tables = 1 | t2 * add vrf3 --> table 100 shared_tables = 1 | t3 * add vrf4 --> table 101 shared_tables = 2 v t4 * * shared_tables is a "step function" (or "staircase function") * and it is increased by one when the second vrf is associated to a * table. * * at t2, vrf0 and vrf2 are bound to table 100: shared_tables = 1. * * at t3, another dev (vrf3) is bound to the same table 100 but the * value of shared_tables is still 1. * This means that no matter how many new vrfs will register on the * table 100, the shared_tables will not increase (considering only * table 100). * * at t4, vrf4 is bound to table 101, and shared_tables = 2. * * Looking at the value of shared_tables we can immediately know if * the strict_mode can or cannot be enforced. Indeed, strict_mode * can be enforced iff shared_tables = 0. * * Conversely, shared_tables is decreased when a vrf is de-associated * from a table with exactly two associated vrfs. */ u32 shared_tables; bool strict_mode; }; struct vrf_map_elem { struct hlist_node hnode; struct list_head vrf_list; /* VRFs registered to this table */ u32 table_id; int users; int ifindex; }; static unsigned int vrf_net_id; /* per netns vrf data */ struct netns_vrf { /* protected by rtnl lock */ bool add_fib_rules; struct vrf_map vmap; struct ctl_table_header *ctl_hdr; }; struct net_vrf { struct rtable __rcu *rth; struct rt6_info __rcu *rt6; #if IS_ENABLED(CONFIG_IPV6) struct fib6_table *fib6_table; #endif u32 tb_id; struct list_head me_list; /* entry in vrf_map_elem */ int ifindex; }; static void vrf_rx_stats(struct net_device *dev, int len) { struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats); u64_stats_update_begin(&dstats->syncp); u64_stats_inc(&dstats->rx_packets); u64_stats_add(&dstats->rx_bytes, len); u64_stats_update_end(&dstats->syncp); } static void vrf_tx_error(struct net_device *vrf_dev, struct sk_buff *skb) { vrf_dev->stats.tx_errors++; kfree_skb(skb); } static struct vrf_map *netns_vrf_map(struct net *net) { struct netns_vrf *nn_vrf = net_generic(net, vrf_net_id); return &nn_vrf->vmap; } static struct vrf_map *netns_vrf_map_by_dev(struct net_device *dev) { return netns_vrf_map(dev_net(dev)); } static int vrf_map_elem_get_vrf_ifindex(struct vrf_map_elem *me) { struct list_head *me_head = &me->vrf_list; struct net_vrf *vrf; if (list_empty(me_head)) return -ENODEV; vrf = list_first_entry(me_head, struct net_vrf, me_list); return vrf->ifindex; } static struct vrf_map_elem *vrf_map_elem_alloc(gfp_t flags) { struct vrf_map_elem *me; me = kmalloc(sizeof(*me), flags); if (!me) return NULL; return me; } static void vrf_map_elem_free(struct vrf_map_elem *me) { kfree(me); } static void vrf_map_elem_init(struct vrf_map_elem *me, int table_id, int ifindex, int users) { me->table_id = table_id; me->ifindex = ifindex; me->users = users; INIT_LIST_HEAD(&me->vrf_list); } static struct vrf_map_elem *vrf_map_lookup_elem(struct vrf_map *vmap, u32 table_id) { struct vrf_map_elem *me; u32 key; key = jhash_1word(table_id, HASH_INITVAL); hash_for_each_possible(vmap->ht, me, hnode, key) { if (me->table_id == table_id) return me; } return NULL; } static void vrf_map_add_elem(struct vrf_map *vmap, struct vrf_map_elem *me) { u32 table_id = me->table_id; u32 key; key = jhash_1word(table_id, HASH_INITVAL); hash_add(vmap->ht, &me->hnode, key); } static void vrf_map_del_elem(struct vrf_map_elem *me) { 
hash_del(&me->hnode); } static void vrf_map_lock(struct vrf_map *vmap) __acquires(&vmap->vmap_lock) { spin_lock(&vmap->vmap_lock); } static void vrf_map_unlock(struct vrf_map *vmap) __releases(&vmap->vmap_lock) { spin_unlock(&vmap->vmap_lock); } /* called with rtnl lock held */ static int vrf_map_register_dev(struct net_device *dev, struct netlink_ext_ack *extack) { struct vrf_map *vmap = netns_vrf_map_by_dev(dev); struct net_vrf *vrf = netdev_priv(dev); struct vrf_map_elem *new_me, *me; u32 table_id = vrf->tb_id; bool free_new_me = false; int users; int res; /* we pre-allocate elements used in the spin-locked section (so that we * keep the spinlock as short as possible). */ new_me = vrf_map_elem_alloc(GFP_KERNEL); if (!new_me) return -ENOMEM; vrf_map_elem_init(new_me, table_id, dev->ifindex, 0); vrf_map_lock(vmap); me = vrf_map_lookup_elem(vmap, table_id); if (!me) { me = new_me; vrf_map_add_elem(vmap, me); goto link_vrf; } /* we already have an entry in the vrf_map, so it means there is (at * least) a vrf registered on the specific table. */ free_new_me = true; if (vmap->strict_mode) { /* vrfs cannot share the same table */ NL_SET_ERR_MSG(extack, "Table is used by another VRF"); res = -EBUSY; goto unlock; } link_vrf: users = ++me->users; if (users == 2) ++vmap->shared_tables; list_add(&vrf->me_list, &me->vrf_list); res = 0; unlock: vrf_map_unlock(vmap); /* clean-up, if needed */ if (free_new_me) vrf_map_elem_free(new_me); return res; } /* called with rtnl lock held */ static void vrf_map_unregister_dev(struct net_device *dev) { struct vrf_map *vmap = netns_vrf_map_by_dev(dev); struct net_vrf *vrf = netdev_priv(dev); u32 table_id = vrf->tb_id; struct vrf_map_elem *me; int users; vrf_map_lock(vmap); me = vrf_map_lookup_elem(vmap, table_id); if (!me) goto unlock; list_del(&vrf->me_list); users = --me->users; if (users == 1) { --vmap->shared_tables; } else if (users == 0) { vrf_map_del_elem(me); /* no one will refer to this element anymore */ vrf_map_elem_free(me); } unlock: vrf_map_unlock(vmap); } /* return the vrf device index associated with the table_id */ static int vrf_ifindex_lookup_by_table_id(struct net *net, u32 table_id) { struct vrf_map *vmap = netns_vrf_map(net); struct vrf_map_elem *me; int ifindex; vrf_map_lock(vmap); if (!vmap->strict_mode) { ifindex = -EPERM; goto unlock; } me = vrf_map_lookup_elem(vmap, table_id); if (!me) { ifindex = -ENODEV; goto unlock; } ifindex = vrf_map_elem_get_vrf_ifindex(me); unlock: vrf_map_unlock(vmap); return ifindex; } /* by default VRF devices do not have a qdisc and are expected * to be created with only a single queue. */ static bool qdisc_tx_is_default(const struct net_device *dev) { struct netdev_queue *txq; struct Qdisc *qdisc; if (dev->num_tx_queues > 1) return false; txq = netdev_get_tx_queue(dev, 0); qdisc = rcu_access_pointer(txq->qdisc); return !qdisc->enqueue; } /* Local traffic destined to local address. Reinsert the packet to rx * path, similar to loopback handling. 
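 * vrf_local_xmit() orphans the skb, installs the VRF dst, and marks it
 * PACKET_LOOPBACK before feeding it back through __netif_rx(), so it is
 * counted in the VRF rx stats without hitting the packet taps twice.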
*/ static int vrf_local_xmit(struct sk_buff *skb, struct net_device *dev, struct dst_entry *dst) { int len = skb->len; skb_orphan(skb); skb_dst_set(skb, dst); /* set pkt_type to avoid skb hitting packet taps twice - * once on Tx and again in Rx processing */ skb->pkt_type = PACKET_LOOPBACK; skb->protocol = eth_type_trans(skb, dev); if (likely(__netif_rx(skb) == NET_RX_SUCCESS)) { vrf_rx_stats(dev, len); } else { struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats); u64_stats_update_begin(&dstats->syncp); u64_stats_inc(&dstats->rx_drops); u64_stats_update_end(&dstats->syncp); } return NETDEV_TX_OK; } static void vrf_nf_set_untracked(struct sk_buff *skb) { if (skb_get_nfct(skb) == 0) nf_ct_set(skb, NULL, IP_CT_UNTRACKED); } static void vrf_nf_reset_ct(struct sk_buff *skb) { if (skb_get_nfct(skb) == IP_CT_UNTRACKED) nf_reset_ct(skb); } #if IS_ENABLED(CONFIG_IPV6) static int vrf_ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) { int err; vrf_nf_reset_ct(skb); err = nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb, NULL, skb_dst(skb)->dev, dst_output); if (likely(err == 1)) err = dst_output(net, sk, skb); return err; } static netdev_tx_t vrf_process_v6_outbound(struct sk_buff *skb, struct net_device *dev) { const struct ipv6hdr *iph; struct net *net = dev_net(skb->dev); struct flowi6 fl6; int ret = NET_XMIT_DROP; struct dst_entry *dst; struct dst_entry *dst_null = &net->ipv6.ip6_null_entry->dst; if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct ipv6hdr))) goto err; iph = ipv6_hdr(skb); memset(&fl6, 0, sizeof(fl6)); /* needed to match OIF rule */ fl6.flowi6_l3mdev = dev->ifindex; fl6.flowi6_iif = LOOPBACK_IFINDEX; fl6.daddr = iph->daddr; fl6.saddr = iph->saddr; fl6.flowlabel = ip6_flowinfo(iph); fl6.flowi6_mark = skb->mark; fl6.flowi6_proto = iph->nexthdr; dst = ip6_dst_lookup_flow(net, NULL, &fl6, NULL); if (IS_ERR(dst) || dst == dst_null) goto err; skb_dst_drop(skb); /* if dst.dev is the VRF device again this is locally originated traffic * destined to a local address. Short circuit to Rx path. 
*/ if (dst->dev == dev) return vrf_local_xmit(skb, dev, dst); skb_dst_set(skb, dst); /* strip the ethernet header added for pass through VRF device */ __skb_pull(skb, skb_network_offset(skb)); memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); ret = vrf_ip6_local_out(net, skb->sk, skb); if (unlikely(net_xmit_eval(ret))) dev->stats.tx_errors++; else ret = NET_XMIT_SUCCESS; return ret; err: vrf_tx_error(dev, skb); return NET_XMIT_DROP; } #else static netdev_tx_t vrf_process_v6_outbound(struct sk_buff *skb, struct net_device *dev) { vrf_tx_error(dev, skb); return NET_XMIT_DROP; } #endif /* based on ip_local_out; can't use it b/c the dst is switched pointing to us */ static int vrf_ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) { int err; vrf_nf_reset_ct(skb); err = nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, net, sk, skb, NULL, skb_dst(skb)->dev, dst_output); if (likely(err == 1)) err = dst_output(net, sk, skb); return err; } static netdev_tx_t vrf_process_v4_outbound(struct sk_buff *skb, struct net_device *vrf_dev) { struct iphdr *ip4h; int ret = NET_XMIT_DROP; struct flowi4 fl4; struct net *net = dev_net(vrf_dev); struct rtable *rt; if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct iphdr))) goto err; ip4h = ip_hdr(skb); memset(&fl4, 0, sizeof(fl4)); /* needed to match OIF rule */ fl4.flowi4_l3mdev = vrf_dev->ifindex; fl4.flowi4_iif = LOOPBACK_IFINDEX; fl4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(ip4h)); fl4.flowi4_flags = FLOWI_FLAG_ANYSRC; fl4.flowi4_proto = ip4h->protocol; fl4.daddr = ip4h->daddr; fl4.saddr = ip4h->saddr; rt = ip_route_output_flow(net, &fl4, NULL); if (IS_ERR(rt)) goto err; skb_dst_drop(skb); /* if dst.dev is the VRF device again this is locally originated traffic * destined to a local address. Short circuit to Rx path. 
*/ if (rt->dst.dev == vrf_dev) return vrf_local_xmit(skb, vrf_dev, &rt->dst); skb_dst_set(skb, &rt->dst); /* strip the ethernet header added for pass through VRF device */ __skb_pull(skb, skb_network_offset(skb)); if (!ip4h->saddr) { ip4h->saddr = inet_select_addr(skb_dst(skb)->dev, 0, RT_SCOPE_LINK); } memset(IPCB(skb), 0, sizeof(*IPCB(skb))); ret = vrf_ip_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb); if (unlikely(net_xmit_eval(ret))) vrf_dev->stats.tx_errors++; else ret = NET_XMIT_SUCCESS; out: return ret; err: vrf_tx_error(vrf_dev, skb); goto out; } static netdev_tx_t is_ip_tx_frame(struct sk_buff *skb, struct net_device *dev) { switch (skb->protocol) { case htons(ETH_P_IP): return vrf_process_v4_outbound(skb, dev); case htons(ETH_P_IPV6): return vrf_process_v6_outbound(skb, dev); default: vrf_tx_error(dev, skb); return NET_XMIT_DROP; } } static netdev_tx_t vrf_xmit(struct sk_buff *skb, struct net_device *dev) { struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats); int len = skb->len; netdev_tx_t ret = is_ip_tx_frame(skb, dev); u64_stats_update_begin(&dstats->syncp); if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) { u64_stats_inc(&dstats->tx_packets); u64_stats_add(&dstats->tx_bytes, len); } else { u64_stats_inc(&dstats->tx_drops); } u64_stats_update_end(&dstats->syncp); return ret; } static void vrf_finish_direct(struct sk_buff *skb) { struct net_device *vrf_dev = skb->dev; if (!list_empty(&vrf_dev->ptype_all) && likely(skb_headroom(skb) >= ETH_HLEN)) { struct ethhdr *eth = skb_push(skb, ETH_HLEN); ether_addr_copy(eth->h_source, vrf_dev->dev_addr); eth_zero_addr(eth->h_dest); eth->h_proto = skb->protocol; rcu_read_lock_bh(); dev_queue_xmit_nit(skb, vrf_dev); rcu_read_unlock_bh(); skb_pull(skb, ETH_HLEN); } vrf_nf_reset_ct(skb); } #if IS_ENABLED(CONFIG_IPV6) /* modelled after ip6_finish_output2 */ static int vrf_finish_output6(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct net_device *dev = dst->dev; const struct in6_addr *nexthop; struct neighbour *neigh; int ret; vrf_nf_reset_ct(skb); skb->protocol = htons(ETH_P_IPV6); skb->dev = dev; rcu_read_lock(); nexthop = rt6_nexthop(dst_rt6_info(dst), &ipv6_hdr(skb)->daddr); neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop); if (unlikely(!neigh)) neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false); if (!IS_ERR(neigh)) { sock_confirm_neigh(skb, neigh); ret = neigh_output(neigh, skb, false); rcu_read_unlock(); return ret; } rcu_read_unlock(); IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); kfree_skb(skb); return -EINVAL; } /* modelled after ip6_output */ static int vrf_output6(struct net *net, struct sock *sk, struct sk_buff *skb) { return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, net, sk, skb, NULL, skb_dst(skb)->dev, vrf_finish_output6, !(IP6CB(skb)->flags & IP6SKB_REROUTED)); } /* set dst on skb to send packet to us via dev_xmit path. Allows * packet to go through device based features such as qdisc, netfilter * hooks and packet sockets with skb->dev set to vrf device. 
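 * Concretely, vrf_ip6_out_redirect() below swaps the skb's dst for the
 * VRF's cached rt6 dst (created in vrf_rt6_create() with its output hook
 * set to vrf_output6()), which is what steers the packet through the
 * VRF device.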
*/ static struct sk_buff *vrf_ip6_out_redirect(struct net_device *vrf_dev, struct sk_buff *skb) { struct net_vrf *vrf = netdev_priv(vrf_dev); struct dst_entry *dst = NULL; struct rt6_info *rt6; rcu_read_lock(); rt6 = rcu_dereference(vrf->rt6); if (likely(rt6)) { dst = &rt6->dst; dst_hold(dst); } rcu_read_unlock(); if (unlikely(!dst)) { vrf_tx_error(vrf_dev, skb); return NULL; } skb_dst_drop(skb); skb_dst_set(skb, dst); return skb; } static int vrf_output6_direct_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { vrf_finish_direct(skb); return vrf_ip6_local_out(net, sk, skb); } static int vrf_output6_direct(struct net *net, struct sock *sk, struct sk_buff *skb) { int err = 1; skb->protocol = htons(ETH_P_IPV6); if (!(IPCB(skb)->flags & IPSKB_REROUTED)) err = nf_hook(NFPROTO_IPV6, NF_INET_POST_ROUTING, net, sk, skb, NULL, skb->dev, vrf_output6_direct_finish); if (likely(err == 1)) vrf_finish_direct(skb); return err; } static int vrf_ip6_out_direct_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { int err; err = vrf_output6_direct(net, sk, skb); if (likely(err == 1)) err = vrf_ip6_local_out(net, sk, skb); return err; } static struct sk_buff *vrf_ip6_out_direct(struct net_device *vrf_dev, struct sock *sk, struct sk_buff *skb) { struct net *net = dev_net(vrf_dev); int err; skb->dev = vrf_dev; err = nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb, NULL, vrf_dev, vrf_ip6_out_direct_finish); if (likely(err == 1)) err = vrf_output6_direct(net, sk, skb); if (likely(err == 1)) return skb; return NULL; } static struct sk_buff *vrf_ip6_out(struct net_device *vrf_dev, struct sock *sk, struct sk_buff *skb) { /* don't divert link scope packets */ if (rt6_need_strict(&ipv6_hdr(skb)->daddr)) return skb; vrf_nf_set_untracked(skb); if (qdisc_tx_is_default(vrf_dev) || IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) return vrf_ip6_out_direct(vrf_dev, sk, skb); return vrf_ip6_out_redirect(vrf_dev, skb); } /* holding rtnl */ static void vrf_rt6_release(struct net_device *dev, struct net_vrf *vrf) { struct rt6_info *rt6 = rtnl_dereference(vrf->rt6); struct net *net = dev_net(dev); struct dst_entry *dst; RCU_INIT_POINTER(vrf->rt6, NULL); synchronize_rcu(); /* move dev in dst's to loopback so this VRF device can be deleted * - based on dst_ifdown */ if (rt6) { dst = &rt6->dst; netdev_ref_replace(dst->dev, net->loopback_dev, &dst->dev_tracker, GFP_KERNEL); dst->dev = net->loopback_dev; dst_release(dst); } } static int vrf_rt6_create(struct net_device *dev) { int flags = DST_NOPOLICY | DST_NOXFRM; struct net_vrf *vrf = netdev_priv(dev); struct net *net = dev_net(dev); struct rt6_info *rt6; int rc = -ENOMEM; /* IPv6 can be CONFIG enabled and then disabled runtime */ if (!ipv6_mod_enabled()) return 0; vrf->fib6_table = fib6_new_table(net, vrf->tb_id); if (!vrf->fib6_table) goto out; /* create a dst for routing packets out a VRF device */ rt6 = ip6_dst_alloc(net, dev, flags); if (!rt6) goto out; rt6->dst.output = vrf_output6; rcu_assign_pointer(vrf->rt6, rt6); rc = 0; out: return rc; } #else static struct sk_buff *vrf_ip6_out(struct net_device *vrf_dev, struct sock *sk, struct sk_buff *skb) { return skb; } static void vrf_rt6_release(struct net_device *dev, struct net_vrf *vrf) { } static int vrf_rt6_create(struct net_device *dev) { return 0; } #endif /* modelled after ip_finish_output2 */ static int vrf_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct rtable *rt = dst_rtable(dst); struct net_device *dev = dst->dev; unsigned 
int hh_len = LL_RESERVED_SPACE(dev); struct neighbour *neigh; bool is_v6gw = false; vrf_nf_reset_ct(skb); /* Be paranoid, rather than too clever. */ if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { skb = skb_expand_head(skb, hh_len); if (!skb) { dev->stats.tx_errors++; return -ENOMEM; } } rcu_read_lock(); neigh = ip_neigh_for_gw(rt, skb, &is_v6gw); if (!IS_ERR(neigh)) { int ret; sock_confirm_neigh(skb, neigh); /* if crossing protocols, can not use the cached header */ ret = neigh_output(neigh, skb, is_v6gw); rcu_read_unlock(); return ret; } rcu_read_unlock(); vrf_tx_error(skb->dev, skb); return -EINVAL; } static int vrf_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct net_device *dev = skb_dst(skb)->dev; IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len); skb->dev = dev; skb->protocol = htons(ETH_P_IP); return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, net, sk, skb, NULL, dev, vrf_finish_output, !(IPCB(skb)->flags & IPSKB_REROUTED)); } /* set dst on skb to send packet to us via dev_xmit path. Allows * packet to go through device based features such as qdisc, netfilter * hooks and packet sockets with skb->dev set to vrf device. */ static struct sk_buff *vrf_ip_out_redirect(struct net_device *vrf_dev, struct sk_buff *skb) { struct net_vrf *vrf = netdev_priv(vrf_dev); struct dst_entry *dst = NULL; struct rtable *rth; rcu_read_lock(); rth = rcu_dereference(vrf->rth); if (likely(rth)) { dst = &rth->dst; dst_hold(dst); } rcu_read_unlock(); if (unlikely(!dst)) { vrf_tx_error(vrf_dev, skb); return NULL; } skb_dst_drop(skb); skb_dst_set(skb, dst); return skb; } static int vrf_output_direct_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { vrf_finish_direct(skb); return vrf_ip_local_out(net, sk, skb); } static int vrf_output_direct(struct net *net, struct sock *sk, struct sk_buff *skb) { int err = 1; skb->protocol = htons(ETH_P_IP); if (!(IPCB(skb)->flags & IPSKB_REROUTED)) err = nf_hook(NFPROTO_IPV4, NF_INET_POST_ROUTING, net, sk, skb, NULL, skb->dev, vrf_output_direct_finish); if (likely(err == 1)) vrf_finish_direct(skb); return err; } static int vrf_ip_out_direct_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { int err; err = vrf_output_direct(net, sk, skb); if (likely(err == 1)) err = vrf_ip_local_out(net, sk, skb); return err; } static struct sk_buff *vrf_ip_out_direct(struct net_device *vrf_dev, struct sock *sk, struct sk_buff *skb) { struct net *net = dev_net(vrf_dev); int err; skb->dev = vrf_dev; err = nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, net, sk, skb, NULL, vrf_dev, vrf_ip_out_direct_finish); if (likely(err == 1)) err = vrf_output_direct(net, sk, skb); if (likely(err == 1)) return skb; return NULL; } static struct sk_buff *vrf_ip_out(struct net_device *vrf_dev, struct sock *sk, struct sk_buff *skb) { /* don't divert multicast or local broadcast */ if (ipv4_is_multicast(ip_hdr(skb)->daddr) || ipv4_is_lbcast(ip_hdr(skb)->daddr)) return skb; vrf_nf_set_untracked(skb); if (qdisc_tx_is_default(vrf_dev) || IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) return vrf_ip_out_direct(vrf_dev, sk, skb); return vrf_ip_out_redirect(vrf_dev, skb); } /* called with rcu lock held */ static struct sk_buff *vrf_l3_out(struct net_device *vrf_dev, struct sock *sk, struct sk_buff *skb, u16 proto) { switch (proto) { case AF_INET: return vrf_ip_out(vrf_dev, sk, skb); case AF_INET6: return vrf_ip6_out(vrf_dev, sk, skb); } return skb; } /* holding rtnl */ static void vrf_rtable_release(struct net_device *dev, struct net_vrf *vrf) { struct rtable *rth 
= rtnl_dereference(vrf->rth); struct net *net = dev_net(dev); struct dst_entry *dst; RCU_INIT_POINTER(vrf->rth, NULL); synchronize_rcu(); /* move dev in dst's to loopback so this VRF device can be deleted * - based on dst_ifdown */ if (rth) { dst = &rth->dst; netdev_ref_replace(dst->dev, net->loopback_dev, &dst->dev_tracker, GFP_KERNEL); dst->dev = net->loopback_dev; dst_release(dst); } } static int vrf_rtable_create(struct net_device *dev) { struct net_vrf *vrf = netdev_priv(dev); struct rtable *rth; if (!fib_new_table(dev_net(dev), vrf->tb_id)) return -ENOMEM; /* create a dst for routing packets out through a VRF device */ rth = rt_dst_alloc(dev, 0, RTN_UNICAST, 1); if (!rth) return -ENOMEM; rth->dst.output = vrf_output; rcu_assign_pointer(vrf->rth, rth); return 0; } /**************************** device handling ********************/ /* cycle interface to flush neighbor cache and move routes across tables */ static void cycle_netdev(struct net_device *dev, struct netlink_ext_ack *extack) { unsigned int flags = dev->flags; int ret; if (!netif_running(dev)) return; ret = dev_change_flags(dev, flags & ~IFF_UP, extack); if (ret >= 0) ret = dev_change_flags(dev, flags, extack); if (ret < 0) { netdev_err(dev, "Failed to cycle device %s; route tables might be wrong!\n", dev->name); } } static int do_vrf_add_slave(struct net_device *dev, struct net_device *port_dev, struct netlink_ext_ack *extack) { int ret; /* do not allow loopback device to be enslaved to a VRF. * The vrf device acts as the loopback for the vrf. */ if (port_dev == dev_net(dev)->loopback_dev) { NL_SET_ERR_MSG(extack, "Can not enslave loopback device to a VRF"); return -EOPNOTSUPP; } port_dev->priv_flags |= IFF_L3MDEV_SLAVE; ret = netdev_master_upper_dev_link(port_dev, dev, NULL, NULL, extack); if (ret < 0) goto err; cycle_netdev(port_dev, extack); return 0; err: port_dev->priv_flags &= ~IFF_L3MDEV_SLAVE; return ret; } static int vrf_add_slave(struct net_device *dev, struct net_device *port_dev, struct netlink_ext_ack *extack) { if (netif_is_l3_master(port_dev)) { NL_SET_ERR_MSG(extack, "Can not enslave an L3 master device to a VRF"); return -EINVAL; } if (netif_is_l3_slave(port_dev)) return -EINVAL; return do_vrf_add_slave(dev, port_dev, extack); } /* inverse of do_vrf_add_slave */ static int do_vrf_del_slave(struct net_device *dev, struct net_device *port_dev) { netdev_upper_dev_unlink(port_dev, dev); port_dev->priv_flags &= ~IFF_L3MDEV_SLAVE; cycle_netdev(port_dev, NULL); return 0; } static int vrf_del_slave(struct net_device *dev, struct net_device *port_dev) { return do_vrf_del_slave(dev, port_dev); } static void vrf_dev_uninit(struct net_device *dev) { struct net_vrf *vrf = netdev_priv(dev); vrf_rtable_release(dev, vrf); vrf_rt6_release(dev, vrf); } static int vrf_dev_init(struct net_device *dev) { struct net_vrf *vrf = netdev_priv(dev); /* create the default dst which points back to us */ if (vrf_rtable_create(dev) != 0) goto out_nomem; if (vrf_rt6_create(dev) != 0) goto out_rth; dev->flags = IFF_MASTER | IFF_NOARP; /* similarly, oper state is irrelevant; set to up to avoid confusion */ dev->operstate = IF_OPER_UP; netdev_lockdep_set_classes(dev); return 0; out_rth: vrf_rtable_release(dev, vrf); out_nomem: return -ENOMEM; } static const struct net_device_ops vrf_netdev_ops = { .ndo_init = vrf_dev_init, .ndo_uninit = vrf_dev_uninit, .ndo_start_xmit = vrf_xmit, .ndo_set_mac_address = eth_mac_addr, .ndo_add_slave = vrf_add_slave, .ndo_del_slave = vrf_del_slave, }; static u32 vrf_fib_table(const struct net_device *dev) { 
struct net_vrf *vrf = netdev_priv(dev); return vrf->tb_id; } static int vrf_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { kfree_skb(skb); return 0; } static struct sk_buff *vrf_rcv_nfhook(u8 pf, unsigned int hook, struct sk_buff *skb, struct net_device *dev) { struct net *net = dev_net(dev); if (nf_hook(pf, hook, net, NULL, skb, dev, NULL, vrf_rcv_finish) != 1) skb = NULL; /* kfree_skb(skb) handled by nf code */ return skb; } static int vrf_prepare_mac_header(struct sk_buff *skb, struct net_device *vrf_dev, u16 proto) { struct ethhdr *eth; int err; /* in general, we do not know if there is enough space in the head of * the packet for hosting the mac header. */ err = skb_cow_head(skb, LL_RESERVED_SPACE(vrf_dev)); if (unlikely(err)) /* no space in the skb head */ return -ENOBUFS; __skb_push(skb, ETH_HLEN); eth = (struct ethhdr *)skb->data; skb_reset_mac_header(skb); skb_reset_mac_len(skb); /* we set the ethernet destination and the source addresses to the * address of the VRF device. */ ether_addr_copy(eth->h_dest, vrf_dev->dev_addr); ether_addr_copy(eth->h_source, vrf_dev->dev_addr); eth->h_proto = htons(proto); /* the destination address of the Ethernet frame corresponds to the * address set on the VRF interface; therefore, the packet is intended * to be processed locally. */ skb->protocol = eth->h_proto; skb->pkt_type = PACKET_HOST; skb_postpush_rcsum(skb, skb->data, ETH_HLEN); skb_pull_inline(skb, ETH_HLEN); return 0; } /* prepare and add the mac header to the packet if it was not set previously. * In this way, packet sniffers such as tcpdump can parse the packet correctly. * If the mac header was already set, the original mac header is left * untouched and the function returns immediately. */ static int vrf_add_mac_header_if_unset(struct sk_buff *skb, struct net_device *vrf_dev, u16 proto, struct net_device *orig_dev) { if (skb_mac_header_was_set(skb) && dev_has_header(orig_dev)) return 0; return vrf_prepare_mac_header(skb, vrf_dev, proto); } #if IS_ENABLED(CONFIG_IPV6) /* neighbor handling is done with actual device; do not want * to flip skb->dev for those ndisc packets. This really fails * for multiple next protocols (e.g., NEXTHDR_HOP). But it is * a start. 
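 * ipv6_ndisc_frame() below only matches when ICMPv6 is the immediate
 * next header, and treats Router Solicitation/Advertisement, Neighbour
 * Solicitation/Advertisement and Redirect as NDISC traffic.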
*/ static bool ipv6_ndisc_frame(const struct sk_buff *skb) { const struct ipv6hdr *iph = ipv6_hdr(skb); bool rc = false; if (iph->nexthdr == NEXTHDR_ICMP) { const struct icmp6hdr *icmph; struct icmp6hdr _icmph; icmph = skb_header_pointer(skb, sizeof(*iph), sizeof(_icmph), &_icmph); if (!icmph) goto out; switch (icmph->icmp6_type) { case NDISC_ROUTER_SOLICITATION: case NDISC_ROUTER_ADVERTISEMENT: case NDISC_NEIGHBOUR_SOLICITATION: case NDISC_NEIGHBOUR_ADVERTISEMENT: case NDISC_REDIRECT: rc = true; break; } } out: return rc; } static struct rt6_info *vrf_ip6_route_lookup(struct net *net, const struct net_device *dev, struct flowi6 *fl6, int ifindex, const struct sk_buff *skb, int flags) { struct net_vrf *vrf = netdev_priv(dev); return ip6_pol_route(net, vrf->fib6_table, ifindex, fl6, skb, flags); } static void vrf_ip6_input_dst(struct sk_buff *skb, struct net_device *vrf_dev, int ifindex) { const struct ipv6hdr *iph = ipv6_hdr(skb); struct flowi6 fl6 = { .flowi6_iif = ifindex, .flowi6_mark = skb->mark, .flowi6_proto = iph->nexthdr, .daddr = iph->daddr, .saddr = iph->saddr, .flowlabel = ip6_flowinfo(iph), }; struct net *net = dev_net(vrf_dev); struct rt6_info *rt6; rt6 = vrf_ip6_route_lookup(net, vrf_dev, &fl6, ifindex, skb, RT6_LOOKUP_F_HAS_SADDR | RT6_LOOKUP_F_IFACE); if (unlikely(!rt6)) return; if (unlikely(&rt6->dst == &net->ipv6.ip6_null_entry->dst)) return; skb_dst_set(skb, &rt6->dst); } static struct sk_buff *vrf_ip6_rcv(struct net_device *vrf_dev, struct sk_buff *skb) { int orig_iif = skb->skb_iif; bool need_strict = rt6_need_strict(&ipv6_hdr(skb)->daddr); bool is_ndisc = ipv6_ndisc_frame(skb); /* loopback, multicast & non-ND link-local traffic; do not push through * packet taps again. Reset pkt_type for upper layers to process skb. * For non-loopback strict packets, determine the dst using the original * ifindex. */ if (skb->pkt_type == PACKET_LOOPBACK || (need_strict && !is_ndisc)) { skb->dev = vrf_dev; skb->skb_iif = vrf_dev->ifindex; IP6CB(skb)->flags |= IP6SKB_L3SLAVE; if (skb->pkt_type == PACKET_LOOPBACK) skb->pkt_type = PACKET_HOST; else vrf_ip6_input_dst(skb, vrf_dev, orig_iif); goto out; } /* if packet is NDISC then keep the ingress interface */ if (!is_ndisc) { struct net_device *orig_dev = skb->dev; vrf_rx_stats(vrf_dev, skb->len); skb->dev = vrf_dev; skb->skb_iif = vrf_dev->ifindex; if (!list_empty(&vrf_dev->ptype_all)) { int err; err = vrf_add_mac_header_if_unset(skb, vrf_dev, ETH_P_IPV6, orig_dev); if (likely(!err)) { skb_push(skb, skb->mac_len); dev_queue_xmit_nit(skb, vrf_dev); skb_pull(skb, skb->mac_len); } } IP6CB(skb)->flags |= IP6SKB_L3SLAVE; } if (need_strict) vrf_ip6_input_dst(skb, vrf_dev, orig_iif); skb = vrf_rcv_nfhook(NFPROTO_IPV6, NF_INET_PRE_ROUTING, skb, vrf_dev); out: return skb; } #else static struct sk_buff *vrf_ip6_rcv(struct net_device *vrf_dev, struct sk_buff *skb) { return skb; } #endif static struct sk_buff *vrf_ip_rcv(struct net_device *vrf_dev, struct sk_buff *skb) { struct net_device *orig_dev = skb->dev; skb->dev = vrf_dev; skb->skb_iif = vrf_dev->ifindex; IPCB(skb)->flags |= IPSKB_L3SLAVE; if (ipv4_is_multicast(ip_hdr(skb)->daddr)) goto out; /* loopback traffic; do not push through packet taps again. 
* Reset pkt_type for upper layers to process skb */ if (skb->pkt_type == PACKET_LOOPBACK) { skb->pkt_type = PACKET_HOST; goto out; } vrf_rx_stats(vrf_dev, skb->len); if (!list_empty(&vrf_dev->ptype_all)) { int err; err = vrf_add_mac_header_if_unset(skb, vrf_dev, ETH_P_IP, orig_dev); if (likely(!err)) { skb_push(skb, skb->mac_len); dev_queue_xmit_nit(skb, vrf_dev); skb_pull(skb, skb->mac_len); } } skb = vrf_rcv_nfhook(NFPROTO_IPV4, NF_INET_PRE_ROUTING, skb, vrf_dev); out: return skb; } /* called with rcu lock held */ static struct sk_buff *vrf_l3_rcv(struct net_device *vrf_dev, struct sk_buff *skb, u16 proto) { switch (proto) { case AF_INET: return vrf_ip_rcv(vrf_dev, skb); case AF_INET6: return vrf_ip6_rcv(vrf_dev, skb); } return skb; } #if IS_ENABLED(CONFIG_IPV6) /* send to link-local or multicast address via interface enslaved to * VRF device. Force lookup to VRF table without changing flow struct * Note: Caller to this function must hold rcu_read_lock() and no refcnt * is taken on the dst by this function. */ static struct dst_entry *vrf_link_scope_lookup(const struct net_device *dev, struct flowi6 *fl6) { struct net *net = dev_net(dev); int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_DST_NOREF; struct dst_entry *dst = NULL; struct rt6_info *rt; /* VRF device does not have a link-local address and * sending packets to link-local or mcast addresses over * a VRF device does not make sense */ if (fl6->flowi6_oif == dev->ifindex) { dst = &net->ipv6.ip6_null_entry->dst; return dst; } if (!ipv6_addr_any(&fl6->saddr)) flags |= RT6_LOOKUP_F_HAS_SADDR; rt = vrf_ip6_route_lookup(net, dev, fl6, fl6->flowi6_oif, NULL, flags); if (rt) dst = &rt->dst; return dst; } #endif static const struct l3mdev_ops vrf_l3mdev_ops = { .l3mdev_fib_table = vrf_fib_table, .l3mdev_l3_rcv = vrf_l3_rcv, .l3mdev_l3_out = vrf_l3_out, #if IS_ENABLED(CONFIG_IPV6) .l3mdev_link_scope_lookup = vrf_link_scope_lookup, #endif }; static void vrf_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) { strscpy(info->driver, DRV_NAME, sizeof(info->driver)); strscpy(info->version, DRV_VERSION, sizeof(info->version)); } static const struct ethtool_ops vrf_ethtool_ops = { .get_drvinfo = vrf_get_drvinfo, }; static inline size_t vrf_fib_rule_nl_size(void) { size_t sz; sz = NLMSG_ALIGN(sizeof(struct fib_rule_hdr)); sz += nla_total_size(sizeof(u8)); /* FRA_L3MDEV */ sz += nla_total_size(sizeof(u32)); /* FRA_PRIORITY */ sz += nla_total_size(sizeof(u8)); /* FRA_PROTOCOL */ return sz; } static int vrf_fib_rule(const struct net_device *dev, __u8 family, bool add_it) { struct fib_rule_hdr *frh; struct nlmsghdr *nlh; struct sk_buff *skb; int err; if ((family == AF_INET6 || family == RTNL_FAMILY_IP6MR) && !ipv6_mod_enabled()) return 0; skb = nlmsg_new(vrf_fib_rule_nl_size(), GFP_KERNEL); if (!skb) return -ENOMEM; nlh = nlmsg_put(skb, 0, 0, 0, sizeof(*frh), 0); if (!nlh) goto nla_put_failure; /* rule only needs to appear once */ nlh->nlmsg_flags |= NLM_F_EXCL; frh = nlmsg_data(nlh); memset(frh, 0, sizeof(*frh)); frh->family = family; frh->action = FR_ACT_TO_TBL; if (nla_put_u8(skb, FRA_PROTOCOL, RTPROT_KERNEL)) goto nla_put_failure; if (nla_put_u8(skb, FRA_L3MDEV, 1)) goto nla_put_failure; if (nla_put_u32(skb, FRA_PRIORITY, FIB_RULE_PREF)) goto nla_put_failure; nlmsg_end(skb, nlh); /* fib_nl_{new,del}rule handling looks for net from skb->sk */ skb->sk = dev_net(dev)->rtnl; if (add_it) { err = fib_nl_newrule(skb, nlh, NULL); if (err == -EEXIST) err = 0; } else { err = fib_nl_delrule(skb, nlh, NULL); if (err == -ENOENT) err = 0; } 
nlmsg_free(skb); return err; nla_put_failure: nlmsg_free(skb); return -EMSGSIZE; } static int vrf_add_fib_rules(const struct net_device *dev) { int err; err = vrf_fib_rule(dev, AF_INET, true); if (err < 0) goto out_err; err = vrf_fib_rule(dev, AF_INET6, true); if (err < 0) goto ipv6_err; #if IS_ENABLED(CONFIG_IP_MROUTE_MULTIPLE_TABLES) err = vrf_fib_rule(dev, RTNL_FAMILY_IPMR, true); if (err < 0) goto ipmr_err; #endif #if IS_ENABLED(CONFIG_IPV6_MROUTE_MULTIPLE_TABLES) err = vrf_fib_rule(dev, RTNL_FAMILY_IP6MR, true); if (err < 0) goto ip6mr_err; #endif return 0; #if IS_ENABLED(CONFIG_IPV6_MROUTE_MULTIPLE_TABLES) ip6mr_err: vrf_fib_rule(dev, RTNL_FAMILY_IPMR, false); #endif #if IS_ENABLED(CONFIG_IP_MROUTE_MULTIPLE_TABLES) ipmr_err: vrf_fib_rule(dev, AF_INET6, false); #endif ipv6_err: vrf_fib_rule(dev, AF_INET, false); out_err: netdev_err(dev, "Failed to add FIB rules.\n"); return err; } static void vrf_setup(struct net_device *dev) { ether_setup(dev); /* Initialize the device structure. */ dev->netdev_ops = &vrf_netdev_ops; dev->l3mdev_ops = &vrf_l3mdev_ops; dev->ethtool_ops = &vrf_ethtool_ops; dev->needs_free_netdev = true; /* Fill in device structure with ethernet-generic values. */ eth_hw_addr_random(dev); /* don't acquire vrf device's netif_tx_lock when transmitting */ dev->lltx = true; /* don't allow vrf devices to change network namespaces. */ dev->netns_local = true; /* does not make sense for a VLAN to be added to a vrf device */ dev->features |= NETIF_F_VLAN_CHALLENGED; /* enable offload features */ dev->features |= NETIF_F_GSO_SOFTWARE; dev->features |= NETIF_F_RXCSUM | NETIF_F_HW_CSUM | NETIF_F_SCTP_CRC; dev->features |= NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA; dev->hw_features = dev->features; dev->hw_enc_features = dev->features; /* default to no qdisc; user can add if desired */ dev->priv_flags |= IFF_NO_QUEUE; dev->priv_flags |= IFF_NO_RX_HANDLER; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; /* VRF devices do not care about MTU, but if the MTU is set * too low then the ipv4 and ipv6 protocols are disabled * which breaks networking. 
*/ dev->min_mtu = IPV6_MIN_MTU; dev->max_mtu = IP6_MAX_MTU; dev->mtu = dev->max_mtu; dev->pcpu_stat_type = NETDEV_PCPU_STAT_DSTATS; } static int vrf_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { if (tb[IFLA_ADDRESS]) { if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) { NL_SET_ERR_MSG(extack, "Invalid hardware address"); return -EINVAL; } if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) { NL_SET_ERR_MSG(extack, "Invalid hardware address"); return -EADDRNOTAVAIL; } } return 0; } static void vrf_dellink(struct net_device *dev, struct list_head *head) { struct net_device *port_dev; struct list_head *iter; netdev_for_each_lower_dev(dev, port_dev, iter) vrf_del_slave(dev, port_dev); vrf_map_unregister_dev(dev); unregister_netdevice_queue(dev, head); } static int vrf_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct net_vrf *vrf = netdev_priv(dev); struct netns_vrf *nn_vrf; bool *add_fib_rules; struct net *net; int err; if (!data || !data[IFLA_VRF_TABLE]) { NL_SET_ERR_MSG(extack, "VRF table id is missing"); return -EINVAL; } vrf->tb_id = nla_get_u32(data[IFLA_VRF_TABLE]); if (vrf->tb_id == RT_TABLE_UNSPEC) { NL_SET_ERR_MSG_ATTR(extack, data[IFLA_VRF_TABLE], "Invalid VRF table id"); return -EINVAL; } dev->priv_flags |= IFF_L3MDEV_MASTER; err = register_netdevice(dev); if (err) goto out; /* mapping between table_id and vrf; * note: such binding could not be done in the dev init function * because dev->ifindex id is not available yet. */ vrf->ifindex = dev->ifindex; err = vrf_map_register_dev(dev, extack); if (err) { unregister_netdevice(dev); goto out; } net = dev_net(dev); nn_vrf = net_generic(net, vrf_net_id); add_fib_rules = &nn_vrf->add_fib_rules; if (*add_fib_rules) { err = vrf_add_fib_rules(dev); if (err) { vrf_map_unregister_dev(dev); unregister_netdevice(dev); goto out; } *add_fib_rules = false; } out: return err; } static size_t vrf_nl_getsize(const struct net_device *dev) { return nla_total_size(sizeof(u32)); /* IFLA_VRF_TABLE */ } static int vrf_fillinfo(struct sk_buff *skb, const struct net_device *dev) { struct net_vrf *vrf = netdev_priv(dev); return nla_put_u32(skb, IFLA_VRF_TABLE, vrf->tb_id); } static size_t vrf_get_slave_size(const struct net_device *bond_dev, const struct net_device *slave_dev) { return nla_total_size(sizeof(u32)); /* IFLA_VRF_PORT_TABLE */ } static int vrf_fill_slave_info(struct sk_buff *skb, const struct net_device *vrf_dev, const struct net_device *slave_dev) { struct net_vrf *vrf = netdev_priv(vrf_dev); if (nla_put_u32(skb, IFLA_VRF_PORT_TABLE, vrf->tb_id)) return -EMSGSIZE; return 0; } static const struct nla_policy vrf_nl_policy[IFLA_VRF_MAX + 1] = { [IFLA_VRF_TABLE] = { .type = NLA_U32 }, }; static struct rtnl_link_ops vrf_link_ops __read_mostly = { .kind = DRV_NAME, .priv_size = sizeof(struct net_vrf), .get_size = vrf_nl_getsize, .policy = vrf_nl_policy, .validate = vrf_validate, .fill_info = vrf_fillinfo, .get_slave_size = vrf_get_slave_size, .fill_slave_info = vrf_fill_slave_info, .newlink = vrf_newlink, .dellink = vrf_dellink, .setup = vrf_setup, .maxtype = IFLA_VRF_MAX, }; static int vrf_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); /* only care about unregister events to drop slave references */ if (event == NETDEV_UNREGISTER) { struct net_device *vrf_dev; if (!netif_is_l3_slave(dev)) goto out; vrf_dev = 
netdev_master_upper_dev_get(dev); vrf_del_slave(vrf_dev, dev); } out: return NOTIFY_DONE; } static struct notifier_block vrf_notifier_block __read_mostly = { .notifier_call = vrf_device_event, }; static int vrf_map_init(struct vrf_map *vmap) { spin_lock_init(&vmap->vmap_lock); hash_init(vmap->ht); vmap->strict_mode = false; return 0; } #ifdef CONFIG_SYSCTL static bool vrf_strict_mode(struct vrf_map *vmap) { bool strict_mode; vrf_map_lock(vmap); strict_mode = vmap->strict_mode; vrf_map_unlock(vmap); return strict_mode; } static int vrf_strict_mode_change(struct vrf_map *vmap, bool new_mode) { bool *cur_mode; int res = 0; vrf_map_lock(vmap); cur_mode = &vmap->strict_mode; if (*cur_mode == new_mode) goto unlock; if (*cur_mode) { /* disable strict mode */ *cur_mode = false; } else { if (vmap->shared_tables) { /* we cannot allow strict_mode because there are some * vrfs that share one or more tables. */ res = -EBUSY; goto unlock; } /* no tables are shared among vrfs, so we can go back * to 1:1 association between a vrf with its table. */ *cur_mode = true; } unlock: vrf_map_unlock(vmap); return res; } static int vrf_shared_table_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = (struct net *)table->extra1; struct vrf_map *vmap = netns_vrf_map(net); int proc_strict_mode = 0; struct ctl_table tmp = { .procname = table->procname, .data = &proc_strict_mode, .maxlen = sizeof(int), .mode = table->mode, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }; int ret; if (!write) proc_strict_mode = vrf_strict_mode(vmap); ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); if (write && ret == 0) ret = vrf_strict_mode_change(vmap, (bool)proc_strict_mode); return ret; } static const struct ctl_table vrf_table[] = { { .procname = "strict_mode", .data = NULL, .maxlen = sizeof(int), .mode = 0644, .proc_handler = vrf_shared_table_handler, /* set by the vrf_netns_init */ .extra1 = NULL, }, }; static int vrf_netns_init_sysctl(struct net *net, struct netns_vrf *nn_vrf) { struct ctl_table *table; table = kmemdup(vrf_table, sizeof(vrf_table), GFP_KERNEL); if (!table) return -ENOMEM; /* init the extra1 parameter with the reference to current netns */ table[0].extra1 = net; nn_vrf->ctl_hdr = register_net_sysctl_sz(net, "net/vrf", table, ARRAY_SIZE(vrf_table)); if (!nn_vrf->ctl_hdr) { kfree(table); return -ENOMEM; } return 0; } static void vrf_netns_exit_sysctl(struct net *net) { struct netns_vrf *nn_vrf = net_generic(net, vrf_net_id); const struct ctl_table *table; table = nn_vrf->ctl_hdr->ctl_table_arg; unregister_net_sysctl_table(nn_vrf->ctl_hdr); kfree(table); } #else static int vrf_netns_init_sysctl(struct net *net, struct netns_vrf *nn_vrf) { return 0; } static void vrf_netns_exit_sysctl(struct net *net) { } #endif /* Initialize per network namespace state */ static int __net_init vrf_netns_init(struct net *net) { struct netns_vrf *nn_vrf = net_generic(net, vrf_net_id); nn_vrf->add_fib_rules = true; vrf_map_init(&nn_vrf->vmap); return vrf_netns_init_sysctl(net, nn_vrf); } static void __net_exit vrf_netns_exit(struct net *net) { vrf_netns_exit_sysctl(net); } static struct pernet_operations vrf_net_ops __net_initdata = { .init = vrf_netns_init, .exit = vrf_netns_exit, .id = &vrf_net_id, .size = sizeof(struct netns_vrf), }; static int __init vrf_init_module(void) { int rc; register_netdevice_notifier(&vrf_notifier_block); rc = register_pernet_subsys(&vrf_net_ops); if (rc < 0) goto error; rc = l3mdev_table_lookup_register(L3MDEV_TYPE_VRF, 
vrf_ifindex_lookup_by_table_id); if (rc < 0) goto unreg_pernet; rc = rtnl_link_register(&vrf_link_ops); if (rc < 0) goto table_lookup_unreg; return 0; table_lookup_unreg: l3mdev_table_lookup_unregister(L3MDEV_TYPE_VRF, vrf_ifindex_lookup_by_table_id); unreg_pernet: unregister_pernet_subsys(&vrf_net_ops); error: unregister_netdevice_notifier(&vrf_notifier_block); return rc; } module_init(vrf_init_module); MODULE_AUTHOR("Shrijeet Mukherjee, David Ahern"); MODULE_DESCRIPTION("Device driver to instantiate VRF domains"); MODULE_LICENSE("GPL"); MODULE_ALIAS_RTNL_LINK(DRV_NAME); MODULE_VERSION(DRV_VERSION);
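The VRF driver above keys routing decisions off the device's table id; the usual way for an application to opt a socket into a VRF is to bind the socket to the VRF device so lookups hit the VRF's table. Below is a minimal userspace sketch of that pattern (not part of the driver); the device name "vrf-blue" is a made-up example, and the bind typically requires CAP_NET_RAW.

/*
 * Hedged userspace sketch: bind a UDP socket to a hypothetical VRF device
 * so its route lookups are scoped to that VRF's table.
 */
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <unistd.h>

int main(void)
{
	const char vrf_name[] = "vrf-blue";	/* hypothetical VRF device name */
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	if (fd < 0) {
		perror("socket");
		return 1;
	}

	/* Scope the socket to the VRF device; typically needs CAP_NET_RAW. */
	if (setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE,
		       vrf_name, strlen(vrf_name)) < 0) {
		perror("setsockopt(SO_BINDTODEVICE)");
		close(fd);
		return 1;
	}

	printf("socket bound to %s\n", vrf_name);
	close(fd);
	return 0;
}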
// SPDX-License-Identifier: GPL-2.0-or-later /* * Handle bridge arp/nd proxy/suppress * * Copyright (C) 2017 Cumulus Networks * Copyright (c) 2017 Roopa Prabhu <roopa@cumulusnetworks.com> * * Authors: * Roopa Prabhu <roopa@cumulusnetworks.com> */ #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/neighbour.h> #include <net/arp.h> #include <linux/if_vlan.h> #include <linux/inetdevice.h> #include <net/addrconf.h> #include <net/ipv6_stubs.h> #if IS_ENABLED(CONFIG_IPV6) #include <net/ip6_checksum.h> #endif #include "br_private.h" void br_recalculate_neigh_suppress_enabled(struct net_bridge *br) { struct net_bridge_port *p; bool neigh_suppress = false; list_for_each_entry(p, &br->port_list, list) { if (p->flags & (BR_NEIGH_SUPPRESS | BR_NEIGH_VLAN_SUPPRESS)) { neigh_suppress = true; break; } } br_opt_toggle(br, BROPT_NEIGH_SUPPRESS_ENABLED, neigh_suppress); } #if IS_ENABLED(CONFIG_INET) static void br_arp_send(struct net_bridge *br, struct net_bridge_port *p, struct net_device *dev, __be32 dest_ip, __be32 src_ip, const unsigned char *dest_hw, const unsigned char *src_hw, const unsigned char *target_hw, __be16 vlan_proto, u16 vlan_tci) { struct net_bridge_vlan_group *vg; struct sk_buff *skb; u16 pvid; netdev_dbg(dev, "arp send dev %s dst %pI4 dst_hw %pM src %pI4 src_hw %pM\n", dev->name, &dest_ip, dest_hw, &src_ip, src_hw); if (!vlan_tci) { arp_send(ARPOP_REPLY, ETH_P_ARP, dest_ip, dev, src_ip, dest_hw, src_hw, target_hw); return; } skb = arp_create(ARPOP_REPLY, ETH_P_ARP, dest_ip, dev, src_ip, dest_hw, src_hw, target_hw); if (!skb) return; if (p) vg =
nbp_vlan_group_rcu(p); else vg = br_vlan_group_rcu(br); pvid = br_get_pvid(vg); if (pvid == (vlan_tci & VLAN_VID_MASK)) vlan_tci = 0; if (vlan_tci) __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci); if (p) { arp_xmit(skb); } else { skb_reset_mac_header(skb); __skb_pull(skb, skb_network_offset(skb)); skb->ip_summed = CHECKSUM_UNNECESSARY; skb->pkt_type = PACKET_HOST; netif_rx(skb); } } static int br_chk_addr_ip(struct net_device *dev, struct netdev_nested_priv *priv) { __be32 ip = *(__be32 *)priv->data; struct in_device *in_dev; __be32 addr = 0; in_dev = __in_dev_get_rcu(dev); if (in_dev) addr = inet_confirm_addr(dev_net(dev), in_dev, 0, ip, RT_SCOPE_HOST); if (addr == ip) return 1; return 0; } static bool br_is_local_ip(struct net_device *dev, __be32 ip) { struct netdev_nested_priv priv = { .data = (void *)&ip, }; if (br_chk_addr_ip(dev, &priv)) return true; /* check if ip is configured on upper dev */ if (netdev_walk_all_upper_dev_rcu(dev, br_chk_addr_ip, &priv)) return true; return false; } void br_do_proxy_suppress_arp(struct sk_buff *skb, struct net_bridge *br, u16 vid, struct net_bridge_port *p) { struct net_device *dev = br->dev; struct net_device *vlandev = dev; struct neighbour *n; struct arphdr *parp; u8 *arpptr, *sha; __be32 sip, tip; BR_INPUT_SKB_CB(skb)->proxyarp_replied = 0; if ((dev->flags & IFF_NOARP) || !pskb_may_pull(skb, arp_hdr_len(dev))) return; parp = arp_hdr(skb); if (parp->ar_pro != htons(ETH_P_IP) || parp->ar_hln != dev->addr_len || parp->ar_pln != 4) return; arpptr = (u8 *)parp + sizeof(struct arphdr); sha = arpptr; arpptr += dev->addr_len; /* sha */ memcpy(&sip, arpptr, sizeof(sip)); arpptr += sizeof(sip); arpptr += dev->addr_len; /* tha */ memcpy(&tip, arpptr, sizeof(tip)); if (ipv4_is_loopback(tip) || ipv4_is_multicast(tip)) return; if (br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED)) { if (br_is_neigh_suppress_enabled(p, vid)) return; if (parp->ar_op != htons(ARPOP_RREQUEST) && parp->ar_op != htons(ARPOP_RREPLY) && (ipv4_is_zeronet(sip) || sip == tip)) { /* prevent flooding to neigh suppress ports */ BR_INPUT_SKB_CB(skb)->proxyarp_replied = 1; return; } } if (parp->ar_op != htons(ARPOP_REQUEST)) return; if (vid != 0) { vlandev = __vlan_find_dev_deep_rcu(br->dev, skb->vlan_proto, vid); if (!vlandev) return; } if (br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED) && br_is_local_ip(vlandev, tip)) { /* its our local ip, so don't proxy reply * and don't forward to neigh suppress ports */ BR_INPUT_SKB_CB(skb)->proxyarp_replied = 1; return; } n = neigh_lookup(&arp_tbl, &tip, vlandev); if (n) { struct net_bridge_fdb_entry *f; if (!(READ_ONCE(n->nud_state) & NUD_VALID)) { neigh_release(n); return; } f = br_fdb_find_rcu(br, n->ha, vid); if (f) { bool replied = false; if ((p && (p->flags & BR_PROXYARP)) || (f->dst && (f->dst->flags & BR_PROXYARP_WIFI)) || br_is_neigh_suppress_enabled(f->dst, vid)) { if (!vid) br_arp_send(br, p, skb->dev, sip, tip, sha, n->ha, sha, 0, 0); else br_arp_send(br, p, skb->dev, sip, tip, sha, n->ha, sha, skb->vlan_proto, skb_vlan_tag_get(skb)); replied = true; } /* If we have replied or as long as we know the * mac, indicate to arp replied */ if (replied || br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED)) BR_INPUT_SKB_CB(skb)->proxyarp_replied = 1; } neigh_release(n); } } #endif #if IS_ENABLED(CONFIG_IPV6) struct nd_msg *br_is_nd_neigh_msg(struct sk_buff *skb, struct nd_msg *msg) { struct nd_msg *m; m = skb_header_pointer(skb, skb_network_offset(skb) + sizeof(struct ipv6hdr), sizeof(*msg), msg); if (!m) return NULL; if (m->icmph.icmp6_code != 0 || 
(m->icmph.icmp6_type != NDISC_NEIGHBOUR_SOLICITATION && m->icmph.icmp6_type != NDISC_NEIGHBOUR_ADVERTISEMENT)) return NULL; return m; } static void br_nd_send(struct net_bridge *br, struct net_bridge_port *p, struct sk_buff *request, struct neighbour *n, __be16 vlan_proto, u16 vlan_tci, struct nd_msg *ns) { struct net_device *dev = request->dev; struct net_bridge_vlan_group *vg; struct sk_buff *reply; struct nd_msg *na; struct ipv6hdr *pip6; int na_olen = 8; /* opt hdr + ETH_ALEN for target */ int ns_olen; int i, len; u8 *daddr; u16 pvid; if (!dev) return; len = LL_RESERVED_SPACE(dev) + sizeof(struct ipv6hdr) + sizeof(*na) + na_olen + dev->needed_tailroom; reply = alloc_skb(len, GFP_ATOMIC); if (!reply) return; reply->protocol = htons(ETH_P_IPV6); reply->dev = dev; skb_reserve(reply, LL_RESERVED_SPACE(dev)); skb_push(reply, sizeof(struct ethhdr)); skb_set_mac_header(reply, 0); daddr = eth_hdr(request)->h_source; /* Do we need option processing ? */ ns_olen = request->len - (skb_network_offset(request) + sizeof(struct ipv6hdr)) - sizeof(*ns); for (i = 0; i < ns_olen - 1; i += (ns->opt[i + 1] << 3)) { if (!ns->opt[i + 1]) { kfree_skb(reply); return; } if (ns->opt[i] == ND_OPT_SOURCE_LL_ADDR) { daddr = ns->opt + i + sizeof(struct nd_opt_hdr); break; } } /* Ethernet header */ ether_addr_copy(eth_hdr(reply)->h_dest, daddr); ether_addr_copy(eth_hdr(reply)->h_source, n->ha); eth_hdr(reply)->h_proto = htons(ETH_P_IPV6); reply->protocol = htons(ETH_P_IPV6); skb_pull(reply, sizeof(struct ethhdr)); skb_set_network_header(reply, 0); skb_put(reply, sizeof(struct ipv6hdr)); /* IPv6 header */ pip6 = ipv6_hdr(reply); memset(pip6, 0, sizeof(struct ipv6hdr)); pip6->version = 6; pip6->priority = ipv6_hdr(request)->priority; pip6->nexthdr = IPPROTO_ICMPV6; pip6->hop_limit = 255; pip6->daddr = ipv6_hdr(request)->saddr; pip6->saddr = *(struct in6_addr *)n->primary_key; skb_pull(reply, sizeof(struct ipv6hdr)); skb_set_transport_header(reply, 0); na = (struct nd_msg *)skb_put(reply, sizeof(*na) + na_olen); /* Neighbor Advertisement */ memset(na, 0, sizeof(*na) + na_olen); na->icmph.icmp6_type = NDISC_NEIGHBOUR_ADVERTISEMENT; na->icmph.icmp6_router = (n->flags & NTF_ROUTER) ? 
1 : 0; na->icmph.icmp6_override = 1; na->icmph.icmp6_solicited = 1; na->target = ns->target; ether_addr_copy(&na->opt[2], n->ha); na->opt[0] = ND_OPT_TARGET_LL_ADDR; na->opt[1] = na_olen >> 3; na->icmph.icmp6_cksum = csum_ipv6_magic(&pip6->saddr, &pip6->daddr, sizeof(*na) + na_olen, IPPROTO_ICMPV6, csum_partial(na, sizeof(*na) + na_olen, 0)); pip6->payload_len = htons(sizeof(*na) + na_olen); skb_push(reply, sizeof(struct ipv6hdr)); skb_push(reply, sizeof(struct ethhdr)); reply->ip_summed = CHECKSUM_UNNECESSARY; if (p) vg = nbp_vlan_group_rcu(p); else vg = br_vlan_group_rcu(br); pvid = br_get_pvid(vg); if (pvid == (vlan_tci & VLAN_VID_MASK)) vlan_tci = 0; if (vlan_tci) __vlan_hwaccel_put_tag(reply, vlan_proto, vlan_tci); netdev_dbg(dev, "nd send dev %s dst %pI6 dst_hw %pM src %pI6 src_hw %pM\n", dev->name, &pip6->daddr, daddr, &pip6->saddr, n->ha); if (p) { dev_queue_xmit(reply); } else { skb_reset_mac_header(reply); __skb_pull(reply, skb_network_offset(reply)); reply->ip_summed = CHECKSUM_UNNECESSARY; reply->pkt_type = PACKET_HOST; netif_rx(reply); } } static int br_chk_addr_ip6(struct net_device *dev, struct netdev_nested_priv *priv) { struct in6_addr *addr = (struct in6_addr *)priv->data; if (ipv6_chk_addr(dev_net(dev), addr, dev, 0)) return 1; return 0; } static bool br_is_local_ip6(struct net_device *dev, struct in6_addr *addr) { struct netdev_nested_priv priv = { .data = (void *)addr, }; if (br_chk_addr_ip6(dev, &priv)) return true; /* check if ip is configured on upper dev */ if (netdev_walk_all_upper_dev_rcu(dev, br_chk_addr_ip6, &priv)) return true; return false; } void br_do_suppress_nd(struct sk_buff *skb, struct net_bridge *br, u16 vid, struct net_bridge_port *p, struct nd_msg *msg) { struct net_device *dev = br->dev; struct net_device *vlandev = NULL; struct in6_addr *saddr, *daddr; struct ipv6hdr *iphdr; struct neighbour *n; BR_INPUT_SKB_CB(skb)->proxyarp_replied = 0; if (br_is_neigh_suppress_enabled(p, vid)) return; if (msg->icmph.icmp6_type == NDISC_NEIGHBOUR_ADVERTISEMENT && !msg->icmph.icmp6_solicited) { /* prevent flooding to neigh suppress ports */ BR_INPUT_SKB_CB(skb)->proxyarp_replied = 1; return; } if (msg->icmph.icmp6_type != NDISC_NEIGHBOUR_SOLICITATION) return; iphdr = ipv6_hdr(skb); saddr = &iphdr->saddr; daddr = &iphdr->daddr; if (ipv6_addr_any(saddr) || !ipv6_addr_cmp(saddr, daddr)) { /* prevent flooding to neigh suppress ports */ BR_INPUT_SKB_CB(skb)->proxyarp_replied = 1; return; } if (vid != 0) { /* build neigh table lookup on the vlan device */ vlandev = __vlan_find_dev_deep_rcu(br->dev, skb->vlan_proto, vid); if (!vlandev) return; } else { vlandev = dev; } if (br_is_local_ip6(vlandev, &msg->target)) { /* its our own ip, so don't proxy reply * and don't forward to arp suppress ports */ BR_INPUT_SKB_CB(skb)->proxyarp_replied = 1; return; } n = neigh_lookup(ipv6_stub->nd_tbl, &msg->target, vlandev); if (n) { struct net_bridge_fdb_entry *f; if (!(READ_ONCE(n->nud_state) & NUD_VALID)) { neigh_release(n); return; } f = br_fdb_find_rcu(br, n->ha, vid); if (f) { bool replied = false; if (br_is_neigh_suppress_enabled(f->dst, vid)) { if (vid != 0) br_nd_send(br, p, skb, n, skb->vlan_proto, skb_vlan_tag_get(skb), msg); else br_nd_send(br, p, skb, n, 0, 0, msg); replied = true; } /* If we have replied or as long as we know the * mac, indicate to NEIGH_SUPPRESS ports that we * have replied */ if (replied || br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED)) BR_INPUT_SKB_CB(skb)->proxyarp_replied = 1; } neigh_release(n); } } #endif bool br_is_neigh_suppress_enabled(const 
struct net_bridge_port *p, u16 vid) { if (!p) return false; if (!vid) return !!(p->flags & BR_NEIGH_SUPPRESS); if (p->flags & BR_NEIGH_VLAN_SUPPRESS) { struct net_bridge_vlan_group *vg = nbp_vlan_group_rcu(p); struct net_bridge_vlan *v; v = br_vlan_find(vg, vid); if (!v) return false; return !!(v->priv_flags & BR_VLFLAG_NEIGH_SUPPRESS_ENABLED); } else { return !!(p->flags & BR_NEIGH_SUPPRESS); } }
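br_do_proxy_suppress_arp() above pulls the sender/target addresses out of the ARP payload by stepping a pointer past each field (sha, sip, tha, tip) rather than mapping a struct over the variable-length part. Below is a small userspace sketch of the same walk over a hand-built Ethernet/IPv4 ARP request; the packet bytes and the 192.0.2.x addresses are made up for illustration.

/*
 * Hedged userspace sketch: walk an ARP payload the same way the bridge
 * suppress code does (fixed 8-byte header, then sha, sip, tha, tip).
 */
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <netinet/in.h>
#include <arpa/inet.h>

#define ETH_ALEN	6
#define ARP_FIXED_HDR	8	/* ar_hrd, ar_pro, ar_hln, ar_pln, ar_op */

int main(void)
{
	/* ARP request: who-has 192.0.2.2 tell 192.0.2.1 (example addresses) */
	const uint8_t pkt[ARP_FIXED_HDR + 2 * (ETH_ALEN + 4)] = {
		0x00, 0x01, 0x08, 0x00, ETH_ALEN, 4, 0x00, 0x01,
		0x02, 0x00, 0x00, 0x00, 0x00, 0x01,	/* sha */
		192, 0, 2, 1,				/* sip */
		0x00, 0x00, 0x00, 0x00, 0x00, 0x00,	/* tha */
		192, 0, 2, 2,				/* tip */
	};
	const uint8_t *p = pkt + ARP_FIXED_HDR;
	const uint8_t *sha = p;
	struct in_addr sip, tip;

	p += ETH_ALEN;		/* past sha */
	memcpy(&sip, p, 4);	/* sender IPv4 address */
	p += 4;			/* past sip */
	p += ETH_ALEN;		/* past tha */
	memcpy(&tip, p, 4);	/* target IPv4 address */

	printf("sha=%02x:%02x:... sip=%s", sha[0], sha[1], inet_ntoa(sip));
	printf(" tip=%s\n", inet_ntoa(tip));
	return 0;
}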
// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2008 IBM Corporation * * Author: Mimi Zohar <zohar@us.ibm.com> * * File: ima_api.c * Implements must_appraise_or_measure, collect_measurement, * appraise_measurement, store_measurement and store_template.
*/ #include <linux/slab.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/xattr.h> #include <linux/evm.h> #include <linux/fsverity.h> #include "ima.h" /* * ima_free_template_entry - free an existing template entry */ void ima_free_template_entry(struct ima_template_entry *entry) { int i; for (i = 0; i < entry->template_desc->num_fields; i++) kfree(entry->template_data[i].data); kfree(entry->digests); kfree(entry); } /* * ima_alloc_init_template - create and initialize a new template entry */ int ima_alloc_init_template(struct ima_event_data *event_data, struct ima_template_entry **entry, struct ima_template_desc *desc) { struct ima_template_desc *template_desc; struct tpm_digest *digests; int i, result = 0; if (desc) template_desc = desc; else template_desc = ima_template_desc_current(); *entry = kzalloc(struct_size(*entry, template_data, template_desc->num_fields), GFP_NOFS); if (!*entry) return -ENOMEM; digests = kcalloc(NR_BANKS(ima_tpm_chip) + ima_extra_slots, sizeof(*digests), GFP_NOFS); if (!digests) { kfree(*entry); *entry = NULL; return -ENOMEM; } (*entry)->digests = digests; (*entry)->template_desc = template_desc; for (i = 0; i < template_desc->num_fields; i++) { const struct ima_template_field *field = template_desc->fields[i]; u32 len; result = field->field_init(event_data, &((*entry)->template_data[i])); if (result != 0) goto out; len = (*entry)->template_data[i].len; (*entry)->template_data_len += sizeof(len); (*entry)->template_data_len += len; } return 0; out: ima_free_template_entry(*entry); *entry = NULL; return result; } /* * ima_store_template - store ima template measurements * * Calculate the hash of a template entry, add the template entry * to an ordered list of measurement entries maintained inside the kernel, * and also update the aggregate integrity value (maintained inside the * configured TPM PCR) over the hashes of the current list of measurement * entries. * * Applications retrieve the current kernel-held measurement list through * the securityfs entries in /sys/kernel/security/ima. The signed aggregate * TPM PCR (called quote) can be retrieved using a TPM user space library * and is used to validate the measurement list. * * Returns 0 on success, error code otherwise */ int ima_store_template(struct ima_template_entry *entry, int violation, struct inode *inode, const unsigned char *filename, int pcr) { static const char op[] = "add_template_measure"; static const char audit_cause[] = "hashing_error"; char *template_name = entry->template_desc->name; int result; if (!violation) { result = ima_calc_field_array_hash(&entry->template_data[0], entry); if (result < 0) { integrity_audit_msg(AUDIT_INTEGRITY_PCR, inode, template_name, op, audit_cause, result, 0); return result; } } entry->pcr = pcr; result = ima_add_template_entry(entry, violation, op, inode, filename); return result; } /* * ima_add_violation - add violation to measurement list. * * Violations are flagged in the measurement list with zero hash values. * By extending the PCR with 0xFF's instead of with zeroes, the PCR * value is invalidated. 
*/ void ima_add_violation(struct file *file, const unsigned char *filename, struct ima_iint_cache *iint, const char *op, const char *cause) { struct ima_template_entry *entry; struct inode *inode = file_inode(file); struct ima_event_data event_data = { .iint = iint, .file = file, .filename = filename, .violation = cause }; int violation = 1; int result; /* can overflow, only indicator */ atomic_long_inc(&ima_htable.violations); result = ima_alloc_init_template(&event_data, &entry, NULL); if (result < 0) { result = -ENOMEM; goto err_out; } result = ima_store_template(entry, violation, inode, filename, CONFIG_IMA_MEASURE_PCR_IDX); if (result < 0) ima_free_template_entry(entry); err_out: integrity_audit_msg(AUDIT_INTEGRITY_PCR, inode, filename, op, cause, result, 0); } /** * ima_get_action - appraise & measure decision based on policy. * @idmap: idmap of the mount the inode was found from * @inode: pointer to the inode associated with the object being validated * @cred: pointer to credentials structure to validate * @prop: properties of the task being validated * @mask: contains the permission mask (MAY_READ, MAY_WRITE, MAY_EXEC, * MAY_APPEND) * @func: caller identifier * @pcr: pointer filled in if matched measure policy sets pcr= * @template_desc: pointer filled in if matched measure policy sets template= * @func_data: func specific data, may be NULL * @allowed_algos: allowlist of hash algorithms for the IMA xattr * * The policy is defined in terms of keypairs: * subj=, obj=, type=, func=, mask=, fsmagic= * subj,obj, and type: are LSM specific. * func: FILE_CHECK | BPRM_CHECK | CREDS_CHECK | MMAP_CHECK | MODULE_CHECK * | KEXEC_CMDLINE | KEY_CHECK | CRITICAL_DATA | SETXATTR_CHECK * | MMAP_CHECK_REQPROT * mask: contains the permission mask * fsmagic: hex value * * Returns IMA_MEASURE, IMA_APPRAISE mask. * */ int ima_get_action(struct mnt_idmap *idmap, struct inode *inode, const struct cred *cred, struct lsm_prop *prop, int mask, enum ima_hooks func, int *pcr, struct ima_template_desc **template_desc, const char *func_data, unsigned int *allowed_algos) { int flags = IMA_MEASURE | IMA_AUDIT | IMA_APPRAISE | IMA_HASH; flags &= ima_policy_flag; return ima_match_policy(idmap, inode, cred, prop, func, mask, flags, pcr, template_desc, func_data, allowed_algos); } static bool ima_get_verity_digest(struct ima_iint_cache *iint, struct inode *inode, struct ima_max_digest_data *hash) { enum hash_algo alg; int digest_len; /* * On failure, 'measure' policy rules will result in a file data * hash containing 0's. */ digest_len = fsverity_get_digest(inode, hash->digest, NULL, &alg); if (digest_len == 0) return false; /* * Unlike in the case of actually calculating the file hash, in * the fsverity case regardless of the hash algorithm, return * the verity digest to be included in the measurement list. A * mismatch between the verity algorithm and the xattr signature * algorithm, if one exists, will be detected later. */ hash->hdr.algo = alg; hash->hdr.length = digest_len; return true; } /* * ima_collect_measurement - collect file measurement * * Calculate the file hash, if it doesn't already exist, * storing the measurement and i_version in the iint. * * Must be called with iint->mutex held. 
* * Return 0 on success, error code otherwise */ int ima_collect_measurement(struct ima_iint_cache *iint, struct file *file, void *buf, loff_t size, enum hash_algo algo, struct modsig *modsig) { const char *audit_cause = "failed"; struct inode *inode = file_inode(file); struct inode *real_inode = d_real_inode(file_dentry(file)); struct ima_max_digest_data hash; struct ima_digest_data *hash_hdr = container_of(&hash.hdr, struct ima_digest_data, hdr); struct name_snapshot filename; struct kstat stat; int result = 0; int length; void *tmpbuf; u64 i_version = 0; /* * Always collect the modsig, because IMA might have already collected * the file digest without collecting the modsig in a previous * measurement rule. */ if (modsig) ima_collect_modsig(modsig, buf, size); if (iint->flags & IMA_COLLECTED) goto out; /* * Detecting file change is based on i_version. On filesystems * which do not support i_version, support was originally limited * to an initial measurement/appraisal/audit, but was modified to * assume the file changed. */ result = vfs_getattr_nosec(&file->f_path, &stat, STATX_CHANGE_COOKIE, AT_STATX_SYNC_AS_STAT); if (!result && (stat.result_mask & STATX_CHANGE_COOKIE)) i_version = stat.change_cookie; hash.hdr.algo = algo; hash.hdr.length = hash_digest_size[algo]; /* Initialize hash digest to 0's in case of failure */ memset(&hash.digest, 0, sizeof(hash.digest)); if (iint->flags & IMA_VERITY_REQUIRED) { if (!ima_get_verity_digest(iint, inode, &hash)) { audit_cause = "no-verity-digest"; result = -ENODATA; } } else if (buf) { result = ima_calc_buffer_hash(buf, size, hash_hdr); } else { result = ima_calc_file_hash(file, hash_hdr); } if (result && result != -EBADF && result != -EINVAL) goto out; length = sizeof(hash.hdr) + hash.hdr.length; tmpbuf = krealloc(iint->ima_hash, length, GFP_NOFS); if (!tmpbuf) { result = -ENOMEM; goto out; } iint->ima_hash = tmpbuf; memcpy(iint->ima_hash, &hash, length); if (real_inode == inode) iint->real_inode.version = i_version; else integrity_inode_attrs_store(&iint->real_inode, i_version, real_inode); /* Possibly temporary failure due to type of read (eg. O_DIRECT) */ if (!result) iint->flags |= IMA_COLLECTED; out: if (result) { if (file->f_flags & O_DIRECT) audit_cause = "failed(directio)"; take_dentry_name_snapshot(&filename, file->f_path.dentry); integrity_audit_msg(AUDIT_INTEGRITY_DATA, inode, filename.name.name, "collect_data", audit_cause, result, 0); release_dentry_name_snapshot(&filename); } return result; } /* * ima_store_measurement - store file measurement * * Create an "ima" template and then store the template by calling * ima_store_template. * * We only get here if the inode has not already been measured, * but the measurement could already exist: * - multiple copies of the same file on either the same or * different filesystems. * - the inode was previously flushed as well as the iint info, * containing the hashing info. * * Must be called with iint->mutex held. 
*/ void ima_store_measurement(struct ima_iint_cache *iint, struct file *file, const unsigned char *filename, struct evm_ima_xattr_data *xattr_value, int xattr_len, const struct modsig *modsig, int pcr, struct ima_template_desc *template_desc) { static const char op[] = "add_template_measure"; static const char audit_cause[] = "ENOMEM"; int result = -ENOMEM; struct inode *inode = file_inode(file); struct ima_template_entry *entry; struct ima_event_data event_data = { .iint = iint, .file = file, .filename = filename, .xattr_value = xattr_value, .xattr_len = xattr_len, .modsig = modsig }; int violation = 0; /* * We still need to store the measurement in the case of MODSIG because * we only have its contents to put in the list at the time of * appraisal, but a file measurement from earlier might already exist in * the measurement list. */ if (iint->measured_pcrs & (0x1 << pcr) && !modsig) return; result = ima_alloc_init_template(&event_data, &entry, template_desc); if (result < 0) { integrity_audit_msg(AUDIT_INTEGRITY_PCR, inode, filename, op, audit_cause, result, 0); return; } result = ima_store_template(entry, violation, inode, filename, pcr); if ((!result || result == -EEXIST) && !(file->f_flags & O_DIRECT)) { iint->flags |= IMA_MEASURED; iint->measured_pcrs |= (0x1 << pcr); } if (result < 0) ima_free_template_entry(entry); } void ima_audit_measurement(struct ima_iint_cache *iint, const unsigned char *filename) { struct audit_buffer *ab; char *hash; const char *algo_name = hash_algo_name[iint->ima_hash->algo]; int i; if (iint->flags & IMA_AUDITED) return; hash = kzalloc((iint->ima_hash->length * 2) + 1, GFP_KERNEL); if (!hash) return; for (i = 0; i < iint->ima_hash->length; i++) hex_byte_pack(hash + (i * 2), iint->ima_hash->digest[i]); hash[i * 2] = '\0'; ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_INTEGRITY_RULE); if (!ab) goto out; audit_log_format(ab, "file="); audit_log_untrustedstring(ab, filename); audit_log_format(ab, " hash=\"%s:%s\"", algo_name, hash); audit_log_task_info(ab); audit_log_end(ab); iint->flags |= IMA_AUDITED; out: kfree(hash); return; } /* * ima_d_path - return a pointer to the full pathname * * Attempt to return a pointer to the full pathname for use in the * IMA measurement list, IMA audit records, and auditing logs. * * On failure, return a pointer to a copy of the filename, not dname. * Returning a pointer to dname, could result in using the pointer * after the memory has been freed. */ const char *ima_d_path(const struct path *path, char **pathbuf, char *namebuf) { struct name_snapshot filename; char *pathname = NULL; *pathbuf = __getname(); if (*pathbuf) { pathname = d_absolute_path(path, *pathbuf, PATH_MAX); if (IS_ERR(pathname)) { __putname(*pathbuf); *pathbuf = NULL; pathname = NULL; } } if (!pathname) { take_dentry_name_snapshot(&filename, path->dentry); strscpy(namebuf, filename.name.name, NAME_MAX); release_dentry_name_snapshot(&filename); pathname = namebuf; } return pathname; }
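ima_audit_measurement() above emits the file digest as lowercase hex into a buffer of (length * 2 + 1) bytes, one hex_byte_pack() call per digest byte, before attaching it to the audit record. The userspace sketch below mirrors that encoding; the eight-byte digest and the "sha256" label are illustrative only.

/*
 * Hedged userspace sketch: hex-encode a digest the way the audit path does,
 * two characters per byte plus a terminating NUL.
 */
#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

static void hex_pack(char *dst, const uint8_t *digest, size_t len)
{
	static const char hex[] = "0123456789abcdef";
	size_t i;

	for (i = 0; i < len; i++) {
		dst[i * 2]     = hex[digest[i] >> 4];	/* high nibble */
		dst[i * 2 + 1] = hex[digest[i] & 0x0f];	/* low nibble */
	}
	dst[len * 2] = '\0';
}

int main(void)
{
	const uint8_t digest[8] = { 0xde, 0xad, 0xbe, 0xef, 0x00, 0x11, 0x22, 0x33 };
	char out[sizeof(digest) * 2 + 1];

	hex_pack(out, digest, sizeof(digest));
	printf("hash=\"sha256:%s\" (illustrative only)\n", out);
	return 0;
}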
// SPDX-License-Identifier: GPL-2.0 /* * Tty buffer allocation management */ #include <linux/types.h> #include <linux/errno.h> #include <linux/minmax.h> #include <linux/tty.h> #include <linux/tty_buffer.h> #include <linux/tty_driver.h> #include <linux/tty_flip.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/slab.h> #include <linux/sched.h> #include <linux/wait.h> #include <linux/bitops.h> #include <linux/delay.h> #include <linux/module.h> #include <linux/ratelimit.h> #include "tty.h" #define MIN_TTYB_SIZE 256 #define TTYB_ALIGN_MASK 0xff /* * Byte threshold to limit memory consumption for flip buffers. * The actual memory limit is > 2x this amount. */ #define TTYB_DEFAULT_MEM_LIMIT (640 * 1024UL) /* * We default to dicing tty buffer allocations to this many characters * in order to avoid multiple page allocations. We know the size of * tty_buffer itself but it must also be taken into account that the * buffer is 256 byte aligned. See tty_buffer_find for the allocation * logic this must match.
*/ #define TTY_BUFFER_PAGE (((PAGE_SIZE - sizeof(struct tty_buffer)) / 2) & ~TTYB_ALIGN_MASK) /** * tty_buffer_lock_exclusive - gain exclusive access to buffer * @port: tty port owning the flip buffer * * Guarantees safe use of the &tty_ldisc_ops.receive_buf() method by excluding * the buffer work and any pending flush from using the flip buffer. Data can * continue to be added concurrently to the flip buffer from the driver side. * * See also tty_buffer_unlock_exclusive(). */ void tty_buffer_lock_exclusive(struct tty_port *port) { struct tty_bufhead *buf = &port->buf; atomic_inc(&buf->priority); mutex_lock(&buf->lock); } EXPORT_SYMBOL_GPL(tty_buffer_lock_exclusive); /** * tty_buffer_unlock_exclusive - release exclusive access * @port: tty port owning the flip buffer * * The buffer work is restarted if there is data in the flip buffer. * * See also tty_buffer_lock_exclusive(). */ void tty_buffer_unlock_exclusive(struct tty_port *port) { struct tty_bufhead *buf = &port->buf; bool restart = buf->head->commit != buf->head->read; atomic_dec(&buf->priority); mutex_unlock(&buf->lock); if (restart) queue_work(system_unbound_wq, &buf->work); } EXPORT_SYMBOL_GPL(tty_buffer_unlock_exclusive); /** * tty_buffer_space_avail - return unused buffer space * @port: tty port owning the flip buffer * * Returns: the # of bytes which can be written by the driver without reaching * the buffer limit. * * Note: this does not guarantee that memory is available to write the returned * # of bytes (use tty_prepare_flip_string() to pre-allocate if memory * guarantee is required). */ unsigned int tty_buffer_space_avail(struct tty_port *port) { int space = port->buf.mem_limit - atomic_read(&port->buf.mem_used); return max(space, 0); } EXPORT_SYMBOL_GPL(tty_buffer_space_avail); static void tty_buffer_reset(struct tty_buffer *p, size_t size) { p->used = 0; p->size = size; p->next = NULL; p->commit = 0; p->lookahead = 0; p->read = 0; p->flags = true; } /** * tty_buffer_free_all - free buffers used by a tty * @port: tty port to free from * * Remove all the buffers pending on a tty whether queued with data or in the * free ring. Must be called when the tty is no longer in use. */ void tty_buffer_free_all(struct tty_port *port) { struct tty_bufhead *buf = &port->buf; struct tty_buffer *p, *next; struct llist_node *llist; unsigned int freed = 0; int still_used; while ((p = buf->head) != NULL) { buf->head = p->next; freed += p->size; if (p->size > 0) kfree(p); } llist = llist_del_all(&buf->free); llist_for_each_entry_safe(p, next, llist, free) kfree(p); tty_buffer_reset(&buf->sentinel, 0); buf->head = &buf->sentinel; buf->tail = &buf->sentinel; still_used = atomic_xchg(&buf->mem_used, 0); WARN(still_used != freed, "we still have not freed %d bytes!", still_used - freed); } /** * tty_buffer_alloc - allocate a tty buffer * @port: tty port * @size: desired size (characters) * * Allocate a new tty buffer to hold the desired number of characters. We * round our buffers off in 256 character chunks to get better allocation * behaviour. * * Returns: %NULL if out of memory or the allocation would exceed the per * device queue. 
*/ static struct tty_buffer *tty_buffer_alloc(struct tty_port *port, size_t size) { struct llist_node *free; struct tty_buffer *p; /* Round the buffer size out */ size = __ALIGN_MASK(size, TTYB_ALIGN_MASK); if (size <= MIN_TTYB_SIZE) { free = llist_del_first(&port->buf.free); if (free) { p = llist_entry(free, struct tty_buffer, free); goto found; } } /* Should possibly check if this fails for the largest buffer we * have queued and recycle that ? */ if (atomic_read(&port->buf.mem_used) > port->buf.mem_limit) return NULL; p = kmalloc(struct_size(p, data, 2 * size), GFP_ATOMIC | __GFP_NOWARN); if (p == NULL) return NULL; found: tty_buffer_reset(p, size); atomic_add(size, &port->buf.mem_used); return p; } /** * tty_buffer_free - free a tty buffer * @port: tty port owning the buffer * @b: the buffer to free * * Free a tty buffer, or add it to the free list according to our internal * strategy. */ static void tty_buffer_free(struct tty_port *port, struct tty_buffer *b) { struct tty_bufhead *buf = &port->buf; /* Dumb strategy for now - should keep some stats */ WARN_ON(atomic_sub_return(b->size, &buf->mem_used) < 0); if (b->size > MIN_TTYB_SIZE) kfree(b); else if (b->size > 0) llist_add(&b->free, &buf->free); } /** * tty_buffer_flush - flush full tty buffers * @tty: tty to flush * @ld: optional ldisc ptr (must be referenced) * * Flush all the buffers containing receive data. If @ld != %NULL, flush the * ldisc input buffer. * * Locking: takes buffer lock to ensure single-threaded flip buffer 'consumer'. */ void tty_buffer_flush(struct tty_struct *tty, struct tty_ldisc *ld) { struct tty_port *port = tty->port; struct tty_bufhead *buf = &port->buf; struct tty_buffer *next; atomic_inc(&buf->priority); mutex_lock(&buf->lock); /* paired w/ release in __tty_buffer_request_room; ensures there are * no pending memory accesses to the freed buffer */ while ((next = smp_load_acquire(&buf->head->next)) != NULL) { tty_buffer_free(port, buf->head); buf->head = next; } buf->head->read = buf->head->commit; buf->head->lookahead = buf->head->read; if (ld && ld->ops->flush_buffer) ld->ops->flush_buffer(tty); atomic_dec(&buf->priority); mutex_unlock(&buf->lock); } /** * __tty_buffer_request_room - grow tty buffer if needed * @port: tty port * @size: size desired * @flags: buffer has to store flags along character data * * Make at least @size bytes of linear space available for the tty buffer. * * Will change over to a new buffer if the current buffer is encoded as * %TTY_NORMAL (so has no flags buffer) and the new buffer requires a flags * buffer. * * Returns: the size we managed to find. */ static int __tty_buffer_request_room(struct tty_port *port, size_t size, bool flags) { struct tty_bufhead *buf = &port->buf; struct tty_buffer *n, *b = buf->tail; size_t left = (b->flags ? 1 : 2) * b->size - b->used; bool change = !b->flags && flags; if (!change && left >= size) return size; /* This is the slow path - looking for new buffers to use */ n = tty_buffer_alloc(port, size); if (n == NULL) return change ? 0 : left; n->flags = flags; buf->tail = n; /* * Paired w/ acquire in flush_to_ldisc() and lookahead_bufs() * ensures they see all buffer data. */ smp_store_release(&b->commit, b->used); /* * Paired w/ acquire in flush_to_ldisc() and lookahead_bufs() * ensures the latest commit value can be read before the head * is advanced to the next buffer. 
*/ smp_store_release(&b->next, n); return size; } int tty_buffer_request_room(struct tty_port *port, size_t size) { return __tty_buffer_request_room(port, size, true); } EXPORT_SYMBOL_GPL(tty_buffer_request_room); size_t __tty_insert_flip_string_flags(struct tty_port *port, const u8 *chars, const u8 *flags, bool mutable_flags, size_t size) { bool need_flags = mutable_flags || flags[0] != TTY_NORMAL; size_t copied = 0; do { size_t goal = min_t(size_t, size - copied, TTY_BUFFER_PAGE); size_t space = __tty_buffer_request_room(port, goal, need_flags); struct tty_buffer *tb = port->buf.tail; if (unlikely(space == 0)) break; memcpy(char_buf_ptr(tb, tb->used), chars, space); if (mutable_flags) { memcpy(flag_buf_ptr(tb, tb->used), flags, space); flags += space; } else if (tb->flags) { memset(flag_buf_ptr(tb, tb->used), flags[0], space); } else { /* tb->flags should be available once requested */ WARN_ON_ONCE(need_flags); } tb->used += space; copied += space; chars += space; /* There is a small chance that we need to split the data over * several buffers. If this is the case we must loop. */ } while (unlikely(size > copied)); return copied; } EXPORT_SYMBOL(__tty_insert_flip_string_flags); /** * tty_prepare_flip_string - make room for characters * @port: tty port * @chars: return pointer for character write area * @size: desired size * * Prepare a block of space in the buffer for data. * * This is used for drivers that need their own block copy routines into the * buffer. There is no guarantee the buffer is a DMA target! * * Returns: the length available and buffer pointer (@chars) to the space which * is now allocated and accounted for as ready for normal characters. */ size_t tty_prepare_flip_string(struct tty_port *port, u8 **chars, size_t size) { size_t space = __tty_buffer_request_room(port, size, false); if (likely(space)) { struct tty_buffer *tb = port->buf.tail; *chars = char_buf_ptr(tb, tb->used); if (tb->flags) memset(flag_buf_ptr(tb, tb->used), TTY_NORMAL, space); tb->used += space; } return space; } EXPORT_SYMBOL_GPL(tty_prepare_flip_string); /** * tty_ldisc_receive_buf - forward data to line discipline * @ld: line discipline to process input * @p: char buffer * @f: %TTY_NORMAL, %TTY_BREAK, etc. flags buffer * @count: number of bytes to process * * Callers other than flush_to_ldisc() need to exclude the kworker from * concurrent use of the line discipline, see paste_selection(). * * Returns: the number of bytes processed. */ size_t tty_ldisc_receive_buf(struct tty_ldisc *ld, const u8 *p, const u8 *f, size_t count) { if (ld->ops->receive_buf2) count = ld->ops->receive_buf2(ld->tty, p, f, count); else { count = min_t(size_t, count, ld->tty->receive_room); if (count && ld->ops->receive_buf) ld->ops->receive_buf(ld->tty, p, f, count); } return count; } EXPORT_SYMBOL_GPL(tty_ldisc_receive_buf); static void lookahead_bufs(struct tty_port *port, struct tty_buffer *head) { head->lookahead = max(head->lookahead, head->read); while (head) { struct tty_buffer *next; unsigned int count; /* * Paired w/ release in __tty_buffer_request_room(); * ensures commit value read is not stale if the head * is advancing to the next buffer. */ next = smp_load_acquire(&head->next); /* * Paired w/ release in __tty_buffer_request_room() or in * tty_buffer_flush(); ensures we see the committed buffer data. 
*/ count = smp_load_acquire(&head->commit) - head->lookahead; if (!count) { head = next; continue; } if (port->client_ops->lookahead_buf) { u8 *p, *f = NULL; p = char_buf_ptr(head, head->lookahead); if (head->flags) f = flag_buf_ptr(head, head->lookahead); port->client_ops->lookahead_buf(port, p, f, count); } head->lookahead += count; } } static size_t receive_buf(struct tty_port *port, struct tty_buffer *head, size_t count) { u8 *p = char_buf_ptr(head, head->read); const u8 *f = NULL; size_t n; if (head->flags) f = flag_buf_ptr(head, head->read); n = port->client_ops->receive_buf(port, p, f, count); if (n > 0) memset(p, 0, n); return n; } /** * flush_to_ldisc - flush data from buffer to ldisc * @work: tty structure passed from work queue. * * This routine is called out of the software interrupt to flush data from the * buffer chain to the line discipline. * * The receive_buf() method is single threaded for each tty instance. * * Locking: takes buffer lock to ensure single-threaded flip buffer 'consumer'. */ static void flush_to_ldisc(struct work_struct *work) { struct tty_port *port = container_of(work, struct tty_port, buf.work); struct tty_bufhead *buf = &port->buf; mutex_lock(&buf->lock); while (1) { struct tty_buffer *head = buf->head; struct tty_buffer *next; size_t count, rcvd; /* Ldisc or user is trying to gain exclusive access */ if (atomic_read(&buf->priority)) break; /* paired w/ release in __tty_buffer_request_room(); * ensures commit value read is not stale if the head * is advancing to the next buffer */ next = smp_load_acquire(&head->next); /* paired w/ release in __tty_buffer_request_room() or in * tty_buffer_flush(); ensures we see the committed buffer data */ count = smp_load_acquire(&head->commit) - head->read; if (!count) { if (next == NULL) break; buf->head = next; tty_buffer_free(port, head); continue; } rcvd = receive_buf(port, head, count); head->read += rcvd; if (rcvd < count) lookahead_bufs(port, head); if (!rcvd) break; if (need_resched()) cond_resched(); } mutex_unlock(&buf->lock); } static inline void tty_flip_buffer_commit(struct tty_buffer *tail) { /* * Paired w/ acquire in flush_to_ldisc(); ensures flush_to_ldisc() sees * buffer data. */ smp_store_release(&tail->commit, tail->used); } /** * tty_flip_buffer_push - push terminal buffers * @port: tty port to push * * Queue a push of the terminal flip buffers to the line discipline. Can be * called from IRQ/atomic context. * * In the event of the queue being busy for flipping the work will be held off * and retried later. */ void tty_flip_buffer_push(struct tty_port *port) { struct tty_bufhead *buf = &port->buf; tty_flip_buffer_commit(buf->tail); queue_work(system_unbound_wq, &buf->work); } EXPORT_SYMBOL(tty_flip_buffer_push); /** * tty_insert_flip_string_and_push_buffer - add characters to the tty buffer and * push * @port: tty port * @chars: characters * @size: size * * The function combines tty_insert_flip_string() and tty_flip_buffer_push() * with the exception of properly holding the @port->lock. * * To be used only internally (by pty currently). * * Returns: the number added. 
*/ int tty_insert_flip_string_and_push_buffer(struct tty_port *port, const u8 *chars, size_t size) { struct tty_bufhead *buf = &port->buf; unsigned long flags; spin_lock_irqsave(&port->lock, flags); size = tty_insert_flip_string(port, chars, size); if (size) tty_flip_buffer_commit(buf->tail); spin_unlock_irqrestore(&port->lock, flags); queue_work(system_unbound_wq, &buf->work); return size; } /** * tty_buffer_init - prepare a tty buffer structure * @port: tty port to initialise * * Set up the initial state of the buffer management for a tty device. Must be * called before the other tty buffer functions are used. */ void tty_buffer_init(struct tty_port *port) { struct tty_bufhead *buf = &port->buf; mutex_init(&buf->lock); tty_buffer_reset(&buf->sentinel, 0); buf->head = &buf->sentinel; buf->tail = &buf->sentinel; init_llist_head(&buf->free); atomic_set(&buf->mem_used, 0); atomic_set(&buf->priority, 0); INIT_WORK(&buf->work, flush_to_ldisc); buf->mem_limit = TTYB_DEFAULT_MEM_LIMIT; } /** * tty_buffer_set_limit - change the tty buffer memory limit * @port: tty port to change * @limit: memory limit to set * * Change the tty buffer memory limit. * * Must be called before the other tty buffer functions are used. */ int tty_buffer_set_limit(struct tty_port *port, int limit) { if (limit < MIN_TTYB_SIZE) return -EINVAL; port->buf.mem_limit = limit; return 0; } EXPORT_SYMBOL_GPL(tty_buffer_set_limit); /* slave ptys can claim nested buffer lock when handling BRK and INTR */ void tty_buffer_set_lock_subclass(struct tty_port *port) { lockdep_set_subclass(&port->buf.lock, TTY_LOCK_SLAVE); } bool tty_buffer_restart_work(struct tty_port *port) { return queue_work(system_unbound_wq, &port->buf.work); } bool tty_buffer_cancel_work(struct tty_port *port) { return cancel_work_sync(&port->buf.work); } void tty_buffer_flush_work(struct tty_port *port) { flush_work(&port->buf.work); }
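/*
 * Illustrative sketch (not part of the original file): how a driver's RX
 * interrupt handler typically feeds the flip-buffer machinery above. The
 * device accessors my_uart_rx_ready(), my_uart_read_char(),
 * my_uart_read_status() and MY_UART_PARITY_ERR are hypothetical names used
 * only for this example; tty_insert_flip_char() and TTY_NORMAL/TTY_PARITY
 * come from <linux/tty_flip.h> and <linux/tty.h>, irqreturn_t from
 * <linux/interrupt.h>. The handler only queues data and commits it with
 * tty_flip_buffer_push(); the hand-off to the line discipline happens later
 * in flush_to_ldisc() on the workqueue, serialized by buf->lock.
 */
#if 0	/* example only */
static irqreturn_t my_uart_rx_irq(int irq, void *dev_id)
{
	struct tty_port *port = dev_id;

	while (my_uart_rx_ready()) {
		u8 ch = my_uart_read_char();
		u8 flag = TTY_NORMAL;

		if (my_uart_read_status() & MY_UART_PARITY_ERR)
			flag = TTY_PARITY;

		/* Queue one character plus its flag in the tail buffer. */
		if (!tty_insert_flip_char(port, ch, flag))
			break;	/* memory limit reached, drop the rest */
	}

	/* Commit tail->used and schedule flush_to_ldisc(). */
	tty_flip_buffer_push(port);

	return IRQ_HANDLED;
}
#endif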
// SPDX-License-Identifier: GPL-2.0-or-later
/* SCTP kernel implementation
 * (C) Copyright IBM Corp. 2003, 2004
 *
 * This file is part of the SCTP kernel implementation
 *
 * This file contains the code relating the chunk abstraction.
 *
 * Please send any bug reports or fixes you make to the
 * email address(es):
 *    lksctp developers <linux-sctp@vger.kernel.org>
 *
 * Written or modified by:
 *    Jon Grimm             <jgrimm@us.ibm.com>
 *    Sridhar Samudrala     <sri@us.ibm.com>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/net.h>
#include <linux/inet.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <net/sock.h>
#include <net/sctp/sctp.h>
#include <net/sctp/sm.h>

/* This file is mostly in anticipation of future work, but initially
 * populate with fragment tracking for an outbound message.
 */

/* Initialize datamsg from memory. */
static void sctp_datamsg_init(struct sctp_datamsg *msg)
{
	refcount_set(&msg->refcnt, 1);
	msg->send_failed = 0;
	msg->send_error = 0;
	msg->can_delay = 1;
	msg->abandoned = 0;
	msg->expires_at = 0;
	INIT_LIST_HEAD(&msg->chunks);
}

/* Allocate and initialize datamsg. */
static struct sctp_datamsg *sctp_datamsg_new(gfp_t gfp)
{
	struct sctp_datamsg *msg;

	msg = kmalloc(sizeof(struct sctp_datamsg), gfp);
	if (msg) {
		sctp_datamsg_init(msg);
		SCTP_DBG_OBJCNT_INC(datamsg);
	}
	return msg;
}

void sctp_datamsg_free(struct sctp_datamsg *msg)
{
	struct sctp_chunk *chunk;

	/* This doesn't have to be a _safe variant because
	 * sctp_chunk_free() only drops the refs.
	 */
	list_for_each_entry(chunk, &msg->chunks, frag_list)
		sctp_chunk_free(chunk);

	sctp_datamsg_put(msg);
}

/* Final destruction of datamsg memory. */
static void sctp_datamsg_destroy(struct sctp_datamsg *msg)
{
	struct sctp_association *asoc = NULL;
	struct list_head *pos, *temp;
	struct sctp_chunk *chunk;
	struct sctp_ulpevent *ev;
	int error, sent;

	/* Release all references.
*/ list_for_each_safe(pos, temp, &msg->chunks) { list_del_init(pos); chunk = list_entry(pos, struct sctp_chunk, frag_list); if (!msg->send_failed) { sctp_chunk_put(chunk); continue; } asoc = chunk->asoc; error = msg->send_error ?: asoc->outqueue.error; sent = chunk->has_tsn ? SCTP_DATA_SENT : SCTP_DATA_UNSENT; if (sctp_ulpevent_type_enabled(asoc->subscribe, SCTP_SEND_FAILED)) { ev = sctp_ulpevent_make_send_failed(asoc, chunk, sent, error, GFP_ATOMIC); if (ev) asoc->stream.si->enqueue_event(&asoc->ulpq, ev); } if (sctp_ulpevent_type_enabled(asoc->subscribe, SCTP_SEND_FAILED_EVENT)) { ev = sctp_ulpevent_make_send_failed_event(asoc, chunk, sent, error, GFP_ATOMIC); if (ev) asoc->stream.si->enqueue_event(&asoc->ulpq, ev); } sctp_chunk_put(chunk); } SCTP_DBG_OBJCNT_DEC(datamsg); kfree(msg); } /* Hold a reference. */ static void sctp_datamsg_hold(struct sctp_datamsg *msg) { refcount_inc(&msg->refcnt); } /* Release a reference. */ void sctp_datamsg_put(struct sctp_datamsg *msg) { if (refcount_dec_and_test(&msg->refcnt)) sctp_datamsg_destroy(msg); } /* Assign a chunk to this datamsg. */ static void sctp_datamsg_assign(struct sctp_datamsg *msg, struct sctp_chunk *chunk) { sctp_datamsg_hold(msg); chunk->msg = msg; } /* A data chunk can have a maximum payload of (2^16 - 20). Break * down any such message into smaller chunks. Opportunistically, fragment * the chunks down to the current MTU constraints. We may get refragmented * later if the PMTU changes, but it is _much better_ to fragment immediately * with a reasonable guess than always doing our fragmentation on the * soft-interrupt. */ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, struct sctp_sndrcvinfo *sinfo, struct iov_iter *from) { size_t len, first_len, max_data, remaining; size_t msg_len = iov_iter_count(from); struct sctp_shared_key *shkey = NULL; struct list_head *pos, *temp; struct sctp_chunk *chunk; struct sctp_datamsg *msg; int err; msg = sctp_datamsg_new(GFP_KERNEL); if (!msg) return ERR_PTR(-ENOMEM); /* Note: Calculate this outside of the loop, so that all fragments * have the same expiration. */ if (asoc->peer.prsctp_capable && sinfo->sinfo_timetolive && (SCTP_PR_TTL_ENABLED(sinfo->sinfo_flags) || !SCTP_PR_POLICY(sinfo->sinfo_flags))) msg->expires_at = jiffies + msecs_to_jiffies(sinfo->sinfo_timetolive); /* This is the biggest possible DATA chunk that can fit into * the packet */ max_data = asoc->frag_point; if (unlikely(!max_data)) { max_data = sctp_min_frag_point(sctp_sk(asoc->base.sk), sctp_datachk_len(&asoc->stream)); pr_warn_ratelimited("%s: asoc:%p frag_point is zero, forcing max_data to default minimum (%zu)", __func__, asoc, max_data); } /* If the peer requested that we authenticate DATA chunks * we need to account for bundling of the AUTH chunks along with * DATA. */ if (sctp_auth_send_cid(SCTP_CID_DATA, asoc)) { struct sctp_hmac *hmac_desc = sctp_auth_asoc_get_hmac(asoc); if (hmac_desc) max_data -= SCTP_PAD4(sizeof(struct sctp_auth_chunk) + hmac_desc->hmac_len); if (sinfo->sinfo_tsn && sinfo->sinfo_ssn != asoc->active_key_id) { shkey = sctp_auth_get_shkey(asoc, sinfo->sinfo_ssn); if (!shkey) { err = -EINVAL; goto errout; } } else { shkey = asoc->shkey; } } /* Set first_len and then account for possible bundles on first frag */ first_len = max_data; /* Check to see if we have a pending SACK and try to let it be bundled * with this message. Do this if we don't have any data queued already. * To check that, look at out_qlen and retransmit list. 
* NOTE: we will not reduce to account for SACK, if the message would * not have been fragmented. */ if (timer_pending(&asoc->timers[SCTP_EVENT_TIMEOUT_SACK]) && asoc->outqueue.out_qlen == 0 && list_empty(&asoc->outqueue.retransmit) && msg_len > max_data) first_len -= SCTP_PAD4(sizeof(struct sctp_sack_chunk)); /* Encourage Cookie-ECHO bundling. */ if (asoc->state < SCTP_STATE_COOKIE_ECHOED) first_len -= SCTP_ARBITRARY_COOKIE_ECHO_LEN; /* Account for a different sized first fragment */ if (msg_len >= first_len) { msg->can_delay = 0; if (msg_len > first_len) SCTP_INC_STATS(asoc->base.net, SCTP_MIB_FRAGUSRMSGS); } else { /* Which may be the only one... */ first_len = msg_len; } /* Create chunks for all DATA chunks. */ for (remaining = msg_len; remaining; remaining -= len) { u8 frag = SCTP_DATA_MIDDLE_FRAG; if (remaining == msg_len) { /* First frag, which may also be the last */ frag |= SCTP_DATA_FIRST_FRAG; len = first_len; } else { /* Middle frags */ len = max_data; } if (len >= remaining) { /* Last frag, which may also be the first */ len = remaining; frag |= SCTP_DATA_LAST_FRAG; /* The application requests to set the I-bit of the * last DATA chunk of a user message when providing * the user message to the SCTP implementation. */ if ((sinfo->sinfo_flags & SCTP_EOF) || (sinfo->sinfo_flags & SCTP_SACK_IMMEDIATELY)) frag |= SCTP_DATA_SACK_IMM; } chunk = asoc->stream.si->make_datafrag(asoc, sinfo, len, frag, GFP_KERNEL); if (!chunk) { err = -ENOMEM; goto errout; } err = sctp_user_addto_chunk(chunk, len, from); if (err < 0) goto errout_chunk_free; chunk->shkey = shkey; /* Put the chunk->skb back into the form expected by send. */ __skb_pull(chunk->skb, (__u8 *)chunk->chunk_hdr - chunk->skb->data); sctp_datamsg_assign(msg, chunk); list_add_tail(&chunk->frag_list, &msg->chunks); } return msg; errout_chunk_free: sctp_chunk_free(chunk); errout: list_for_each_safe(pos, temp, &msg->chunks) { list_del_init(pos); chunk = list_entry(pos, struct sctp_chunk, frag_list); sctp_chunk_free(chunk); } sctp_datamsg_put(msg); return ERR_PTR(err); } /* Check whether this message has expired. */ int sctp_chunk_abandoned(struct sctp_chunk *chunk) { if (!chunk->asoc->peer.prsctp_capable) return 0; if (chunk->msg->abandoned) return 1; if (!chunk->has_tsn && !(chunk->chunk_hdr->flags & SCTP_DATA_FIRST_FRAG)) return 0; if (SCTP_PR_TTL_ENABLED(chunk->sinfo.sinfo_flags) && time_after(jiffies, chunk->msg->expires_at)) { struct sctp_stream_out *streamout = SCTP_SO(&chunk->asoc->stream, chunk->sinfo.sinfo_stream); if (chunk->sent_count) { chunk->asoc->abandoned_sent[SCTP_PR_INDEX(TTL)]++; streamout->ext->abandoned_sent[SCTP_PR_INDEX(TTL)]++; } else { chunk->asoc->abandoned_unsent[SCTP_PR_INDEX(TTL)]++; streamout->ext->abandoned_unsent[SCTP_PR_INDEX(TTL)]++; } chunk->msg->abandoned = 1; return 1; } else if (SCTP_PR_RTX_ENABLED(chunk->sinfo.sinfo_flags) && chunk->sent_count > chunk->sinfo.sinfo_timetolive) { struct sctp_stream_out *streamout = SCTP_SO(&chunk->asoc->stream, chunk->sinfo.sinfo_stream); chunk->asoc->abandoned_sent[SCTP_PR_INDEX(RTX)]++; streamout->ext->abandoned_sent[SCTP_PR_INDEX(RTX)]++; chunk->msg->abandoned = 1; return 1; } else if (!SCTP_PR_POLICY(chunk->sinfo.sinfo_flags) && chunk->msg->expires_at && time_after(jiffies, chunk->msg->expires_at)) { chunk->msg->abandoned = 1; return 1; } /* PRIO policy is processed by sendmsg, not here */ return 0; } /* This chunk (and consequently entire message) has failed in its sending. 
*/ void sctp_chunk_fail(struct sctp_chunk *chunk, int error) { chunk->msg->send_failed = 1; chunk->msg->send_error = error; }
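/*
 * Worked example (illustrative, not part of the original file) for the
 * fragmentation loop in sctp_datamsg_from_user() above. Assume
 * msg_len = 4000, max_data = 1452 and first_len = 1436 (max_data minus
 * SCTP_PAD4(sizeof(struct sctp_sack_chunk)), because a pending SACK may be
 * bundled with the first fragment); the numbers are assumptions chosen only
 * to make the arithmetic concrete. The loop then produces:
 *
 *   fragment 1: len = first_len = 1436, flags = SCTP_DATA_FIRST_FRAG
 *               (remaining: 4000 -> 2564)
 *   fragment 2: len = max_data  = 1452, flags = SCTP_DATA_MIDDLE_FRAG
 *               (remaining: 2564 -> 1112)
 *   fragment 3: len = remaining = 1112, flags = SCTP_DATA_LAST_FRAG
 *               (plus SCTP_DATA_SACK_IMM if SCTP_EOF or
 *               SCTP_SACK_IMMEDIATELY was requested)
 *
 * Because msg_len > first_len in this example, can_delay is cleared and
 * SCTP_MIB_FRAGUSRMSGS is incremented for the message.
 */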
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __X86_KERNEL_FPU_CONTEXT_H
#define __X86_KERNEL_FPU_CONTEXT_H

#include <asm/fpu/xstate.h>
#include <asm/trace/fpu.h>

/* Functions related to FPU context tracking */

/*
 * The in-register FPU state for an FPU context on a CPU is assumed to be
 * valid if the fpu->last_cpu matches the CPU, and the fpu_fpregs_owner_ctx
 * matches the FPU.
 *
 * If the FPU register state is valid, the kernel can skip restoring the
 * FPU state from memory.
 *
 * Any code that clobbers the FPU registers or updates the in-memory
 * FPU state for a task MUST let the rest of the kernel know that the
 * FPU registers are no longer valid for this task.
 *
 * Invalidate a resource you control: CPU if using the CPU for something else
 * (with preemption disabled), FPU for the current task, or a task that
 * is prevented from running by the current task.
 */
static inline void __cpu_invalidate_fpregs_state(void)
{
	__this_cpu_write(fpu_fpregs_owner_ctx, NULL);
}

static inline void __fpu_invalidate_fpregs_state(struct fpu *fpu)
{
	fpu->last_cpu = -1;
}

static inline int fpregs_state_valid(struct fpu *fpu, unsigned int cpu)
{
	return fpu == this_cpu_read(fpu_fpregs_owner_ctx) && cpu == fpu->last_cpu;
}

static inline void fpregs_deactivate(struct fpu *fpu)
{
	__this_cpu_write(fpu_fpregs_owner_ctx, NULL);
	trace_x86_fpu_regs_deactivated(fpu);
}

static inline void fpregs_activate(struct fpu *fpu)
{
	__this_cpu_write(fpu_fpregs_owner_ctx, fpu);
	trace_x86_fpu_regs_activated(fpu);
}

/* Internal helper for switch_fpu_return() and signal frame setup */
static inline void fpregs_restore_userregs(void)
{
	struct fpu *fpu = &current->thread.fpu;
	int cpu = smp_processor_id();

	if (WARN_ON_ONCE(current->flags & (PF_KTHREAD | PF_USER_WORKER)))
		return;

	if (!fpregs_state_valid(fpu, cpu)) {
		/*
		 * This restores _all_ xstate which has not been
		 * established yet.
		 *
		 * If PKRU is enabled, then the PKRU value is already
		 * correct because it was either set in switch_to() or in
		 * flush_thread(). So it is excluded because it might not
		 * be up to date in current->thread.fpu.xsave state.
		 *
		 * XFD state is handled in restore_fpregs_from_fpstate().
		 */
		restore_fpregs_from_fpstate(fpu->fpstate, XFEATURE_MASK_FPSTATE);

		fpregs_activate(fpu);
		fpu->last_cpu = cpu;
	}
	clear_thread_flag(TIF_NEED_FPU_LOAD);
}

#endif
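/*
 * Illustrative sketch (not part of the original header): the invalidation
 * rules from the comment block above, in code form. example_poke_task_fpstate(),
 * example_clobber_fpregs_on_this_cpu() and modify_task_xstate() are
 * hypothetical names for this example only. After a task's in-memory FPU
 * image is changed, __fpu_invalidate_fpregs_state() makes fpregs_state_valid()
 * fail so the registers are reloaded on the next return to userspace; code
 * that is about to clobber the registers on this CPU (with preemption
 * disabled) instead disowns them with __cpu_invalidate_fpregs_state().
 */
#if 0	/* example only */
static void example_poke_task_fpstate(struct task_struct *tsk)
{
	struct fpu *fpu = &tsk->thread.fpu;

	/* tsk must be stopped/traced so it cannot run concurrently. */
	modify_task_xstate(fpu->fpstate);	/* hypothetical in-memory update */

	/* The in-register copy (possibly on another CPU) no longer matches. */
	__fpu_invalidate_fpregs_state(fpu);
}

static void example_clobber_fpregs_on_this_cpu(void)
{
	/* Caller has preemption disabled; drop ownership before clobbering. */
	__cpu_invalidate_fpregs_state();
	/* ... use the FPU registers for kernel work ... */
}
#endif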
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * linux/drivers/char/serial_core.h
 *
 * Copyright (C) 2000 Deep Blue Solutions Ltd.
 */
#ifndef LINUX_SERIAL_CORE_H
#define LINUX_SERIAL_CORE_H

#include <linux/bitops.h>
#include <linux/compiler.h>
#include <linux/console.h>
#include <linux/interrupt.h>
#include <linux/lockdep.h>
#include <linux/printk.h>
#include <linux/spinlock.h>
#include <linux/sched.h>
#include <linux/tty.h>
#include <linux/mutex.h>
#include <linux/sysrq.h>
#include <uapi/linux/serial_core.h>

#ifdef CONFIG_SERIAL_CORE_CONSOLE
#define uart_console(port) \
	((port)->cons && (port)->cons->index == (port)->line)
#else
#define uart_console(port)	({ (void)port; 0; })
#endif

struct uart_port;
struct serial_struct;
struct serial_port_device;
struct device;
struct gpio_desc;

/**
 * struct uart_ops -- interface between serial_core and the driver
 *
 * This structure describes all the operations that can be done on the
 * physical hardware.
 *
 * @tx_empty: ``unsigned int ()(struct uart_port *port)``
 *
 *	This function tests whether the transmitter fifo and shifter for the
 *	@port is empty. If it is empty, this function should return
 *	%TIOCSER_TEMT, otherwise return 0. If the port does not support this
 *	operation, then it should return %TIOCSER_TEMT.
 *
 *	Locking: none.
 *	Interrupts: caller dependent.
 *	This call must not sleep
 *
 * @set_mctrl: ``void ()(struct uart_port *port, unsigned int mctrl)``
 *
 *	This function sets the modem control lines for @port to the state
 *	described by @mctrl. The relevant bits of @mctrl are:
 *
 *	- %TIOCM_RTS	RTS signal.
 *	- %TIOCM_DTR	DTR signal.
 *	- %TIOCM_OUT1	OUT1 signal.
 *	- %TIOCM_OUT2	OUT2 signal.
* - %TIOCM_LOOP Set the port into loopback mode. * * If the appropriate bit is set, the signal should be driven * active. If the bit is clear, the signal should be driven * inactive. * * Locking: @port->lock taken. * Interrupts: locally disabled. * This call must not sleep * * @get_mctrl: ``unsigned int ()(struct uart_port *port)`` * * Returns the current state of modem control inputs of @port. The state * of the outputs should not be returned, since the core keeps track of * their state. The state information should include: * * - %TIOCM_CAR state of DCD signal * - %TIOCM_CTS state of CTS signal * - %TIOCM_DSR state of DSR signal * - %TIOCM_RI state of RI signal * * The bit is set if the signal is currently driven active. If * the port does not support CTS, DCD or DSR, the driver should * indicate that the signal is permanently active. If RI is * not available, the signal should not be indicated as active. * * Locking: @port->lock taken. * Interrupts: locally disabled. * This call must not sleep * * @stop_tx: ``void ()(struct uart_port *port)`` * * Stop transmitting characters. This might be due to the CTS line * becoming inactive or the tty layer indicating we want to stop * transmission due to an %XOFF character. * * The driver should stop transmitting characters as soon as possible. * * Locking: @port->lock taken. * Interrupts: locally disabled. * This call must not sleep * * @start_tx: ``void ()(struct uart_port *port)`` * * Start transmitting characters. * * Locking: @port->lock taken. * Interrupts: locally disabled. * This call must not sleep * * @throttle: ``void ()(struct uart_port *port)`` * * Notify the serial driver that input buffers for the line discipline are * close to full, and it should somehow signal that no more characters * should be sent to the serial port. * This will be called only if hardware assisted flow control is enabled. * * Locking: serialized with @unthrottle() and termios modification by the * tty layer. * * @unthrottle: ``void ()(struct uart_port *port)`` * * Notify the serial driver that characters can now be sent to the serial * port without fear of overrunning the input buffers of the line * disciplines. * * This will be called only if hardware assisted flow control is enabled. * * Locking: serialized with @throttle() and termios modification by the * tty layer. * * @send_xchar: ``void ()(struct uart_port *port, char ch)`` * * Transmit a high priority character, even if the port is stopped. This * is used to implement XON/XOFF flow control and tcflow(). If the serial * driver does not implement this function, the tty core will append the * character to the circular buffer and then call start_tx() / stop_tx() * to flush the data out. * * Do not transmit if @ch == '\0' (%__DISABLED_CHAR). * * Locking: none. * Interrupts: caller dependent. * * @start_rx: ``void ()(struct uart_port *port)`` * * Start receiving characters. * * Locking: @port->lock taken. * Interrupts: locally disabled. * This call must not sleep * * @stop_rx: ``void ()(struct uart_port *port)`` * * Stop receiving characters; the @port is in the process of being closed. * * Locking: @port->lock taken. * Interrupts: locally disabled. * This call must not sleep * * @enable_ms: ``void ()(struct uart_port *port)`` * * Enable the modem status interrupts. * * This method may be called multiple times. Modem status interrupts * should be disabled when the @shutdown() method is called. * * Locking: @port->lock taken. * Interrupts: locally disabled. 
* This call must not sleep * * @break_ctl: ``void ()(struct uart_port *port, int ctl)`` * * Control the transmission of a break signal. If @ctl is nonzero, the * break signal should be transmitted. The signal should be terminated * when another call is made with a zero @ctl. * * Locking: caller holds tty_port->mutex * * @startup: ``int ()(struct uart_port *port)`` * * Grab any interrupt resources and initialise any low level driver state. * Enable the port for reception. It should not activate RTS nor DTR; * this will be done via a separate call to @set_mctrl(). * * This method will only be called when the port is initially opened. * * Locking: port_sem taken. * Interrupts: globally disabled. * * @shutdown: ``void ()(struct uart_port *port)`` * * Disable the @port, disable any break condition that may be in effect, * and free any interrupt resources. It should not disable RTS nor DTR; * this will have already been done via a separate call to @set_mctrl(). * * Drivers must not access @port->state once this call has completed. * * This method will only be called when there are no more users of this * @port. * * Locking: port_sem taken. * Interrupts: caller dependent. * * @flush_buffer: ``void ()(struct uart_port *port)`` * * Flush any write buffers, reset any DMA state and stop any ongoing DMA * transfers. * * This will be called whenever the @port->state->xmit circular buffer is * cleared. * * Locking: @port->lock taken. * Interrupts: locally disabled. * This call must not sleep * * @set_termios: ``void ()(struct uart_port *port, struct ktermios *new, * struct ktermios *old)`` * * Change the @port parameters, including word length, parity, stop bits. * Update @port->read_status_mask and @port->ignore_status_mask to * indicate the types of events we are interested in receiving. Relevant * ktermios::c_cflag bits are: * * - %CSIZE - word size * - %CSTOPB - 2 stop bits * - %PARENB - parity enable * - %PARODD - odd parity (when %PARENB is in force) * - %ADDRB - address bit (changed through uart_port::rs485_config()). * - %CREAD - enable reception of characters (if not set, still receive * characters from the port, but throw them away). * - %CRTSCTS - if set, enable CTS status change reporting. * - %CLOCAL - if not set, enable modem status change reporting. * * Relevant ktermios::c_iflag bits are: * * - %INPCK - enable frame and parity error events to be passed to the TTY * layer. * - %BRKINT / %PARMRK - both of these enable break events to be passed to * the TTY layer. * - %IGNPAR - ignore parity and framing errors. * - %IGNBRK - ignore break errors. If %IGNPAR is also set, ignore overrun * errors as well. * * The interaction of the ktermios::c_iflag bits is as follows (parity * error given as an example): * * ============ ======= ======= ========================================= * Parity error INPCK IGNPAR * ============ ======= ======= ========================================= * n/a 0 n/a character received, marked as %TTY_NORMAL * None 1 n/a character received, marked as %TTY_NORMAL * Yes 1 0 character received, marked as %TTY_PARITY * Yes 1 1 character discarded * ============ ======= ======= ========================================= * * Other flags may be used (eg, xon/xoff characters) if your hardware * supports hardware "soft" flow control. * * Locking: caller holds tty_port->mutex * Interrupts: caller dependent. * This call must not sleep * * @set_ldisc: ``void ()(struct uart_port *port, struct ktermios *termios)`` * * Notifier for discipline change. 
See * Documentation/driver-api/tty/tty_ldisc.rst. * * Locking: caller holds tty_port->mutex * * @pm: ``void ()(struct uart_port *port, unsigned int state, * unsigned int oldstate)`` * * Perform any power management related activities on the specified @port. * @state indicates the new state (defined by enum uart_pm_state), * @oldstate indicates the previous state. * * This function should not be used to grab any resources. * * This will be called when the @port is initially opened and finally * closed, except when the @port is also the system console. This will * occur even if %CONFIG_PM is not set. * * Locking: none. * Interrupts: caller dependent. * * @type: ``const char *()(struct uart_port *port)`` * * Return a pointer to a string constant describing the specified @port, * or return %NULL, in which case the string 'unknown' is substituted. * * Locking: none. * Interrupts: caller dependent. * * @release_port: ``void ()(struct uart_port *port)`` * * Release any memory and IO region resources currently in use by the * @port. * * Locking: none. * Interrupts: caller dependent. * * @request_port: ``int ()(struct uart_port *port)`` * * Request any memory and IO region resources required by the port. If any * fail, no resources should be registered when this function returns, and * it should return -%EBUSY on failure. * * Locking: none. * Interrupts: caller dependent. * * @config_port: ``void ()(struct uart_port *port, int type)`` * * Perform any autoconfiguration steps required for the @port. @type * contains a bit mask of the required configuration. %UART_CONFIG_TYPE * indicates that the port requires detection and identification. * @port->type should be set to the type found, or %PORT_UNKNOWN if no * port was detected. * * %UART_CONFIG_IRQ indicates autoconfiguration of the interrupt signal, * which should be probed using standard kernel autoprobing techniques. * This is not necessary on platforms where ports have interrupts * internally hard wired (eg, system on a chip implementations). * * Locking: none. * Interrupts: caller dependent. * * @verify_port: ``int ()(struct uart_port *port, * struct serial_struct *serinfo)`` * * Verify the new serial port information contained within @serinfo is * suitable for this port type. * * Locking: none. * Interrupts: caller dependent. * * @ioctl: ``int ()(struct uart_port *port, unsigned int cmd, * unsigned long arg)`` * * Perform any port specific IOCTLs. IOCTL commands must be defined using * the standard numbering system found in <asm/ioctl.h>. * * Locking: none. * Interrupts: caller dependent. * * @poll_init: ``int ()(struct uart_port *port)`` * * Called by kgdb to perform the minimal hardware initialization needed to * support @poll_put_char() and @poll_get_char(). Unlike @startup(), this * should not request interrupts. * * Locking: %tty_mutex and tty_port->mutex taken. * Interrupts: n/a. * * @poll_put_char: ``void ()(struct uart_port *port, unsigned char ch)`` * * Called by kgdb to write a single character @ch directly to the serial * @port. It can and should block until there is space in the TX FIFO. * * Locking: none. * Interrupts: caller dependent. * This call must not sleep * * @poll_get_char: ``int ()(struct uart_port *port)`` * * Called by kgdb to read a single character directly from the serial * port. If data is available, it should be returned; otherwise the * function should return %NO_POLL_CHAR immediately. * * Locking: none. * Interrupts: caller dependent. 
* This call must not sleep */ struct uart_ops { unsigned int (*tx_empty)(struct uart_port *); void (*set_mctrl)(struct uart_port *, unsigned int mctrl); unsigned int (*get_mctrl)(struct uart_port *); void (*stop_tx)(struct uart_port *); void (*start_tx)(struct uart_port *); void (*throttle)(struct uart_port *); void (*unthrottle)(struct uart_port *); void (*send_xchar)(struct uart_port *, char ch); void (*stop_rx)(struct uart_port *); void (*start_rx)(struct uart_port *); void (*enable_ms)(struct uart_port *); void (*break_ctl)(struct uart_port *, int ctl); int (*startup)(struct uart_port *); void (*shutdown)(struct uart_port *); void (*flush_buffer)(struct uart_port *); void (*set_termios)(struct uart_port *, struct ktermios *new, const struct ktermios *old); void (*set_ldisc)(struct uart_port *, struct ktermios *); void (*pm)(struct uart_port *, unsigned int state, unsigned int oldstate); const char *(*type)(struct uart_port *); void (*release_port)(struct uart_port *); int (*request_port)(struct uart_port *); void (*config_port)(struct uart_port *, int); int (*verify_port)(struct uart_port *, struct serial_struct *); int (*ioctl)(struct uart_port *, unsigned int, unsigned long); #ifdef CONFIG_CONSOLE_POLL int (*poll_init)(struct uart_port *); void (*poll_put_char)(struct uart_port *, unsigned char); int (*poll_get_char)(struct uart_port *); #endif }; #define NO_POLL_CHAR 0x00ff0000 #define UART_CONFIG_TYPE (1 << 0) #define UART_CONFIG_IRQ (1 << 1) struct uart_icount { __u32 cts; __u32 dsr; __u32 rng; __u32 dcd; __u32 rx; __u32 tx; __u32 frame; __u32 overrun; __u32 parity; __u32 brk; __u32 buf_overrun; }; typedef u64 __bitwise upf_t; typedef unsigned int __bitwise upstat_t; struct uart_port { spinlock_t lock; /* port lock */ unsigned long iobase; /* in/out[bwl] */ unsigned char __iomem *membase; /* read/write[bwl] */ unsigned int (*serial_in)(struct uart_port *, int); void (*serial_out)(struct uart_port *, int, int); void (*set_termios)(struct uart_port *, struct ktermios *new, const struct ktermios *old); void (*set_ldisc)(struct uart_port *, struct ktermios *); unsigned int (*get_mctrl)(struct uart_port *); void (*set_mctrl)(struct uart_port *, unsigned int); unsigned int (*get_divisor)(struct uart_port *, unsigned int baud, unsigned int *frac); void (*set_divisor)(struct uart_port *, unsigned int baud, unsigned int quot, unsigned int quot_frac); int (*startup)(struct uart_port *port); void (*shutdown)(struct uart_port *port); void (*throttle)(struct uart_port *port); void (*unthrottle)(struct uart_port *port); int (*handle_irq)(struct uart_port *); void (*pm)(struct uart_port *, unsigned int state, unsigned int old); void (*handle_break)(struct uart_port *); int (*rs485_config)(struct uart_port *, struct ktermios *termios, struct serial_rs485 *rs485); int (*iso7816_config)(struct uart_port *, struct serial_iso7816 *iso7816); unsigned int ctrl_id; /* optional serial core controller id */ unsigned int port_id; /* optional serial core port id */ unsigned int irq; /* irq number */ unsigned long irqflags; /* irq flags */ unsigned int uartclk; /* base uart clock */ unsigned int fifosize; /* tx fifo size */ unsigned char x_char; /* xon/xoff char */ unsigned char regshift; /* reg offset shift */ unsigned char iotype; /* io access style */ #define UPIO_UNKNOWN ((unsigned char)~0U) /* UCHAR_MAX */ #define UPIO_PORT (SERIAL_IO_PORT) /* 8b I/O port access */ #define UPIO_HUB6 (SERIAL_IO_HUB6) /* Hub6 ISA card */ #define UPIO_MEM (SERIAL_IO_MEM) /* driver-specific */ #define UPIO_MEM32 
(SERIAL_IO_MEM32) /* 32b little endian */ #define UPIO_AU (SERIAL_IO_AU) /* Au1x00 and RT288x type IO */ #define UPIO_TSI (SERIAL_IO_TSI) /* Tsi108/109 type IO */ #define UPIO_MEM32BE (SERIAL_IO_MEM32BE) /* 32b big endian */ #define UPIO_MEM16 (SERIAL_IO_MEM16) /* 16b little endian */ unsigned char quirks; /* internal quirks */ /* internal quirks must be updated while holding port mutex */ #define UPQ_NO_TXEN_TEST BIT(0) unsigned int read_status_mask; /* driver specific */ unsigned int ignore_status_mask; /* driver specific */ struct uart_state *state; /* pointer to parent state */ struct uart_icount icount; /* statistics */ struct console *cons; /* struct console, if any */ /* flags must be updated while holding port mutex */ upf_t flags; /* * These flags must be equivalent to the flags defined in * include/uapi/linux/tty_flags.h which are the userspace definitions * assigned from the serial_struct flags in uart_set_info() * [for bit definitions in the UPF_CHANGE_MASK] * * Bits [0..ASYNCB_LAST_USER] are userspace defined/visible/changeable * The remaining bits are serial-core specific and not modifiable by * userspace. */ #ifdef CONFIG_HAS_IOPORT #define UPF_FOURPORT ((__force upf_t) ASYNC_FOURPORT /* 1 */ ) #else #define UPF_FOURPORT 0 #endif #define UPF_SAK ((__force upf_t) ASYNC_SAK /* 2 */ ) #define UPF_SPD_HI ((__force upf_t) ASYNC_SPD_HI /* 4 */ ) #define UPF_SPD_VHI ((__force upf_t) ASYNC_SPD_VHI /* 5 */ ) #define UPF_SPD_CUST ((__force upf_t) ASYNC_SPD_CUST /* 0x0030 */ ) #define UPF_SPD_WARP ((__force upf_t) ASYNC_SPD_WARP /* 0x1010 */ ) #define UPF_SPD_MASK ((__force upf_t) ASYNC_SPD_MASK /* 0x1030 */ ) #define UPF_SKIP_TEST ((__force upf_t) ASYNC_SKIP_TEST /* 6 */ ) #define UPF_AUTO_IRQ ((__force upf_t) ASYNC_AUTO_IRQ /* 7 */ ) #define UPF_HARDPPS_CD ((__force upf_t) ASYNC_HARDPPS_CD /* 11 */ ) #define UPF_SPD_SHI ((__force upf_t) ASYNC_SPD_SHI /* 12 */ ) #define UPF_LOW_LATENCY ((__force upf_t) ASYNC_LOW_LATENCY /* 13 */ ) #define UPF_BUGGY_UART ((__force upf_t) ASYNC_BUGGY_UART /* 14 */ ) #define UPF_MAGIC_MULTIPLIER ((__force upf_t) ASYNC_MAGIC_MULTIPLIER /* 16 */ ) #define UPF_NO_THRE_TEST ((__force upf_t) BIT_ULL(19)) /* Port has hardware-assisted h/w flow control */ #define UPF_AUTO_CTS ((__force upf_t) BIT_ULL(20)) #define UPF_AUTO_RTS ((__force upf_t) BIT_ULL(21)) #define UPF_HARD_FLOW ((__force upf_t) (UPF_AUTO_CTS | UPF_AUTO_RTS)) /* Port has hardware-assisted s/w flow control */ #define UPF_SOFT_FLOW ((__force upf_t) BIT_ULL(22)) #define UPF_CONS_FLOW ((__force upf_t) BIT_ULL(23)) #define UPF_SHARE_IRQ ((__force upf_t) BIT_ULL(24)) #define UPF_EXAR_EFR ((__force upf_t) BIT_ULL(25)) #define UPF_BUG_THRE ((__force upf_t) BIT_ULL(26)) /* The exact UART type is known and should not be probed. */ #define UPF_FIXED_TYPE ((__force upf_t) BIT_ULL(27)) #define UPF_BOOT_AUTOCONF ((__force upf_t) BIT_ULL(28)) #define UPF_FIXED_PORT ((__force upf_t) BIT_ULL(29)) #define UPF_DEAD ((__force upf_t) BIT_ULL(30)) #define UPF_IOREMAP ((__force upf_t) BIT_ULL(31)) #define UPF_FULL_PROBE ((__force upf_t) BIT_ULL(32)) #define __UPF_CHANGE_MASK 0x17fff #define UPF_CHANGE_MASK ((__force upf_t) __UPF_CHANGE_MASK) #define UPF_USR_MASK ((__force upf_t) (UPF_SPD_MASK|UPF_LOW_LATENCY)) #if __UPF_CHANGE_MASK > ASYNC_FLAGS #error Change mask not equivalent to userspace-visible bit defines #endif /* * Must hold termios_rwsem, port mutex and port lock to change; * can hold any one lock to read. 
*/ upstat_t status; #define UPSTAT_CTS_ENABLE ((__force upstat_t) (1 << 0)) #define UPSTAT_DCD_ENABLE ((__force upstat_t) (1 << 1)) #define UPSTAT_AUTORTS ((__force upstat_t) (1 << 2)) #define UPSTAT_AUTOCTS ((__force upstat_t) (1 << 3)) #define UPSTAT_AUTOXOFF ((__force upstat_t) (1 << 4)) #define UPSTAT_SYNC_FIFO ((__force upstat_t) (1 << 5)) bool hw_stopped; /* sw-assisted CTS flow state */ unsigned int mctrl; /* current modem ctrl settings */ unsigned int frame_time; /* frame timing in ns */ unsigned int type; /* port type */ const struct uart_ops *ops; unsigned int custom_divisor; unsigned int line; /* port index */ unsigned int minor; resource_size_t mapbase; /* for ioremap */ resource_size_t mapsize; struct device *dev; /* serial port physical parent device */ struct serial_port_device *port_dev; /* serial core port device */ unsigned long sysrq; /* sysrq timeout */ u8 sysrq_ch; /* char for sysrq */ unsigned char has_sysrq; unsigned char sysrq_seq; /* index in sysrq_toggle_seq */ unsigned char hub6; /* this should be in the 8250 driver */ unsigned char suspended; unsigned char console_reinit; const char *name; /* port name */ struct attribute_group *attr_group; /* port specific attributes */ const struct attribute_group **tty_groups; /* all attributes (serial core use only) */ struct serial_rs485 rs485; struct serial_rs485 rs485_supported; /* Supported mask for serial_rs485 */ struct gpio_desc *rs485_term_gpio; /* enable RS485 bus termination */ struct gpio_desc *rs485_rx_during_tx_gpio; /* Output GPIO that sets the state of RS485 RX during TX */ struct serial_iso7816 iso7816; void *private_data; /* generic platform data pointer */ }; /* * Only for console->device_lock()/_unlock() callbacks and internal * port lock wrapper synchronization. */ static inline void __uart_port_lock_irqsave(struct uart_port *up, unsigned long *flags) { spin_lock_irqsave(&up->lock, *flags); } /* * Only for console->device_lock()/_unlock() callbacks and internal * port lock wrapper synchronization. */ static inline void __uart_port_unlock_irqrestore(struct uart_port *up, unsigned long flags) { spin_unlock_irqrestore(&up->lock, flags); } /** * uart_port_set_cons - Safely set the @cons field for a uart * @up: The uart port to set * @con: The new console to set to * * This function must be used to set @up->cons. It uses the port lock to * synchronize with the port lock wrappers in order to ensure that the console * cannot change or disappear while another context is holding the port lock. */ static inline void uart_port_set_cons(struct uart_port *up, struct console *con) { unsigned long flags; __uart_port_lock_irqsave(up, &flags); up->cons = con; __uart_port_unlock_irqrestore(up, flags); } /* Only for internal port lock wrapper usage. */ static inline bool __uart_port_using_nbcon(struct uart_port *up) { lockdep_assert_held_once(&up->lock); if (likely(!uart_console(up))) return false; /* * @up->cons is only modified under the port lock. Therefore it is * certain that it cannot disappear here. * * @up->cons->node is added/removed from the console list under the * port lock. Therefore it is certain that the registration status * cannot change here, thus @up->cons->flags can be read directly. */ if (hlist_unhashed_lockless(&up->cons->node) || !(up->cons->flags & CON_NBCON) || !up->cons->write_atomic) { return false; } return true; } /* Only for internal port lock wrapper usage. 
*/ static inline bool __uart_port_nbcon_try_acquire(struct uart_port *up) { if (!__uart_port_using_nbcon(up)) return true; return nbcon_device_try_acquire(up->cons); } /* Only for internal port lock wrapper usage. */ static inline void __uart_port_nbcon_acquire(struct uart_port *up) { if (!__uart_port_using_nbcon(up)) return; while (!nbcon_device_try_acquire(up->cons)) cpu_relax(); } /* Only for internal port lock wrapper usage. */ static inline void __uart_port_nbcon_release(struct uart_port *up) { if (!__uart_port_using_nbcon(up)) return; nbcon_device_release(up->cons); } /** * uart_port_lock - Lock the UART port * @up: Pointer to UART port structure */ static inline void uart_port_lock(struct uart_port *up) { spin_lock(&up->lock); __uart_port_nbcon_acquire(up); } /** * uart_port_lock_irq - Lock the UART port and disable interrupts * @up: Pointer to UART port structure */ static inline void uart_port_lock_irq(struct uart_port *up) { spin_lock_irq(&up->lock); __uart_port_nbcon_acquire(up); } /** * uart_port_lock_irqsave - Lock the UART port, save and disable interrupts * @up: Pointer to UART port structure * @flags: Pointer to interrupt flags storage */ static inline void uart_port_lock_irqsave(struct uart_port *up, unsigned long *flags) { spin_lock_irqsave(&up->lock, *flags); __uart_port_nbcon_acquire(up); } /** * uart_port_trylock - Try to lock the UART port * @up: Pointer to UART port structure * * Returns: True if lock was acquired, false otherwise */ static inline bool uart_port_trylock(struct uart_port *up) { if (!spin_trylock(&up->lock)) return false; if (!__uart_port_nbcon_try_acquire(up)) { spin_unlock(&up->lock); return false; } return true; } /** * uart_port_trylock_irqsave - Try to lock the UART port, save and disable interrupts * @up: Pointer to UART port structure * @flags: Pointer to interrupt flags storage * * Returns: True if lock was acquired, false otherwise */ static inline bool uart_port_trylock_irqsave(struct uart_port *up, unsigned long *flags) { if (!spin_trylock_irqsave(&up->lock, *flags)) return false; if (!__uart_port_nbcon_try_acquire(up)) { spin_unlock_irqrestore(&up->lock, *flags); return false; } return true; } /** * uart_port_unlock - Unlock the UART port * @up: Pointer to UART port structure */ static inline void uart_port_unlock(struct uart_port *up) { __uart_port_nbcon_release(up); spin_unlock(&up->lock); } /** * uart_port_unlock_irq - Unlock the UART port and re-enable interrupts * @up: Pointer to UART port structure */ static inline void uart_port_unlock_irq(struct uart_port *up) { __uart_port_nbcon_release(up); spin_unlock_irq(&up->lock); } /** * uart_port_unlock_irqrestore - Unlock the UART port, restore interrupts * @up: Pointer to UART port structure * @flags: The saved interrupt flags for restore */ static inline void uart_port_unlock_irqrestore(struct uart_port *up, unsigned long flags) { __uart_port_nbcon_release(up); spin_unlock_irqrestore(&up->lock, flags); } static inline int serial_port_in(struct uart_port *up, int offset) { return up->serial_in(up, offset); } static inline void serial_port_out(struct uart_port *up, int offset, int value) { up->serial_out(up, offset, value); } /** * enum uart_pm_state - power states for UARTs * @UART_PM_STATE_ON: UART is powered, up and operational * @UART_PM_STATE_OFF: UART is powered off * @UART_PM_STATE_UNDEFINED: sentinel */ enum uart_pm_state { UART_PM_STATE_ON = 0, UART_PM_STATE_OFF = 3, /* number taken from ACPI */ UART_PM_STATE_UNDEFINED, }; /* * This is the state information which is persistent 
across opens. */ struct uart_state { struct tty_port port; enum uart_pm_state pm_state; atomic_t refcount; wait_queue_head_t remove_wait; struct uart_port *uart_port; }; #define UART_XMIT_SIZE PAGE_SIZE /* number of characters left in xmit buffer before we ask for more */ #define WAKEUP_CHARS 256 /** * uart_xmit_advance - Advance xmit buffer and account Tx'ed chars * @up: uart_port structure describing the port * @chars: number of characters sent * * This function advances the tail of circular xmit buffer by the number of * @chars transmitted and handles accounting of transmitted bytes (into * @up's icount.tx). */ static inline void uart_xmit_advance(struct uart_port *up, unsigned int chars) { struct tty_port *tport = &up->state->port; kfifo_skip_count(&tport->xmit_fifo, chars); up->icount.tx += chars; } static inline unsigned int uart_fifo_out(struct uart_port *up, unsigned char *buf, unsigned int chars) { struct tty_port *tport = &up->state->port; chars = kfifo_out(&tport->xmit_fifo, buf, chars); up->icount.tx += chars; return chars; } static inline unsigned int uart_fifo_get(struct uart_port *up, unsigned char *ch) { struct tty_port *tport = &up->state->port; unsigned int chars; chars = kfifo_get(&tport->xmit_fifo, ch); up->icount.tx += chars; return chars; } struct module; struct tty_driver; struct uart_driver { struct module *owner; const char *driver_name; const char *dev_name; int major; int minor; int nr; struct console *cons; /* * these are private; the low level driver should not * touch these; they should be initialised to NULL */ struct uart_state *state; struct tty_driver *tty_driver; }; void uart_write_wakeup(struct uart_port *port); /** * enum UART_TX_FLAGS -- flags for uart_port_tx_flags() * * @UART_TX_NOSTOP: don't call port->ops->stop_tx() on empty buffer */ enum UART_TX_FLAGS { UART_TX_NOSTOP = BIT(0), }; #define __uart_port_tx(uport, ch, flags, tx_ready, put_char, tx_done, \ for_test, for_post) \ ({ \ struct uart_port *__port = (uport); \ struct tty_port *__tport = &__port->state->port; \ unsigned int pending; \ \ for (; (for_test) && (tx_ready); (for_post), __port->icount.tx++) { \ if (__port->x_char) { \ (ch) = __port->x_char; \ (put_char); \ __port->x_char = 0; \ continue; \ } \ \ if (uart_tx_stopped(__port)) \ break; \ \ if (!kfifo_get(&__tport->xmit_fifo, &(ch))) \ break; \ \ (put_char); \ } \ \ (tx_done); \ \ pending = kfifo_len(&__tport->xmit_fifo); \ if (pending < WAKEUP_CHARS) { \ uart_write_wakeup(__port); \ \ if (!((flags) & UART_TX_NOSTOP) && pending == 0) \ __port->ops->stop_tx(__port); \ } \ \ pending; \ }) /** * uart_port_tx_limited -- transmit helper for uart_port with count limiting * @port: uart port * @ch: variable to store a character to be written to the HW * @count: a limit of characters to send * @tx_ready: can HW accept more data function * @put_char: function to write a character * @tx_done: function to call after the loop is done * * This helper transmits characters from the xmit buffer to the hardware using * @put_char(). It does so until @count characters are sent and while @tx_ready * evaluates to true. * * Returns: the number of characters in the xmit buffer when done. * * The expression in macro parameters shall be designed as follows: * * **tx_ready:** should evaluate to true if the HW can accept more data to * be sent. This parameter can be %true, which means the HW is always ready. * * **put_char:** shall write @ch to the device of @port. 
* * **tx_done:** when the write loop is done, this can perform arbitrary * action before potential invocation of ops->stop_tx() happens. If the * driver does not need to do anything, use e.g. ({}). * * For all of them, @port->lock is held, interrupts are locally disabled and * the expressions must not sleep. */ #define uart_port_tx_limited(port, ch, count, tx_ready, put_char, tx_done) ({ \ unsigned int __count = (count); \ __uart_port_tx(port, ch, 0, tx_ready, put_char, tx_done, __count, \ __count--); \ }) /** * uart_port_tx_limited_flags -- transmit helper for uart_port with count limiting with flags * @port: uart port * @ch: variable to store a character to be written to the HW * @flags: %UART_TX_NOSTOP or similar * @count: a limit of characters to send * @tx_ready: can HW accept more data function * @put_char: function to write a character * @tx_done: function to call after the loop is done * * See uart_port_tx_limited() for more details. */ #define uart_port_tx_limited_flags(port, ch, flags, count, tx_ready, put_char, tx_done) ({ \ unsigned int __count = (count); \ __uart_port_tx(port, ch, flags, tx_ready, put_char, tx_done, __count, \ __count--); \ }) /** * uart_port_tx -- transmit helper for uart_port * @port: uart port * @ch: variable to store a character to be written to the HW * @tx_ready: can HW accept more data function * @put_char: function to write a character * * See uart_port_tx_limited() for more details. */ #define uart_port_tx(port, ch, tx_ready, put_char) \ __uart_port_tx(port, ch, 0, tx_ready, put_char, ({}), true, ({})) /** * uart_port_tx_flags -- transmit helper for uart_port with flags * @port: uart port * @ch: variable to store a character to be written to the HW * @flags: %UART_TX_NOSTOP or similar * @tx_ready: can HW accept more data function * @put_char: function to write a character * * See uart_port_tx_limited() for more details. */ #define uart_port_tx_flags(port, ch, flags, tx_ready, put_char) \ __uart_port_tx(port, ch, flags, tx_ready, put_char, ({}), true, ({})) /* * Baud rate helpers. */ void uart_update_timeout(struct uart_port *port, unsigned int cflag, unsigned int baud); unsigned int uart_get_baud_rate(struct uart_port *port, struct ktermios *termios, const struct ktermios *old, unsigned int min, unsigned int max); unsigned int uart_get_divisor(struct uart_port *port, unsigned int baud); /* * Calculates FIFO drain time. */ static inline unsigned long uart_fifo_timeout(struct uart_port *port) { u64 fifo_timeout = (u64)READ_ONCE(port->frame_time) * port->fifosize; /* Add .02 seconds of slop */ fifo_timeout += 20 * NSEC_PER_MSEC; return max(nsecs_to_jiffies(fifo_timeout), 1UL); } /* Base timer interval for polling */ static inline unsigned long uart_poll_timeout(struct uart_port *port) { unsigned long timeout = uart_fifo_timeout(port); return timeout > 6 ? (timeout / 2 - 2) : 1; } /* * Console helpers. 
*/ struct earlycon_device { struct console *con; struct uart_port port; char options[32]; /* e.g., 115200n8 */ unsigned int baud; }; struct earlycon_id { char name[15]; char name_term; /* In case compiler didn't '\0' term name */ char compatible[128]; int (*setup)(struct earlycon_device *, const char *options); }; extern const struct earlycon_id __earlycon_table[]; extern const struct earlycon_id __earlycon_table_end[]; #if defined(CONFIG_SERIAL_EARLYCON) && !defined(MODULE) #define EARLYCON_USED_OR_UNUSED __used #else #define EARLYCON_USED_OR_UNUSED __maybe_unused #endif #define OF_EARLYCON_DECLARE(_name, compat, fn) \ static const struct earlycon_id __UNIQUE_ID(__earlycon_##_name) \ EARLYCON_USED_OR_UNUSED __section("__earlycon_table") \ __aligned(__alignof__(struct earlycon_id)) \ = { .name = __stringify(_name), \ .compatible = compat, \ .setup = fn } #define EARLYCON_DECLARE(_name, fn) OF_EARLYCON_DECLARE(_name, "", fn) int of_setup_earlycon(const struct earlycon_id *match, unsigned long node, const char *options); #ifdef CONFIG_SERIAL_EARLYCON extern bool earlycon_acpi_spcr_enable __initdata; int setup_earlycon(char *buf); #else static const bool earlycon_acpi_spcr_enable EARLYCON_USED_OR_UNUSED; static inline int setup_earlycon(char *buf) { return 0; } #endif /* Variant of uart_console_registered() when the console_list_lock is held. */ static inline bool uart_console_registered_locked(struct uart_port *port) { return uart_console(port) && console_is_registered_locked(port->cons); } static inline bool uart_console_registered(struct uart_port *port) { return uart_console(port) && console_is_registered(port->cons); } struct uart_port *uart_get_console(struct uart_port *ports, int nr, struct console *c); int uart_parse_earlycon(char *p, unsigned char *iotype, resource_size_t *addr, char **options); void uart_parse_options(const char *options, int *baud, int *parity, int *bits, int *flow); int uart_set_options(struct uart_port *port, struct console *co, int baud, int parity, int bits, int flow); struct tty_driver *uart_console_device(struct console *co, int *index); void uart_console_write(struct uart_port *port, const char *s, unsigned int count, void (*putchar)(struct uart_port *, unsigned char)); /* * Port/driver registration/removal */ int uart_register_driver(struct uart_driver *uart); void uart_unregister_driver(struct uart_driver *uart); int uart_add_one_port(struct uart_driver *reg, struct uart_port *port); void uart_remove_one_port(struct uart_driver *reg, struct uart_port *port); int uart_read_port_properties(struct uart_port *port); int uart_read_and_validate_port_properties(struct uart_port *port); bool uart_match_port(const struct uart_port *port1, const struct uart_port *port2); /* * Power Management */ int uart_suspend_port(struct uart_driver *reg, struct uart_port *port); int uart_resume_port(struct uart_driver *reg, struct uart_port *port); static inline int uart_tx_stopped(struct uart_port *port) { struct tty_struct *tty = port->state->port.tty; if ((tty && tty->flow.stopped) || port->hw_stopped) return 1; return 0; } static inline bool uart_cts_enabled(struct uart_port *uport) { return !!(uport->status & UPSTAT_CTS_ENABLE); } static inline bool uart_softcts_mode(struct uart_port *uport) { upstat_t mask = UPSTAT_CTS_ENABLE | UPSTAT_AUTOCTS; return ((uport->status & mask) == UPSTAT_CTS_ENABLE); } /* * The following are helper functions for the low level drivers. 
*/ void uart_handle_dcd_change(struct uart_port *uport, bool active); void uart_handle_cts_change(struct uart_port *uport, bool active); void uart_insert_char(struct uart_port *port, unsigned int status, unsigned int overrun, u8 ch, u8 flag); void uart_xchar_out(struct uart_port *uport, int offset); #ifdef CONFIG_MAGIC_SYSRQ_SERIAL #define SYSRQ_TIMEOUT (HZ * 5) bool uart_try_toggle_sysrq(struct uart_port *port, u8 ch); static inline int uart_handle_sysrq_char(struct uart_port *port, u8 ch) { if (!port->sysrq) return 0; if (ch && time_before(jiffies, port->sysrq)) { if (sysrq_mask()) { handle_sysrq(ch); port->sysrq = 0; return 1; } if (uart_try_toggle_sysrq(port, ch)) return 1; } port->sysrq = 0; return 0; } static inline int uart_prepare_sysrq_char(struct uart_port *port, u8 ch) { if (!port->sysrq) return 0; if (ch && time_before(jiffies, port->sysrq)) { if (sysrq_mask()) { port->sysrq_ch = ch; port->sysrq = 0; return 1; } if (uart_try_toggle_sysrq(port, ch)) return 1; } port->sysrq = 0; return 0; } static inline void uart_unlock_and_check_sysrq(struct uart_port *port) { u8 sysrq_ch; if (!port->has_sysrq) { uart_port_unlock(port); return; } sysrq_ch = port->sysrq_ch; port->sysrq_ch = 0; uart_port_unlock(port); if (sysrq_ch) handle_sysrq(sysrq_ch); } static inline void uart_unlock_and_check_sysrq_irqrestore(struct uart_port *port, unsigned long flags) { u8 sysrq_ch; if (!port->has_sysrq) { uart_port_unlock_irqrestore(port, flags); return; } sysrq_ch = port->sysrq_ch; port->sysrq_ch = 0; uart_port_unlock_irqrestore(port, flags); if (sysrq_ch) handle_sysrq(sysrq_ch); } #else /* CONFIG_MAGIC_SYSRQ_SERIAL */ static inline int uart_handle_sysrq_char(struct uart_port *port, u8 ch) { return 0; } static inline int uart_prepare_sysrq_char(struct uart_port *port, u8 ch) { return 0; } static inline void uart_unlock_and_check_sysrq(struct uart_port *port) { uart_port_unlock(port); } static inline void uart_unlock_and_check_sysrq_irqrestore(struct uart_port *port, unsigned long flags) { uart_port_unlock_irqrestore(port, flags); } #endif /* CONFIG_MAGIC_SYSRQ_SERIAL */ /* * We do the SysRQ and SAK checking like this... */ static inline int uart_handle_break(struct uart_port *port) { struct uart_state *state = port->state; if (port->handle_break) port->handle_break(port); #ifdef CONFIG_MAGIC_SYSRQ_SERIAL if (port->has_sysrq && uart_console(port)) { if (!port->sysrq) { port->sysrq = jiffies + SYSRQ_TIMEOUT; return 1; } port->sysrq = 0; } #endif if (port->flags & UPF_SAK) do_SAK(state->port.tty); return 0; } /* * UART_ENABLE_MS - determine if port should enable modem status irqs */ #define UART_ENABLE_MS(port,cflag) ((port)->flags & UPF_HARDPPS_CD || \ (cflag) & CRTSCTS || \ !((cflag) & CLOCAL)) int uart_get_rs485_mode(struct uart_port *port); #endif /* LINUX_SERIAL_CORE_H */
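/*
 * Illustrative sketch (not part of the original header): a hypothetical
 * driver's TX interrupt path using the uart_port_tx() helper and the port
 * lock wrappers documented above. my_uart_tx_room(), my_uart_write_char()
 * and my_uart_irq_is_tx() are made-up accessors for this example only. The
 * helper itself handles the x_char, the uart_tx_stopped() check, pulling
 * bytes from the xmit kfifo, icount.tx accounting and the WAKEUP_CHARS
 * wakeup/stop_tx() logic; the lock wrappers additionally deal with nbcon
 * console ownership when the port is an atomic console.
 */
#if 0	/* example only */
static void my_uart_tx_chars(struct uart_port *port)
{
	u8 ch;

	/* port->lock is held and interrupts are disabled by the caller. */
	uart_port_tx(port, ch,
		     my_uart_tx_room(port) > 0,	/* tx_ready */
		     my_uart_write_char(port, ch));	/* put_char */
}

static irqreturn_t my_uart_irq(int irq, void *dev_id)
{
	struct uart_port *port = dev_id;
	unsigned long flags;

	uart_port_lock_irqsave(port, &flags);
	if (my_uart_irq_is_tx(port))
		my_uart_tx_chars(port);
	uart_port_unlock_irqrestore(port, flags);

	return IRQ_HANDLED;
}
#endif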
/* * hugetlbpage-backed filesystem. Based on ramfs. * * Nadia Yvette Chambers, 2002 * * Copyright (C) 2002 Linus Torvalds. * License: GPL */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/thread_info.h> #include <asm/current.h> #include <linux/falloc.h> #include <linux/fs.h> #include <linux/mount.h> #include <linux/file.h> #include <linux/kernel.h> #include <linux/writeback.h> #include <linux/pagemap.h> #include <linux/highmem.h> #include <linux/init.h> #include <linux/string.h> #include <linux/capability.h> #include <linux/ctype.h> #include <linux/backing-dev.h> #include <linux/hugetlb.h> #include <linux/pagevec.h> #include <linux/fs_parser.h> #include <linux/mman.h> #include <linux/slab.h> #include <linux/dnotify.h> #include <linux/statfs.h> #include <linux/security.h> #include <linux/magic.h> #include <linux/migrate.h> #include <linux/uio.h> #include <linux/uaccess.h> #include <linux/sched/mm.h> #define CREATE_TRACE_POINTS #include <trace/events/hugetlbfs.h> static const struct address_space_operations hugetlbfs_aops; static const struct file_operations hugetlbfs_file_operations; static const struct inode_operations hugetlbfs_dir_inode_operations; static const struct inode_operations hugetlbfs_inode_operations; enum hugetlbfs_size_type { NO_SIZE, SIZE_STD, SIZE_PERCENT }; struct hugetlbfs_fs_context { struct hstate *hstate; unsigned long long max_size_opt; unsigned long long min_size_opt; long max_hpages; long nr_inodes; long min_hpages; enum hugetlbfs_size_type max_val_type; enum hugetlbfs_size_type min_val_type; kuid_t uid; kgid_t gid; umode_t mode; }; int sysctl_hugetlb_shm_group; enum hugetlb_param { Opt_gid, Opt_min_size, Opt_mode, Opt_nr_inodes, Opt_pagesize, Opt_size, Opt_uid, }; static const struct fs_parameter_spec hugetlb_fs_parameters[] = { fsparam_gid ("gid", Opt_gid), fsparam_string("min_size", Opt_min_size), fsparam_u32oct("mode", Opt_mode), fsparam_string("nr_inodes", Opt_nr_inodes), fsparam_string("pagesize", Opt_pagesize), fsparam_string("size", Opt_size), fsparam_uid ("uid", Opt_uid), {} }; /* * Mask used when checking the page offset value passed in via system * calls. This value will be converted to a loff_t which is signed. * Therefore, we want to check the upper PAGE_SHIFT + 1 bits of the * value. The extra bit (- 1 in the shift value) is to take the sign * bit into account. */ #define PGOFF_LOFFT_MAX \ (((1UL << (PAGE_SHIFT + 1)) - 1) << (BITS_PER_LONG - (PAGE_SHIFT + 1))) static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *inode = file_inode(file); struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); loff_t len, vma_len; int ret; struct hstate *h = hstate_file(file); vm_flags_t vm_flags; /* * vma address alignment (but not the pgoff alignment) has * already been checked by prepare_hugepage_range. If you add * any error returns here, do so after setting VM_HUGETLB, so * is_vm_hugetlb_page tests below unmap_region go the right * way when do_mmap unwinds (may be important on powerpc * and ia64). */ vm_flags_set(vma, VM_HUGETLB | VM_DONTEXPAND); vma->vm_ops = &hugetlb_vm_ops; ret = seal_check_write(info->seals, vma); if (ret) return ret; /* * page based offset in vm_pgoff could be sufficiently large to * overflow a loff_t when converted to byte offset.
This can * only happen on architectures where sizeof(loff_t) == * sizeof(unsigned long). So, only check in those instances. */ if (sizeof(unsigned long) == sizeof(loff_t)) { if (vma->vm_pgoff & PGOFF_LOFFT_MAX) return -EINVAL; } /* must be huge page aligned */ if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT)) return -EINVAL; vma_len = (loff_t)(vma->vm_end - vma->vm_start); len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); /* check for overflow */ if (len < vma_len) return -EINVAL; inode_lock(inode); file_accessed(file); ret = -ENOMEM; vm_flags = vma->vm_flags; /* * for SHM_HUGETLB, the pages are reserved in the shmget() call so skip * reserving here. Note: only for SHM hugetlbfs file, the inode * flag S_PRIVATE is set. */ if (inode->i_flags & S_PRIVATE) vm_flags |= VM_NORESERVE; if (!hugetlb_reserve_pages(inode, vma->vm_pgoff >> huge_page_order(h), len >> huge_page_shift(h), vma, vm_flags)) goto out; ret = 0; if (vma->vm_flags & VM_WRITE && inode->i_size < len) i_size_write(inode, len); out: inode_unlock(inode); return ret; } /* * Called under mmap_write_lock(mm). */ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { unsigned long addr0 = 0; struct hstate *h = hstate_file(file); if (len & ~huge_page_mask(h)) return -EINVAL; if (flags & MAP_FIXED) { if (addr & ~huge_page_mask(h)) return -EINVAL; if (prepare_hugepage_range(file, addr, len)) return -EINVAL; } if (addr) addr0 = ALIGN(addr, huge_page_size(h)); return mm_get_unmapped_area_vmflags(current->mm, file, addr0, len, pgoff, flags, 0); } /* * Someone wants to read @bytes from a HWPOISON hugetlb @page from @offset. * Returns the maximum number of bytes one can read without touching the 1st raw * HWPOISON subpage. * * The implementation borrows the iteration logic from copy_page_to_iter*. */ static size_t adjust_range_hwpoison(struct page *page, size_t offset, size_t bytes) { size_t n = 0; size_t res = 0; /* First subpage to start the loop. */ page = nth_page(page, offset / PAGE_SIZE); offset %= PAGE_SIZE; while (1) { if (is_raw_hwpoison_page_in_hugepage(page)) break; /* Safe to read n bytes without touching HWPOISON subpage. */ n = min(bytes, (size_t)PAGE_SIZE - offset); res += n; bytes -= n; if (!bytes || !n) break; offset += n; if (offset == PAGE_SIZE) { page = nth_page(page, 1); offset = 0; } } return res; } /* * Support for read() - Find the page attached to f_mapping and copy out the * data. This provides functionality similar to filemap_read(). */ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; struct hstate *h = hstate_file(file); struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; unsigned long index = iocb->ki_pos >> huge_page_shift(h); unsigned long offset = iocb->ki_pos & ~huge_page_mask(h); unsigned long end_index; loff_t isize; ssize_t retval = 0; while (iov_iter_count(to)) { struct folio *folio; size_t nr, copied, want; /* nr is the maximum number of bytes to copy from this page */ nr = huge_page_size(h); isize = i_size_read(inode); if (!isize) break; end_index = (isize - 1) >> huge_page_shift(h); if (index > end_index) break; if (index == end_index) { nr = ((isize - 1) & ~huge_page_mask(h)) + 1; if (nr <= offset) break; } nr = nr - offset; /* Find the folio */ folio = filemap_lock_hugetlb_folio(h, mapping, index); if (IS_ERR(folio)) { /* * We have a HOLE, zero out the user-buffer for the * length of the hole or request. 
*/ copied = iov_iter_zero(nr, to); } else { folio_unlock(folio); if (!folio_test_hwpoison(folio)) want = nr; else { /* * Adjust how many bytes safe to read without * touching the 1st raw HWPOISON subpage after * offset. */ want = adjust_range_hwpoison(&folio->page, offset, nr); if (want == 0) { folio_put(folio); retval = -EIO; break; } } /* * We have the folio, copy it to user space buffer. */ copied = copy_folio_to_iter(folio, offset, want, to); folio_put(folio); } offset += copied; retval += copied; if (copied != nr && iov_iter_count(to)) { if (!retval) retval = -EFAULT; break; } index += offset >> huge_page_shift(h); offset &= ~huge_page_mask(h); } iocb->ki_pos = ((loff_t)index << huge_page_shift(h)) + offset; return retval; } static int hugetlbfs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { return -EINVAL; } static int hugetlbfs_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) { BUG(); return -EINVAL; } static void hugetlb_delete_from_page_cache(struct folio *folio) { folio_clear_dirty(folio); folio_clear_uptodate(folio); filemap_remove_folio(folio); } /* * Called with i_mmap_rwsem held for inode based vma maps. This makes * sure vma (and vm_mm) will not go away. We also hold the hugetlb fault * mutex for the page in the mapping. So, we can not race with page being * faulted into the vma. */ static bool hugetlb_vma_maps_page(struct vm_area_struct *vma, unsigned long addr, struct page *page) { pte_t *ptep, pte; ptep = hugetlb_walk(vma, addr, huge_page_size(hstate_vma(vma))); if (!ptep) return false; pte = huge_ptep_get(vma->vm_mm, addr, ptep); if (huge_pte_none(pte) || !pte_present(pte)) return false; if (pte_page(pte) == page) return true; return false; } /* * Can vma_offset_start/vma_offset_end overflow on 32-bit arches? * No, because the interval tree returns us only those vmas * which overlap the truncated area starting at pgoff, * and no vma on a 32-bit arch can span beyond the 4GB. */ static unsigned long vma_offset_start(struct vm_area_struct *vma, pgoff_t start) { unsigned long offset = 0; if (vma->vm_pgoff < start) offset = (start - vma->vm_pgoff) << PAGE_SHIFT; return vma->vm_start + offset; } static unsigned long vma_offset_end(struct vm_area_struct *vma, pgoff_t end) { unsigned long t_end; if (!end) return vma->vm_end; t_end = ((end - vma->vm_pgoff) << PAGE_SHIFT) + vma->vm_start; if (t_end > vma->vm_end) t_end = vma->vm_end; return t_end; } /* * Called with hugetlb fault mutex held. Therefore, no more mappings to * this folio can be created while executing the routine. */ static void hugetlb_unmap_file_folio(struct hstate *h, struct address_space *mapping, struct folio *folio, pgoff_t index) { struct rb_root_cached *root = &mapping->i_mmap; struct hugetlb_vma_lock *vma_lock; struct page *page = &folio->page; struct vm_area_struct *vma; unsigned long v_start; unsigned long v_end; pgoff_t start, end; start = index * pages_per_huge_page(h); end = (index + 1) * pages_per_huge_page(h); i_mmap_lock_write(mapping); retry: vma_lock = NULL; vma_interval_tree_foreach(vma, root, start, end - 1) { v_start = vma_offset_start(vma, start); v_end = vma_offset_end(vma, end); if (!hugetlb_vma_maps_page(vma, v_start, page)) continue; if (!hugetlb_vma_trylock_write(vma)) { vma_lock = vma->vm_private_data; /* * If we can not get vma lock, we need to drop * immap_sema and take locks in order. 
First, * take a ref on the vma_lock structure so that * we can be guaranteed it will not go away when * dropping immap_sema. */ kref_get(&vma_lock->refs); break; } unmap_hugepage_range(vma, v_start, v_end, NULL, ZAP_FLAG_DROP_MARKER); hugetlb_vma_unlock_write(vma); } i_mmap_unlock_write(mapping); if (vma_lock) { /* * Wait on vma_lock. We know it is still valid as we have * a reference. We must 'open code' vma locking as we do * not know if vma_lock is still attached to vma. */ down_write(&vma_lock->rw_sema); i_mmap_lock_write(mapping); vma = vma_lock->vma; if (!vma) { /* * If lock is no longer attached to vma, then just * unlock, drop our reference and retry looking for * other vmas. */ up_write(&vma_lock->rw_sema); kref_put(&vma_lock->refs, hugetlb_vma_lock_release); goto retry; } /* * vma_lock is still attached to vma. Check to see if vma * still maps page and if so, unmap. */ v_start = vma_offset_start(vma, start); v_end = vma_offset_end(vma, end); if (hugetlb_vma_maps_page(vma, v_start, page)) unmap_hugepage_range(vma, v_start, v_end, NULL, ZAP_FLAG_DROP_MARKER); kref_put(&vma_lock->refs, hugetlb_vma_lock_release); hugetlb_vma_unlock_write(vma); goto retry; } } static void hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end, zap_flags_t zap_flags) { struct vm_area_struct *vma; /* * end == 0 indicates that the entire range after start should be * unmapped. Note, end is exclusive, whereas the interval tree takes * an inclusive "last". */ vma_interval_tree_foreach(vma, root, start, end ? end - 1 : ULONG_MAX) { unsigned long v_start; unsigned long v_end; if (!hugetlb_vma_trylock_write(vma)) continue; v_start = vma_offset_start(vma, start); v_end = vma_offset_end(vma, end); unmap_hugepage_range(vma, v_start, v_end, NULL, zap_flags); /* * Note that vma lock only exists for shared/non-private * vmas. Therefore, lock is not held when calling * unmap_hugepage_range for private vmas. */ hugetlb_vma_unlock_write(vma); } } /* * Called with hugetlb fault mutex held. * Returns true if page was actually removed, false otherwise. */ static bool remove_inode_single_folio(struct hstate *h, struct inode *inode, struct address_space *mapping, struct folio *folio, pgoff_t index, bool truncate_op) { bool ret = false; /* * If folio is mapped, it was faulted in after being * unmapped in caller. Unmap (again) while holding * the fault mutex. The mutex will prevent faults * until we finish removing the folio. */ if (unlikely(folio_mapped(folio))) hugetlb_unmap_file_folio(h, mapping, folio, index); folio_lock(folio); /* * We must remove the folio from page cache before removing * the region/ reserve map (hugetlb_unreserve_pages). In * rare out of memory conditions, removal of the region/reserve * map could fail. Correspondingly, the subpool and global * reserve usage count can need to be adjusted. */ VM_BUG_ON_FOLIO(folio_test_hugetlb_restore_reserve(folio), folio); hugetlb_delete_from_page_cache(folio); ret = true; if (!truncate_op) { if (unlikely(hugetlb_unreserve_pages(inode, index, index + 1, 1))) hugetlb_fix_reserve_counts(inode); } folio_unlock(folio); return ret; } /* * remove_inode_hugepages handles two distinct cases: truncation and hole * punch. There are subtle differences in operation for each case. * * truncation is indicated by end of range being LLONG_MAX * In this case, we first scan the range and release found pages. * After releasing pages, hugetlb_unreserve_pages cleans up region/reserve * maps and global counts. Page faults can race with truncation. 
* During faults, hugetlb_no_page() checks i_size before page allocation, * and again after obtaining page table lock. It will 'back out' * allocations in the truncated range. * hole punch is indicated if end is not LLONG_MAX * In the hole punch case we scan the range and release found pages. * Only when releasing a page is the associated region/reserve map * deleted. The region/reserve map for ranges without associated * pages are not modified. Page faults can race with hole punch. * This is indicated if we find a mapped page. * Note: If the passed end of range value is beyond the end of file, but * not LLONG_MAX this routine still performs a hole punch operation. */ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, loff_t lend) { struct hstate *h = hstate_inode(inode); struct address_space *mapping = &inode->i_data; const pgoff_t end = lend >> PAGE_SHIFT; struct folio_batch fbatch; pgoff_t next, index; int i, freed = 0; bool truncate_op = (lend == LLONG_MAX); folio_batch_init(&fbatch); next = lstart >> PAGE_SHIFT; while (filemap_get_folios(mapping, &next, end - 1, &fbatch)) { for (i = 0; i < folio_batch_count(&fbatch); ++i) { struct folio *folio = fbatch.folios[i]; u32 hash = 0; index = folio->index >> huge_page_order(h); hash = hugetlb_fault_mutex_hash(mapping, index); mutex_lock(&hugetlb_fault_mutex_table[hash]); /* * Remove folio that was part of folio_batch. */ if (remove_inode_single_folio(h, inode, mapping, folio, index, truncate_op)) freed++; mutex_unlock(&hugetlb_fault_mutex_table[hash]); } folio_batch_release(&fbatch); cond_resched(); } if (truncate_op) (void)hugetlb_unreserve_pages(inode, lstart >> huge_page_shift(h), LONG_MAX, freed); } static void hugetlbfs_evict_inode(struct inode *inode) { struct resv_map *resv_map; trace_hugetlbfs_evict_inode(inode); remove_inode_hugepages(inode, 0, LLONG_MAX); /* * Get the resv_map from the address space embedded in the inode. * This is the address space which points to any resv_map allocated * at inode creation time. If this is a device special inode, * i_mapping may not point to the original address space. 
*/ resv_map = (struct resv_map *)(&inode->i_data)->i_private_data; /* Only regular and link inodes have associated reserve maps */ if (resv_map) resv_map_release(&resv_map->refs); clear_inode(inode); } static void hugetlb_vmtruncate(struct inode *inode, loff_t offset) { pgoff_t pgoff; struct address_space *mapping = inode->i_mapping; struct hstate *h = hstate_inode(inode); BUG_ON(offset & ~huge_page_mask(h)); pgoff = offset >> PAGE_SHIFT; i_size_write(inode, offset); i_mmap_lock_write(mapping); if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)) hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0, ZAP_FLAG_DROP_MARKER); i_mmap_unlock_write(mapping); remove_inode_hugepages(inode, offset, LLONG_MAX); } static void hugetlbfs_zero_partial_page(struct hstate *h, struct address_space *mapping, loff_t start, loff_t end) { pgoff_t idx = start >> huge_page_shift(h); struct folio *folio; folio = filemap_lock_hugetlb_folio(h, mapping, idx); if (IS_ERR(folio)) return; start = start & ~huge_page_mask(h); end = end & ~huge_page_mask(h); if (!end) end = huge_page_size(h); folio_zero_segment(folio, (size_t)start, (size_t)end); folio_unlock(folio); folio_put(folio); } static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) { struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); struct address_space *mapping = inode->i_mapping; struct hstate *h = hstate_inode(inode); loff_t hpage_size = huge_page_size(h); loff_t hole_start, hole_end; /* * hole_start and hole_end indicate the full pages within the hole. */ hole_start = round_up(offset, hpage_size); hole_end = round_down(offset + len, hpage_size); inode_lock(inode); /* protected by i_rwsem */ if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) { inode_unlock(inode); return -EPERM; } i_mmap_lock_write(mapping); /* If range starts before first full page, zero partial page. */ if (offset < hole_start) hugetlbfs_zero_partial_page(h, mapping, offset, min(offset + len, hole_start)); /* Unmap users of full pages in the hole. */ if (hole_end > hole_start) { if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)) hugetlb_vmdelete_list(&mapping->i_mmap, hole_start >> PAGE_SHIFT, hole_end >> PAGE_SHIFT, 0); } /* If range extends beyond last full page, zero partial page. */ if ((offset + len) > hole_end && (offset + len) > hole_start) hugetlbfs_zero_partial_page(h, mapping, hole_end, offset + len); i_mmap_unlock_write(mapping); /* Remove full pages from the file. */ if (hole_end > hole_start) remove_inode_hugepages(inode, hole_start, hole_end); inode_unlock(inode); return 0; } static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); struct address_space *mapping = inode->i_mapping; struct hstate *h = hstate_inode(inode); struct vm_area_struct pseudo_vma; struct mm_struct *mm = current->mm; loff_t hpage_size = huge_page_size(h); unsigned long hpage_shift = huge_page_shift(h); pgoff_t start, index, end; int error; u32 hash; if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) return -EOPNOTSUPP; if (mode & FALLOC_FL_PUNCH_HOLE) { error = hugetlbfs_punch_hole(inode, offset, len); goto out_nolock; } /* * Default preallocate case. * For this range, start is rounded down and end is rounded up * as well as being converted to page offsets. 
*/ start = offset >> hpage_shift; end = (offset + len + hpage_size - 1) >> hpage_shift; inode_lock(inode); /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */ error = inode_newsize_ok(inode, offset + len); if (error) goto out; if ((info->seals & F_SEAL_GROW) && offset + len > inode->i_size) { error = -EPERM; goto out; } /* * Initialize a pseudo vma as this is required by the huge page * allocation routines. */ vma_init(&pseudo_vma, mm); vm_flags_init(&pseudo_vma, VM_HUGETLB | VM_MAYSHARE | VM_SHARED); pseudo_vma.vm_file = file; for (index = start; index < end; index++) { /* * This is supposed to be the vaddr where the page is being * faulted in, but we have no vaddr here. */ struct folio *folio; unsigned long addr; cond_resched(); /* * fallocate(2) manpage permits EINTR; we may have been * interrupted because we are using up too much memory. */ if (signal_pending(current)) { error = -EINTR; break; } /* addr is the offset within the file (zero based) */ addr = index * hpage_size; /* mutex taken here, fault path and hole punch */ hash = hugetlb_fault_mutex_hash(mapping, index); mutex_lock(&hugetlb_fault_mutex_table[hash]); /* See if already present in mapping to avoid alloc/free */ folio = filemap_get_folio(mapping, index << huge_page_order(h)); if (!IS_ERR(folio)) { folio_put(folio); mutex_unlock(&hugetlb_fault_mutex_table[hash]); continue; } /* * Allocate folio without setting the avoid_reserve argument. * There certainly are no reserves associated with the * pseudo_vma. However, there could be shared mappings with * reserves for the file at the inode level. If we fallocate * folios in these areas, we need to consume the reserves * to keep reservation accounting consistent. */ folio = alloc_hugetlb_folio(&pseudo_vma, addr, 0); if (IS_ERR(folio)) { mutex_unlock(&hugetlb_fault_mutex_table[hash]); error = PTR_ERR(folio); goto out; } folio_zero_user(folio, ALIGN_DOWN(addr, hpage_size)); __folio_mark_uptodate(folio); error = hugetlb_add_to_page_cache(folio, mapping, index); if (unlikely(error)) { restore_reserve_on_error(h, &pseudo_vma, addr, folio); folio_put(folio); mutex_unlock(&hugetlb_fault_mutex_table[hash]); goto out; } mutex_unlock(&hugetlb_fault_mutex_table[hash]); folio_set_hugetlb_migratable(folio); /* * folio_unlock because locked by hugetlb_add_to_page_cache() * folio_put() due to reference from alloc_hugetlb_folio() */ folio_unlock(folio); folio_put(folio); } if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) i_size_write(inode, offset + len); inode_set_ctime_current(inode); out: inode_unlock(inode); out_nolock: trace_hugetlbfs_fallocate(inode, mode, offset, len, error); return error; } static int hugetlbfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); struct hstate *h = hstate_inode(inode); int error; unsigned int ia_valid = attr->ia_valid; struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); error = setattr_prepare(idmap, dentry, attr); if (error) return error; trace_hugetlbfs_setattr(inode, dentry, attr); if (ia_valid & ATTR_SIZE) { loff_t oldsize = inode->i_size; loff_t newsize = attr->ia_size; if (newsize & ~huge_page_mask(h)) return -EINVAL; /* protected by i_rwsem */ if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) || (newsize > oldsize && (info->seals & F_SEAL_GROW))) return -EPERM; hugetlb_vmtruncate(inode, newsize); } setattr_copy(idmap, inode, attr); mark_inode_dirty(inode); return 0; } static struct inode *hugetlbfs_get_root(struct super_block *sb, struct 
hugetlbfs_fs_context *ctx) { struct inode *inode; inode = new_inode(sb); if (inode) { inode->i_ino = get_next_ino(); inode->i_mode = S_IFDIR | ctx->mode; inode->i_uid = ctx->uid; inode->i_gid = ctx->gid; simple_inode_init_ts(inode); inode->i_op = &hugetlbfs_dir_inode_operations; inode->i_fop = &simple_dir_operations; /* directory inodes start off with i_nlink == 2 (for "." entry) */ inc_nlink(inode); lockdep_annotate_inode_mutex_key(inode); } return inode; } /* * Hugetlbfs is not reclaimable; therefore its i_mmap_rwsem will never * be taken from reclaim -- unlike regular filesystems. This needs an * annotation because huge_pmd_share() does an allocation under hugetlb's * i_mmap_rwsem. */ static struct lock_class_key hugetlbfs_i_mmap_rwsem_key; static struct inode *hugetlbfs_get_inode(struct super_block *sb, struct mnt_idmap *idmap, struct inode *dir, umode_t mode, dev_t dev) { struct inode *inode; struct resv_map *resv_map = NULL; /* * Reserve maps are only needed for inodes that can have associated * page allocations. */ if (S_ISREG(mode) || S_ISLNK(mode)) { resv_map = resv_map_alloc(); if (!resv_map) return NULL; } inode = new_inode(sb); if (inode) { struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); inode->i_ino = get_next_ino(); inode_init_owner(idmap, inode, dir, mode); lockdep_set_class(&inode->i_mapping->i_mmap_rwsem, &hugetlbfs_i_mmap_rwsem_key); inode->i_mapping->a_ops = &hugetlbfs_aops; simple_inode_init_ts(inode); inode->i_mapping->i_private_data = resv_map; info->seals = F_SEAL_SEAL; switch (mode & S_IFMT) { default: init_special_inode(inode, mode, dev); break; case S_IFREG: inode->i_op = &hugetlbfs_inode_operations; inode->i_fop = &hugetlbfs_file_operations; break; case S_IFDIR: inode->i_op = &hugetlbfs_dir_inode_operations; inode->i_fop = &simple_dir_operations; /* directory inodes start off with i_nlink == 2 (for "." entry) */ inc_nlink(inode); break; case S_IFLNK: inode->i_op = &page_symlink_inode_operations; inode_nohighmem(inode); break; } lockdep_annotate_inode_mutex_key(inode); trace_hugetlbfs_alloc_inode(inode, dir, mode); } else { if (resv_map) kref_put(&resv_map->refs, resv_map_release); } return inode; } /* * File creation. Allocate an inode, and we're done.. 
*/ static int hugetlbfs_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { struct inode *inode; inode = hugetlbfs_get_inode(dir->i_sb, idmap, dir, mode, dev); if (!inode) return -ENOSPC; inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); d_instantiate(dentry, inode); dget(dentry);/* Extra count - pin the dentry in core */ return 0; } static int hugetlbfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { int retval = hugetlbfs_mknod(idmap, dir, dentry, mode | S_IFDIR, 0); if (!retval) inc_nlink(dir); return retval; } static int hugetlbfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { return hugetlbfs_mknod(idmap, dir, dentry, mode | S_IFREG, 0); } static int hugetlbfs_tmpfile(struct mnt_idmap *idmap, struct inode *dir, struct file *file, umode_t mode) { struct inode *inode; inode = hugetlbfs_get_inode(dir->i_sb, idmap, dir, mode | S_IFREG, 0); if (!inode) return -ENOSPC; inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); d_tmpfile(file, inode); return finish_open_simple(file, 0); } static int hugetlbfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { const umode_t mode = S_IFLNK|S_IRWXUGO; struct inode *inode; int error = -ENOSPC; inode = hugetlbfs_get_inode(dir->i_sb, idmap, dir, mode, 0); if (inode) { int l = strlen(symname)+1; error = page_symlink(inode, symname, l); if (!error) { d_instantiate(dentry, inode); dget(dentry); } else iput(inode); } inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); return error; } #ifdef CONFIG_MIGRATION static int hugetlbfs_migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode) { int rc; rc = migrate_huge_page_move_mapping(mapping, dst, src); if (rc != MIGRATEPAGE_SUCCESS) return rc; if (hugetlb_folio_subpool(src)) { hugetlb_set_folio_subpool(dst, hugetlb_folio_subpool(src)); hugetlb_set_folio_subpool(src, NULL); } folio_migrate_flags(dst, src); return MIGRATEPAGE_SUCCESS; } #else #define hugetlbfs_migrate_folio NULL #endif static int hugetlbfs_error_remove_folio(struct address_space *mapping, struct folio *folio) { return 0; } /* * Display the mount options in /proc/mounts. 
*/ static int hugetlbfs_show_options(struct seq_file *m, struct dentry *root) { struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(root->d_sb); struct hugepage_subpool *spool = sbinfo->spool; unsigned long hpage_size = huge_page_size(sbinfo->hstate); unsigned hpage_shift = huge_page_shift(sbinfo->hstate); char mod; if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID)) seq_printf(m, ",uid=%u", from_kuid_munged(&init_user_ns, sbinfo->uid)); if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID)) seq_printf(m, ",gid=%u", from_kgid_munged(&init_user_ns, sbinfo->gid)); if (sbinfo->mode != 0755) seq_printf(m, ",mode=%o", sbinfo->mode); if (sbinfo->max_inodes != -1) seq_printf(m, ",nr_inodes=%lu", sbinfo->max_inodes); hpage_size /= 1024; mod = 'K'; if (hpage_size >= 1024) { hpage_size /= 1024; mod = 'M'; } seq_printf(m, ",pagesize=%lu%c", hpage_size, mod); if (spool) { if (spool->max_hpages != -1) seq_printf(m, ",size=%llu", (unsigned long long)spool->max_hpages << hpage_shift); if (spool->min_hpages != -1) seq_printf(m, ",min_size=%llu", (unsigned long long)spool->min_hpages << hpage_shift); } return 0; } static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb); struct hstate *h = hstate_inode(d_inode(dentry)); u64 id = huge_encode_dev(dentry->d_sb->s_dev); buf->f_fsid = u64_to_fsid(id); buf->f_type = HUGETLBFS_MAGIC; buf->f_bsize = huge_page_size(h); if (sbinfo) { spin_lock(&sbinfo->stat_lock); /* If no limits set, just report 0 or -1 for max/free/used * blocks, like simple_statfs() */ if (sbinfo->spool) { long free_pages; spin_lock_irq(&sbinfo->spool->lock); buf->f_blocks = sbinfo->spool->max_hpages; free_pages = sbinfo->spool->max_hpages - sbinfo->spool->used_hpages; buf->f_bavail = buf->f_bfree = free_pages; spin_unlock_irq(&sbinfo->spool->lock); buf->f_files = sbinfo->max_inodes; buf->f_ffree = sbinfo->free_inodes; } spin_unlock(&sbinfo->stat_lock); } buf->f_namelen = NAME_MAX; return 0; } static void hugetlbfs_put_super(struct super_block *sb) { struct hugetlbfs_sb_info *sbi = HUGETLBFS_SB(sb); if (sbi) { sb->s_fs_info = NULL; if (sbi->spool) hugepage_put_subpool(sbi->spool); kfree(sbi); } } static inline int hugetlbfs_dec_free_inodes(struct hugetlbfs_sb_info *sbinfo) { if (sbinfo->free_inodes >= 0) { spin_lock(&sbinfo->stat_lock); if (unlikely(!sbinfo->free_inodes)) { spin_unlock(&sbinfo->stat_lock); return 0; } sbinfo->free_inodes--; spin_unlock(&sbinfo->stat_lock); } return 1; } static void hugetlbfs_inc_free_inodes(struct hugetlbfs_sb_info *sbinfo) { if (sbinfo->free_inodes >= 0) { spin_lock(&sbinfo->stat_lock); sbinfo->free_inodes++; spin_unlock(&sbinfo->stat_lock); } } static struct kmem_cache *hugetlbfs_inode_cachep; static struct inode *hugetlbfs_alloc_inode(struct super_block *sb) { struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(sb); struct hugetlbfs_inode_info *p; if (unlikely(!hugetlbfs_dec_free_inodes(sbinfo))) return NULL; p = alloc_inode_sb(sb, hugetlbfs_inode_cachep, GFP_KERNEL); if (unlikely(!p)) { hugetlbfs_inc_free_inodes(sbinfo); return NULL; } return &p->vfs_inode; } static void hugetlbfs_free_inode(struct inode *inode) { trace_hugetlbfs_free_inode(inode); kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode)); } static void hugetlbfs_destroy_inode(struct inode *inode) { hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb)); } static const struct address_space_operations hugetlbfs_aops = { .write_begin = hugetlbfs_write_begin, .write_end = hugetlbfs_write_end, .dirty_folio = noop_dirty_folio, .migrate_folio = 
hugetlbfs_migrate_folio, .error_remove_folio = hugetlbfs_error_remove_folio, }; static void init_once(void *foo) { struct hugetlbfs_inode_info *ei = foo; inode_init_once(&ei->vfs_inode); } static const struct file_operations hugetlbfs_file_operations = { .read_iter = hugetlbfs_read_iter, .mmap = hugetlbfs_file_mmap, .fsync = noop_fsync, .get_unmapped_area = hugetlb_get_unmapped_area, .llseek = default_llseek, .fallocate = hugetlbfs_fallocate, .fop_flags = FOP_HUGE_PAGES, }; static const struct inode_operations hugetlbfs_dir_inode_operations = { .create = hugetlbfs_create, .lookup = simple_lookup, .link = simple_link, .unlink = simple_unlink, .symlink = hugetlbfs_symlink, .mkdir = hugetlbfs_mkdir, .rmdir = simple_rmdir, .mknod = hugetlbfs_mknod, .rename = simple_rename, .setattr = hugetlbfs_setattr, .tmpfile = hugetlbfs_tmpfile, }; static const struct inode_operations hugetlbfs_inode_operations = { .setattr = hugetlbfs_setattr, }; static const struct super_operations hugetlbfs_ops = { .alloc_inode = hugetlbfs_alloc_inode, .free_inode = hugetlbfs_free_inode, .destroy_inode = hugetlbfs_destroy_inode, .evict_inode = hugetlbfs_evict_inode, .statfs = hugetlbfs_statfs, .put_super = hugetlbfs_put_super, .show_options = hugetlbfs_show_options, }; /* * Convert size option passed from command line to number of huge pages * in the pool specified by hstate. Size option could be in bytes * (val_type == SIZE_STD) or percentage of the pool (val_type == SIZE_PERCENT). */ static long hugetlbfs_size_to_hpages(struct hstate *h, unsigned long long size_opt, enum hugetlbfs_size_type val_type) { if (val_type == NO_SIZE) return -1; if (val_type == SIZE_PERCENT) { size_opt <<= huge_page_shift(h); size_opt *= h->max_huge_pages; do_div(size_opt, 100); } size_opt >>= huge_page_shift(h); return size_opt; } /* * Parse one mount parameter. */ static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct hugetlbfs_fs_context *ctx = fc->fs_private; struct fs_parse_result result; struct hstate *h; char *rest; unsigned long ps; int opt; opt = fs_parse(fc, hugetlb_fs_parameters, param, &result); if (opt < 0) return opt; switch (opt) { case Opt_uid: ctx->uid = result.uid; return 0; case Opt_gid: ctx->gid = result.gid; return 0; case Opt_mode: ctx->mode = result.uint_32 & 01777U; return 0; case Opt_size: /* memparse() will accept a K/M/G without a digit */ if (!param->string || !isdigit(param->string[0])) goto bad_val; ctx->max_size_opt = memparse(param->string, &rest); ctx->max_val_type = SIZE_STD; if (*rest == '%') ctx->max_val_type = SIZE_PERCENT; return 0; case Opt_nr_inodes: /* memparse() will accept a K/M/G without a digit */ if (!param->string || !isdigit(param->string[0])) goto bad_val; ctx->nr_inodes = memparse(param->string, &rest); return 0; case Opt_pagesize: ps = memparse(param->string, &rest); h = size_to_hstate(ps); if (!h) { pr_err("Unsupported page size %lu MB\n", ps / SZ_1M); return -EINVAL; } ctx->hstate = h; return 0; case Opt_min_size: /* memparse() will accept a K/M/G without a digit */ if (!param->string || !isdigit(param->string[0])) goto bad_val; ctx->min_size_opt = memparse(param->string, &rest); ctx->min_val_type = SIZE_STD; if (*rest == '%') ctx->min_val_type = SIZE_PERCENT; return 0; default: return -EINVAL; } bad_val: return invalfc(fc, "Bad value '%s' for mount option '%s'\n", param->string, param->key); } /* * Validate the parsed options. 
*/ static int hugetlbfs_validate(struct fs_context *fc) { struct hugetlbfs_fs_context *ctx = fc->fs_private; /* * Use huge page pool size (in hstate) to convert the size * options to number of huge pages. If NO_SIZE, -1 is returned. */ ctx->max_hpages = hugetlbfs_size_to_hpages(ctx->hstate, ctx->max_size_opt, ctx->max_val_type); ctx->min_hpages = hugetlbfs_size_to_hpages(ctx->hstate, ctx->min_size_opt, ctx->min_val_type); /* * If max_size was specified, then min_size must be smaller */ if (ctx->max_val_type > NO_SIZE && ctx->min_hpages > ctx->max_hpages) { pr_err("Minimum size can not be greater than maximum size\n"); return -EINVAL; } return 0; } static int hugetlbfs_fill_super(struct super_block *sb, struct fs_context *fc) { struct hugetlbfs_fs_context *ctx = fc->fs_private; struct hugetlbfs_sb_info *sbinfo; sbinfo = kmalloc(sizeof(struct hugetlbfs_sb_info), GFP_KERNEL); if (!sbinfo) return -ENOMEM; sb->s_fs_info = sbinfo; spin_lock_init(&sbinfo->stat_lock); sbinfo->hstate = ctx->hstate; sbinfo->max_inodes = ctx->nr_inodes; sbinfo->free_inodes = ctx->nr_inodes; sbinfo->spool = NULL; sbinfo->uid = ctx->uid; sbinfo->gid = ctx->gid; sbinfo->mode = ctx->mode; /* * Allocate and initialize subpool if maximum or minimum size is * specified. Any needed reservations (for minimum size) are taken * when the subpool is created. */ if (ctx->max_hpages != -1 || ctx->min_hpages != -1) { sbinfo->spool = hugepage_new_subpool(ctx->hstate, ctx->max_hpages, ctx->min_hpages); if (!sbinfo->spool) goto out_free; } sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_blocksize = huge_page_size(ctx->hstate); sb->s_blocksize_bits = huge_page_shift(ctx->hstate); sb->s_magic = HUGETLBFS_MAGIC; sb->s_op = &hugetlbfs_ops; sb->s_time_gran = 1; /* * Due to the special and limited functionality of hugetlbfs, it does * not work well as a stacking filesystem. 
*/ sb->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH; sb->s_root = d_make_root(hugetlbfs_get_root(sb, ctx)); if (!sb->s_root) goto out_free; return 0; out_free: kfree(sbinfo->spool); kfree(sbinfo); return -ENOMEM; } static int hugetlbfs_get_tree(struct fs_context *fc) { int err = hugetlbfs_validate(fc); if (err) return err; return get_tree_nodev(fc, hugetlbfs_fill_super); } static void hugetlbfs_fs_context_free(struct fs_context *fc) { kfree(fc->fs_private); } static const struct fs_context_operations hugetlbfs_fs_context_ops = { .free = hugetlbfs_fs_context_free, .parse_param = hugetlbfs_parse_param, .get_tree = hugetlbfs_get_tree, }; static int hugetlbfs_init_fs_context(struct fs_context *fc) { struct hugetlbfs_fs_context *ctx; ctx = kzalloc(sizeof(struct hugetlbfs_fs_context), GFP_KERNEL); if (!ctx) return -ENOMEM; ctx->max_hpages = -1; /* No limit on size by default */ ctx->nr_inodes = -1; /* No limit on number of inodes by default */ ctx->uid = current_fsuid(); ctx->gid = current_fsgid(); ctx->mode = 0755; ctx->hstate = &default_hstate; ctx->min_hpages = -1; /* No default minimum size */ ctx->max_val_type = NO_SIZE; ctx->min_val_type = NO_SIZE; fc->fs_private = ctx; fc->ops = &hugetlbfs_fs_context_ops; return 0; } static struct file_system_type hugetlbfs_fs_type = { .name = "hugetlbfs", .init_fs_context = hugetlbfs_init_fs_context, .parameters = hugetlb_fs_parameters, .kill_sb = kill_litter_super, .fs_flags = FS_ALLOW_IDMAP, }; static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE]; static int can_do_hugetlb_shm(void) { kgid_t shm_group; shm_group = make_kgid(&init_user_ns, sysctl_hugetlb_shm_group); return capable(CAP_IPC_LOCK) || in_group_p(shm_group); } static int get_hstate_idx(int page_size_log) { struct hstate *h = hstate_sizelog(page_size_log); if (!h) return -1; return hstate_index(h); } /* * Note that size should be aligned to proper hugepage size in caller side, * otherwise hugetlb_reserve_pages reserves one less hugepages than intended. */ struct file *hugetlb_file_setup(const char *name, size_t size, vm_flags_t acctflag, int creat_flags, int page_size_log) { struct inode *inode; struct vfsmount *mnt; int hstate_idx; struct file *file; hstate_idx = get_hstate_idx(page_size_log); if (hstate_idx < 0) return ERR_PTR(-ENODEV); mnt = hugetlbfs_vfsmount[hstate_idx]; if (!mnt) return ERR_PTR(-ENOENT); if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) { struct ucounts *ucounts = current_ucounts(); if (user_shm_lock(size, ucounts)) { pr_warn_once("%s (%d): Using mlock ulimits for SHM_HUGETLB is obsolete\n", current->comm, current->pid); user_shm_unlock(size, ucounts); } return ERR_PTR(-EPERM); } file = ERR_PTR(-ENOSPC); /* hugetlbfs_vfsmount[] mounts do not use idmapped mounts. 
*/ inode = hugetlbfs_get_inode(mnt->mnt_sb, &nop_mnt_idmap, NULL, S_IFREG | S_IRWXUGO, 0); if (!inode) goto out; if (creat_flags == HUGETLB_SHMFS_INODE) inode->i_flags |= S_PRIVATE; inode->i_size = size; clear_nlink(inode); if (!hugetlb_reserve_pages(inode, 0, size >> huge_page_shift(hstate_inode(inode)), NULL, acctflag)) file = ERR_PTR(-ENOMEM); else file = alloc_file_pseudo(inode, mnt, name, O_RDWR, &hugetlbfs_file_operations); if (!IS_ERR(file)) return file; iput(inode); out: return file; } static struct vfsmount *__init mount_one_hugetlbfs(struct hstate *h) { struct fs_context *fc; struct vfsmount *mnt; fc = fs_context_for_mount(&hugetlbfs_fs_type, SB_KERNMOUNT); if (IS_ERR(fc)) { mnt = ERR_CAST(fc); } else { struct hugetlbfs_fs_context *ctx = fc->fs_private; ctx->hstate = h; mnt = fc_mount(fc); put_fs_context(fc); } if (IS_ERR(mnt)) pr_err("Cannot mount internal hugetlbfs for page size %luK", huge_page_size(h) / SZ_1K); return mnt; } static int __init init_hugetlbfs_fs(void) { struct vfsmount *mnt; struct hstate *h; int error; int i; if (!hugepages_supported()) { pr_info("disabling because there are no supported hugepage sizes\n"); return -ENOTSUPP; } error = -ENOMEM; hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache", sizeof(struct hugetlbfs_inode_info), 0, SLAB_ACCOUNT, init_once); if (hugetlbfs_inode_cachep == NULL) goto out; error = register_filesystem(&hugetlbfs_fs_type); if (error) goto out_free; /* default hstate mount is required */ mnt = mount_one_hugetlbfs(&default_hstate); if (IS_ERR(mnt)) { error = PTR_ERR(mnt); goto out_unreg; } hugetlbfs_vfsmount[default_hstate_idx] = mnt; /* other hstates are optional */ i = 0; for_each_hstate(h) { if (i == default_hstate_idx) { i++; continue; } mnt = mount_one_hugetlbfs(h); if (IS_ERR(mnt)) hugetlbfs_vfsmount[i] = NULL; else hugetlbfs_vfsmount[i] = mnt; i++; } return 0; out_unreg: (void)unregister_filesystem(&hugetlbfs_fs_type); out_free: kmem_cache_destroy(hugetlbfs_inode_cachep); out: return error; } fs_initcall(init_hugetlbfs_fs)
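To make the control flow above concrete, here is a hedged userspace sketch (not part of fs/hugetlbfs/inode.c): mmap() of a hugetlbfs file goes through hugetlbfs_file_mmap(), a plain fallocate() through hugetlbfs_fallocate(), and the PUNCH_HOLE call through hugetlbfs_punch_hole(). The /dev/hugepages mount point and the 2 MiB huge page size are assumptions about the local setup; the system also needs preallocated huge pages (e.g. via /proc/sys/vm/nr_hugepages), otherwise the preallocation step fails.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#define HPAGE_SIZE (2UL * 1024 * 1024)	/* assumed default huge page size */

int main(void)
{
	int fd = open("/dev/hugepages/demo", O_CREAT | O_RDWR, 0600);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* Preallocate two huge pages: hugetlbfs_fallocate(), default mode. */
	if (fallocate(fd, 0, 0, 2 * HPAGE_SIZE) < 0)
		perror("fallocate");

	/* Map and fault them in: hugetlbfs_file_mmap() + hugetlb fault path. */
	char *p = mmap(NULL, 2 * HPAGE_SIZE, PROT_READ | PROT_WRITE,
		       MAP_SHARED, fd, 0);
	if (p != MAP_FAILED) {
		memset(p, 0x5a, 2 * HPAGE_SIZE);
		munmap(p, 2 * HPAGE_SIZE);
	}

	/* Give the first huge page back: hugetlbfs_punch_hole(). */
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		      0, HPAGE_SIZE) < 0)
		perror("fallocate punch");

	close(fd);
	unlink("/dev/hugepages/demo");
	return 0;
}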
// SPDX-License-Identifier: GPL-2.0-only /* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ /* Kernel module implementing an IP set type: the hash:ip,mark type */ #include <linux/jhash.h> #include <linux/module.h> #include <linux/ip.h> #include <linux/skbuff.h> #include <linux/errno.h> #include <linux/random.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/netlink.h> #include <net/tcp.h> #include <linux/netfilter.h> #include <linux/netfilter/ipset/pfxlen.h> #include <linux/netfilter/ipset/ip_set.h> #include <linux/netfilter/ipset/ip_set_hash.h> #define IPSET_TYPE_REV_MIN 0 /* 1 Forceadd support */ /* 2 skbinfo support */ #define IPSET_TYPE_REV_MAX 3 /* bucketsize, initval support */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Vytas Dauksa <vytas.dauksa@smoothwall.net>"); IP_SET_MODULE_DESC("hash:ip,mark", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_hash:ip,mark"); /* Type specific function prefix */ #define HTYPE hash_ipmark #define IP_SET_HASH_WITH_MARKMASK /* IPv4 variant */ /* Member elements */ struct hash_ipmark4_elem { __be32 ip; __u32 mark; }; /* Common functions */ static bool hash_ipmark4_data_equal(const struct hash_ipmark4_elem *ip1, const struct hash_ipmark4_elem *ip2, u32 *multi) { return ip1->ip == ip2->ip && ip1->mark == ip2->mark; } static bool hash_ipmark4_data_list(struct sk_buff *skb, const struct hash_ipmark4_elem *data) { if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) || nla_put_net32(skb, IPSET_ATTR_MARK, htonl(data->mark))) goto nla_put_failure; return false; nla_put_failure: return true; } static void hash_ipmark4_data_next(struct hash_ipmark4_elem *next, const struct hash_ipmark4_elem *d) { next->ip = d->ip; } #define MTYPE hash_ipmark4 #define HOST_MASK 32 #include "ip_set_hash_gen.h" static int hash_ipmark4_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, enum ipset_adt adt, struct ip_set_adt_opt *opt) { const struct hash_ipmark4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipmark4_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); e.mark = skb->mark; e.mark &= h->markmask; ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip); return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int
hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { struct hash_ipmark4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipmark4_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 ip, ip_to = 0, i = 0; int ret; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); if (unlikely(!tb[IPSET_ATTR_IP] || !ip_set_attr_netorder(tb, IPSET_ATTR_MARK))) return -IPSET_ERR_PROTOCOL; ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip); if (ret) return ret; ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; e.mark = ntohl(nla_get_be32(tb[IPSET_ATTR_MARK])); e.mark &= h->markmask; if (e.mark == 0 && e.ip == 0) return -IPSET_ERR_HASH_ELEM; if (adt == IPSET_TEST || !(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR])) { ret = adtfn(set, &e, &ext, &ext, flags); return ip_set_eexist(ret, flags) ? 0 : ret; } ip_to = ip = ntohl(e.ip); if (tb[IPSET_ATTR_IP_TO]) { ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); if (ret) return ret; if (ip > ip_to) { if (e.mark == 0 && ip_to == 0) return -IPSET_ERR_HASH_ELEM; swap(ip, ip_to); } } else if (tb[IPSET_ATTR_CIDR]) { u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); if (!cidr || cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; ip_set_mask_from_to(ip, ip_to, cidr); } if (retried) ip = ntohl(h->next.ip); for (; ip <= ip_to; ip++, i++) { e.ip = htonl(ip); if (i > IPSET_MAX_RANGE) { hash_ipmark4_data_next(&h->next, &e); return -ERANGE; } ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; ret = 0; } return ret; } /* IPv6 variant */ struct hash_ipmark6_elem { union nf_inet_addr ip; __u32 mark; }; /* Common functions */ static bool hash_ipmark6_data_equal(const struct hash_ipmark6_elem *ip1, const struct hash_ipmark6_elem *ip2, u32 *multi) { return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6) && ip1->mark == ip2->mark; } static bool hash_ipmark6_data_list(struct sk_buff *skb, const struct hash_ipmark6_elem *data) { if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip.in6) || nla_put_net32(skb, IPSET_ATTR_MARK, htonl(data->mark))) goto nla_put_failure; return false; nla_put_failure: return true; } static void hash_ipmark6_data_next(struct hash_ipmark6_elem *next, const struct hash_ipmark6_elem *d) { } #undef MTYPE #undef HOST_MASK #define MTYPE hash_ipmark6 #define HOST_MASK 128 #define IP_SET_EMIT_CREATE #include "ip_set_hash_gen.h" static int hash_ipmark6_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, enum ipset_adt adt, struct ip_set_adt_opt *opt) { const struct hash_ipmark6 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipmark6_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); e.mark = skb->mark; e.mark &= h->markmask; ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_ipmark6_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { const struct hash_ipmark6 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipmark6_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); int ret; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); if (unlikely(!tb[IPSET_ATTR_IP] || !ip_set_attr_netorder(tb, IPSET_ATTR_MARK))) return -IPSET_ERR_PROTOCOL; if (unlikely(tb[IPSET_ATTR_IP_TO])) return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; if 
(unlikely(tb[IPSET_ATTR_CIDR])) { u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); if (cidr != HOST_MASK) return -IPSET_ERR_INVALID_CIDR; } ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip); if (ret) return ret; ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; e.mark = ntohl(nla_get_be32(tb[IPSET_ATTR_MARK])); e.mark &= h->markmask; if (adt == IPSET_TEST) { ret = adtfn(set, &e, &ext, &ext, flags); return ip_set_eexist(ret, flags) ? 0 : ret; } ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; return 0; } static struct ip_set_type hash_ipmark_type __read_mostly = { .name = "hash:ip,mark", .protocol = IPSET_PROTOCOL, .features = IPSET_TYPE_IP | IPSET_TYPE_MARK, .dimension = IPSET_DIM_TWO, .family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE, .create = hash_ipmark_create, .create_policy = { [IPSET_ATTR_MARKMASK] = { .type = NLA_U32 }, [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, [IPSET_ATTR_INITVAL] = { .type = NLA_U32 }, [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, }, .adt_policy = { [IPSET_ATTR_IP] = { .type = NLA_NESTED }, [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED }, [IPSET_ATTR_MARK] = { .type = NLA_U32 }, [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, }, .me = THIS_MODULE, }; static int __init hash_ipmark_init(void) { return ip_set_type_register(&hash_ipmark_type); } static void __exit hash_ipmark_fini(void) { rcu_barrier(); ip_set_type_unregister(&hash_ipmark_type); } module_init(hash_ipmark_init); module_exit(hash_ipmark_fini);
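As a small hedged illustration of the masking that hash_ipmark4_kadt()/hash_ipmark6_kadt() apply before the hash lookup (plain userspace C with made-up values; markmask stands for the mask given at set creation time via IPSET_ATTR_MARKMASK):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t markmask = 0x0000ff00;	/* assumed markmask chosen at set creation */
	uint32_t skb_mark = 0x1234ab00;	/* packet mark, e.g. set by a MARK rule */

	/* Same masking the kernel does: e.mark = skb->mark & h->markmask. */
	uint32_t lookup_mark = skb_mark & markmask;

	/* Prints 0x0000ab00: only the masked bits take part in the match. */
	printf("mark used for the hash:ip,mark lookup: 0x%08x\n",
	       (unsigned int)lookup_mark);
	return 0;
}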
// SPDX-License-Identifier: GPL-2.0-only /* * pcrypt - Parallel crypto wrapper. * * Copyright (C) 2009 secunet Security Networks AG * Copyright (C) 2009 Steffen Klassert <steffen.klassert@secunet.com> */ #include <crypto/algapi.h> #include <crypto/internal/aead.h> #include <linux/atomic.h> #include <linux/err.h> #include <linux/init.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/kobject.h> #include <linux/cpu.h> #include <crypto/pcrypt.h> static struct padata_instance *pencrypt; static struct padata_instance *pdecrypt; static struct kset *pcrypt_kset; struct pcrypt_instance_ctx { struct crypto_aead_spawn spawn; struct padata_shell *psenc; struct padata_shell *psdec; atomic_t tfm_count; }; struct pcrypt_aead_ctx { struct crypto_aead *child; unsigned int cb_cpu; }; static inline struct pcrypt_instance_ctx *pcrypt_tfm_ictx( struct crypto_aead *tfm) { return aead_instance_ctx(aead_alg_instance(tfm)); } static int pcrypt_aead_setkey(struct crypto_aead *parent, const u8 *key, unsigned int keylen) { struct pcrypt_aead_ctx *ctx = crypto_aead_ctx(parent); return crypto_aead_setkey(ctx->child, key, keylen); } static int pcrypt_aead_setauthsize(struct crypto_aead *parent, unsigned int authsize) { struct pcrypt_aead_ctx *ctx = crypto_aead_ctx(parent); return crypto_aead_setauthsize(ctx->child, authsize); } static void pcrypt_aead_serial(struct padata_priv *padata) { struct pcrypt_request *preq = pcrypt_padata_request(padata); struct aead_request *req = pcrypt_request_ctx(preq); aead_request_complete(req->base.data, padata->info); } static void pcrypt_aead_done(void *data, int err) { struct aead_request *req = data; struct pcrypt_request *preq = aead_request_ctx(req); struct padata_priv *padata = pcrypt_request_padata(preq); padata->info = err; padata_do_serial(padata); } static void pcrypt_aead_enc(struct padata_priv *padata) { struct pcrypt_request *preq = pcrypt_padata_request(padata); struct aead_request *req = pcrypt_request_ctx(preq); int ret; ret = crypto_aead_encrypt(req); if (ret == -EINPROGRESS) return; padata->info = ret;
padata_do_serial(padata); } static int pcrypt_aead_encrypt(struct aead_request *req) { int err; struct pcrypt_request *preq = aead_request_ctx(req); struct aead_request *creq = pcrypt_request_ctx(preq); struct padata_priv *padata = pcrypt_request_padata(preq); struct crypto_aead *aead = crypto_aead_reqtfm(req); struct pcrypt_aead_ctx *ctx = crypto_aead_ctx(aead); u32 flags = aead_request_flags(req); struct pcrypt_instance_ctx *ictx; ictx = pcrypt_tfm_ictx(aead); memset(padata, 0, sizeof(struct padata_priv)); padata->parallel = pcrypt_aead_enc; padata->serial = pcrypt_aead_serial; aead_request_set_tfm(creq, ctx->child); aead_request_set_callback(creq, flags & ~CRYPTO_TFM_REQ_MAY_SLEEP, pcrypt_aead_done, req); aead_request_set_crypt(creq, req->src, req->dst, req->cryptlen, req->iv); aead_request_set_ad(creq, req->assoclen); err = padata_do_parallel(ictx->psenc, padata, &ctx->cb_cpu); if (!err) return -EINPROGRESS; if (err == -EBUSY) { /* try non-parallel mode */ return crypto_aead_encrypt(creq); } return err; } static void pcrypt_aead_dec(struct padata_priv *padata) { struct pcrypt_request *preq = pcrypt_padata_request(padata); struct aead_request *req = pcrypt_request_ctx(preq); int ret; ret = crypto_aead_decrypt(req); if (ret == -EINPROGRESS) return; padata->info = ret; padata_do_serial(padata); } static int pcrypt_aead_decrypt(struct aead_request *req) { int err; struct pcrypt_request *preq = aead_request_ctx(req); struct aead_request *creq = pcrypt_request_ctx(preq); struct padata_priv *padata = pcrypt_request_padata(preq); struct crypto_aead *aead = crypto_aead_reqtfm(req); struct pcrypt_aead_ctx *ctx = crypto_aead_ctx(aead); u32 flags = aead_request_flags(req); struct pcrypt_instance_ctx *ictx; ictx = pcrypt_tfm_ictx(aead); memset(padata, 0, sizeof(struct padata_priv)); padata->parallel = pcrypt_aead_dec; padata->serial = pcrypt_aead_serial; aead_request_set_tfm(creq, ctx->child); aead_request_set_callback(creq, flags & ~CRYPTO_TFM_REQ_MAY_SLEEP, pcrypt_aead_done, req); aead_request_set_crypt(creq, req->src, req->dst, req->cryptlen, req->iv); aead_request_set_ad(creq, req->assoclen); err = padata_do_parallel(ictx->psdec, padata, &ctx->cb_cpu); if (!err) return -EINPROGRESS; if (err == -EBUSY) { /* try non-parallel mode */ return crypto_aead_decrypt(creq); } return err; } static int pcrypt_aead_init_tfm(struct crypto_aead *tfm) { int cpu, cpu_index; struct aead_instance *inst = aead_alg_instance(tfm); struct pcrypt_instance_ctx *ictx = aead_instance_ctx(inst); struct pcrypt_aead_ctx *ctx = crypto_aead_ctx(tfm); struct crypto_aead *cipher; cpu_index = (unsigned int)atomic_inc_return(&ictx->tfm_count) % cpumask_weight(cpu_online_mask); ctx->cb_cpu = cpumask_first(cpu_online_mask); for (cpu = 0; cpu < cpu_index; cpu++) ctx->cb_cpu = cpumask_next(ctx->cb_cpu, cpu_online_mask); cipher = crypto_spawn_aead(&ictx->spawn); if (IS_ERR(cipher)) return PTR_ERR(cipher); ctx->child = cipher; crypto_aead_set_reqsize(tfm, sizeof(struct pcrypt_request) + sizeof(struct aead_request) + crypto_aead_reqsize(cipher)); return 0; } static void pcrypt_aead_exit_tfm(struct crypto_aead *tfm) { struct pcrypt_aead_ctx *ctx = crypto_aead_ctx(tfm); crypto_free_aead(ctx->child); } static void pcrypt_free(struct aead_instance *inst) { struct pcrypt_instance_ctx *ctx = aead_instance_ctx(inst); crypto_drop_aead(&ctx->spawn); padata_free_shell(ctx->psdec); padata_free_shell(ctx->psenc); kfree(inst); } static int pcrypt_init_instance(struct crypto_instance *inst, struct crypto_alg *alg) { if 
(snprintf(inst->alg.cra_driver_name, CRYPTO_MAX_ALG_NAME, "pcrypt(%s)", alg->cra_driver_name) >= CRYPTO_MAX_ALG_NAME) return -ENAMETOOLONG; memcpy(inst->alg.cra_name, alg->cra_name, CRYPTO_MAX_ALG_NAME); inst->alg.cra_priority = alg->cra_priority + 100; inst->alg.cra_blocksize = alg->cra_blocksize; inst->alg.cra_alignmask = alg->cra_alignmask; return 0; } static int pcrypt_create_aead(struct crypto_template *tmpl, struct rtattr **tb, struct crypto_attr_type *algt) { struct pcrypt_instance_ctx *ctx; struct aead_instance *inst; struct aead_alg *alg; u32 mask = crypto_algt_inherited_mask(algt); int err; inst = kzalloc(sizeof(*inst) + sizeof(*ctx), GFP_KERNEL); if (!inst) return -ENOMEM; err = -ENOMEM; ctx = aead_instance_ctx(inst); ctx->psenc = padata_alloc_shell(pencrypt); if (!ctx->psenc) goto err_free_inst; ctx->psdec = padata_alloc_shell(pdecrypt); if (!ctx->psdec) goto err_free_inst; err = crypto_grab_aead(&ctx->spawn, aead_crypto_instance(inst), crypto_attr_alg_name(tb[1]), 0, mask); if (err) goto err_free_inst; alg = crypto_spawn_aead_alg(&ctx->spawn); err = pcrypt_init_instance(aead_crypto_instance(inst), &alg->base); if (err) goto err_free_inst; inst->alg.base.cra_flags |= CRYPTO_ALG_ASYNC; inst->alg.ivsize = crypto_aead_alg_ivsize(alg); inst->alg.maxauthsize = crypto_aead_alg_maxauthsize(alg); inst->alg.base.cra_ctxsize = sizeof(struct pcrypt_aead_ctx); inst->alg.init = pcrypt_aead_init_tfm; inst->alg.exit = pcrypt_aead_exit_tfm; inst->alg.setkey = pcrypt_aead_setkey; inst->alg.setauthsize = pcrypt_aead_setauthsize; inst->alg.encrypt = pcrypt_aead_encrypt; inst->alg.decrypt = pcrypt_aead_decrypt; inst->free = pcrypt_free; err = aead_register_instance(tmpl, inst); if (err) { err_free_inst: pcrypt_free(inst); } return err; } static int pcrypt_create(struct crypto_template *tmpl, struct rtattr **tb) { struct crypto_attr_type *algt; algt = crypto_get_attr_type(tb); if (IS_ERR(algt)) return PTR_ERR(algt); switch (algt->type & algt->mask & CRYPTO_ALG_TYPE_MASK) { case CRYPTO_ALG_TYPE_AEAD: return pcrypt_create_aead(tmpl, tb, algt); } return -EINVAL; } static int pcrypt_sysfs_add(struct padata_instance *pinst, const char *name) { int ret; pinst->kobj.kset = pcrypt_kset; ret = kobject_add(&pinst->kobj, NULL, "%s", name); if (!ret) kobject_uevent(&pinst->kobj, KOBJ_ADD); return ret; } static int pcrypt_init_padata(struct padata_instance **pinst, const char *name) { int ret = -ENOMEM; *pinst = padata_alloc(name); if (!*pinst) return ret; ret = pcrypt_sysfs_add(*pinst, name); if (ret) padata_free(*pinst); return ret; } static struct crypto_template pcrypt_tmpl = { .name = "pcrypt", .create = pcrypt_create, .module = THIS_MODULE, }; static int __init pcrypt_init(void) { int err = -ENOMEM; pcrypt_kset = kset_create_and_add("pcrypt", NULL, kernel_kobj); if (!pcrypt_kset) goto err; err = pcrypt_init_padata(&pencrypt, "pencrypt"); if (err) goto err_unreg_kset; err = pcrypt_init_padata(&pdecrypt, "pdecrypt"); if (err) goto err_deinit_pencrypt; return crypto_register_template(&pcrypt_tmpl); err_deinit_pencrypt: padata_free(pencrypt); err_unreg_kset: kset_unregister(pcrypt_kset); err: return err; } static void __exit pcrypt_exit(void) { crypto_unregister_template(&pcrypt_tmpl); padata_free(pencrypt); padata_free(pdecrypt); kset_unregister(pcrypt_kset); } subsys_initcall(pcrypt_init); module_exit(pcrypt_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Steffen Klassert <steffen.klassert@secunet.com>"); MODULE_DESCRIPTION("Parallel crypto wrapper"); MODULE_ALIAS_CRYPTO("pcrypt");
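Usage sketch (illustrative, not part of pcrypt.c): the template registered above is instantiated by algorithm name, so a crypto API user requests "pcrypt(<inner aead>)" and pcrypt_create_aead() wraps the inner algorithm with the pencrypt/pdecrypt padata instances. A minimal caller might look like the hypothetical module below; it assumes pcrypt and gcm(aes) are available in the running kernel and keeps error handling to a minimum.

/* Hypothetical demo module: allocate an AEAD through the pcrypt template. */
#include <crypto/aead.h>
#include <linux/err.h>
#include <linux/kernel.h>
#include <linux/module.h>

static struct crypto_aead *demo_tfm;

static int __init pcrypt_demo_init(void)
{
	/* "pcrypt(gcm(aes))" asks pcrypt_create() to wrap gcm(aes); encrypt
	 * and decrypt requests on this tfm are then spread over the
	 * pencrypt and pdecrypt padata instances. */
	demo_tfm = crypto_alloc_aead("pcrypt(gcm(aes))", 0, 0);
	if (IS_ERR(demo_tfm))
		return PTR_ERR(demo_tfm);

	pr_info("pcrypt demo: using %s\n",
		crypto_tfm_alg_driver_name(crypto_aead_tfm(demo_tfm)));
	return 0;
}

static void __exit pcrypt_demo_exit(void)
{
	crypto_free_aead(demo_tfm);
}

module_init(pcrypt_demo_init);
module_exit(pcrypt_demo_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Illustrative pcrypt user");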
// SPDX-License-Identifier: GPL-2.0 #include <net/ip.h> #include <net/udp.h> #include <net/udplite.h> #include <asm/checksum.h> #ifndef _HAVE_ARCH_IPV6_CSUM __sum16 csum_ipv6_magic(const struct in6_addr *saddr, const struct in6_addr *daddr, __u32 len, __u8 proto, __wsum csum) { int carry; __u32 ulen; __u32 uproto; __u32 sum = (__force u32)csum; sum += (__force u32)saddr->s6_addr32[0]; carry = (sum < (__force u32)saddr->s6_addr32[0]); sum += carry; sum += (__force u32)saddr->s6_addr32[1]; carry = (sum < (__force u32)saddr->s6_addr32[1]); sum += carry; sum += (__force u32)saddr->s6_addr32[2]; carry = (sum < (__force u32)saddr->s6_addr32[2]); sum += carry; sum += (__force u32)saddr->s6_addr32[3]; carry = (sum < (__force u32)saddr->s6_addr32[3]); sum += carry; sum += (__force u32)daddr->s6_addr32[0]; carry = (sum < (__force u32)daddr->s6_addr32[0]); sum += carry; sum += (__force u32)daddr->s6_addr32[1]; carry = (sum < (__force u32)daddr->s6_addr32[1]); sum += carry; sum += (__force u32)daddr->s6_addr32[2]; carry = (sum < (__force u32)daddr->s6_addr32[2]); sum += carry; sum += (__force u32)daddr->s6_addr32[3]; carry = (sum < (__force u32)daddr->s6_addr32[3]); sum += carry; ulen = (__force u32)htonl((__u32) len); sum += ulen; carry = (sum < ulen); sum += carry; uproto = (__force u32)htonl(proto); sum += uproto; carry = (sum < uproto); sum += carry; return csum_fold((__force __wsum)sum); } EXPORT_SYMBOL(csum_ipv6_magic); #endif int udp6_csum_init(struct sk_buff *skb, struct udphdr *uh, int proto) { int err; UDP_SKB_CB(skb)->partial_cov = 0; UDP_SKB_CB(skb)->cscov = skb->len; if (proto == IPPROTO_UDPLITE) { err = udplite_checksum_init(skb, uh); if (err) return err; if (UDP_SKB_CB(skb)->partial_cov) { skb->csum = ip6_compute_pseudo(skb, proto); return 0; } } /* To support RFC 6936 (allow zero checksum in UDP/IPV6 for tunnels) * we accept a checksum of zero here. When we find the socket * for the UDP packet we'll check if that socket allows zero checksum * for IPv6 (set by socket option). * * Note, we are only interested in != 0 or == 0, thus the * force to int. */ err = (__force int)skb_checksum_init_zero_check(skb, proto, uh->check, ip6_compute_pseudo); if (err) return err; if (skb->ip_summed == CHECKSUM_COMPLETE && !skb->csum_valid) { /* If SW calculated the value, we know it's bad */ if (skb->csum_complete_sw) return 1; /* HW says the value is bad. Let's validate that. * skb->csum is no longer the full packet checksum, * so don't treat it as such. */ skb_checksum_complete_unset(skb); } return 0; } EXPORT_SYMBOL(udp6_csum_init); /* Function to set UDP checksum for an IPv6 UDP packet. This is intended * for the simple case like when setting the checksum for a UDP tunnel.
*/ void udp6_set_csum(bool nocheck, struct sk_buff *skb, const struct in6_addr *saddr, const struct in6_addr *daddr, int len) { struct udphdr *uh = udp_hdr(skb); if (nocheck) uh->check = 0; else if (skb_is_gso(skb)) uh->check = ~udp_v6_check(len, saddr, daddr, 0); else if (skb->ip_summed == CHECKSUM_PARTIAL) { uh->check = 0; uh->check = udp_v6_check(len, saddr, daddr, lco_csum(skb)); if (uh->check == 0) uh->check = CSUM_MANGLED_0; } else { skb->ip_summed = CHECKSUM_PARTIAL; skb->csum_start = skb_transport_header(skb) - skb->head; skb->csum_offset = offsetof(struct udphdr, check); uh->check = ~udp_v6_check(len, saddr, daddr, 0); } } EXPORT_SYMBOL(udp6_set_csum);
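Editor's sketch: the unrolled additions in csum_ipv6_magic() are plain ones'-complement arithmetic — each 32-bit word of the pseudo-header is accumulated with an end-around carry, and csum_fold() then compresses the accumulator to 16 bits and complements it. The userspace helpers below mirror that arithmetic; they are illustrative stand-ins, not the kernel's (often assembly-optimised) csum_fold().

/* Illustrative ones'-complement helpers (userspace, not kernel code). */
#include <stdint.h>
#include <stdio.h>

static uint32_t add32_with_carry(uint32_t sum, uint32_t addend)
{
	sum += addend;
	return sum + (sum < addend);	/* end-around carry, as in csum_ipv6_magic() */
}

static uint16_t fold_to_16(uint32_t sum)
{
	sum = (sum & 0xffff) + (sum >> 16);	/* fold high half into low half */
	sum = (sum & 0xffff) + (sum >> 16);	/* absorb any carry from the fold */
	return (uint16_t)~sum;			/* final complement, like csum_fold() */
}

int main(void)
{
	uint32_t sum = 0;

	sum = add32_with_carry(sum, 0x12345678);
	sum = add32_with_carry(sum, 0x9abcdef0);
	printf("0x%04x\n", fold_to_16(sum));
	return 0;
}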
// SPDX-License-Identifier: GPL-2.0-or-later /* * IPV6 GSO/GRO offload support * Linux INET6 implementation */ #include <linux/kernel.h> #include <linux/socket.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/printk.h> #include <net/protocol.h> #include <net/ipv6.h> #include <net/inet_common.h> #include <net/tcp.h> #include <net/udp.h> #include <net/gro.h> #include <net/gso.h> #include "ip6_offload.h" /* All GRO functions are always builtin, except UDP over ipv6, which lives in * the ipv6 module, as it depends on the UDPv6 lookup function, so we need special care * when ipv6 is built as a module */ #if IS_BUILTIN(CONFIG_IPV6) #define INDIRECT_CALL_L4(f, f2, f1, ...) INDIRECT_CALL_2(f, f2, f1, __VA_ARGS__) #else #define INDIRECT_CALL_L4(f, f2, f1, ...) INDIRECT_CALL_1(f, f2, __VA_ARGS__) #endif #define indirect_call_gro_receive_l4(f2, f1, cb, head, skb) \ ({ \ unlikely(gro_recursion_inc_test(skb)) ?
\ NAPI_GRO_CB(skb)->flush |= 1, NULL : \ INDIRECT_CALL_L4(cb, f2, f1, head, skb); \ }) static int ipv6_gro_pull_exthdrs(struct sk_buff *skb, int off, int proto) { const struct net_offload *ops = NULL; struct ipv6_opt_hdr *opth; for (;;) { int len; ops = rcu_dereference(inet6_offloads[proto]); if (unlikely(!ops)) break; if (!(ops->flags & INET6_PROTO_GSO_EXTHDR)) break; opth = skb_gro_header(skb, off + sizeof(*opth), off); if (unlikely(!opth)) break; len = ipv6_optlen(opth); opth = skb_gro_header(skb, off + len, off); if (unlikely(!opth)) break; proto = opth->nexthdr; off += len; } skb_gro_pull(skb, off - skb_gro_receive_network_offset(skb)); return proto; } static int ipv6_gso_pull_exthdrs(struct sk_buff *skb, int proto) { const struct net_offload *ops = NULL; for (;;) { struct ipv6_opt_hdr *opth; int len; ops = rcu_dereference(inet6_offloads[proto]); if (unlikely(!ops)) break; if (!(ops->flags & INET6_PROTO_GSO_EXTHDR)) break; if (unlikely(!pskb_may_pull(skb, 8))) break; opth = (void *)skb->data; len = ipv6_optlen(opth); if (unlikely(!pskb_may_pull(skb, len))) break; opth = (void *)skb->data; proto = opth->nexthdr; __skb_pull(skb, len); } return proto; } static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, netdev_features_t features) { struct sk_buff *segs = ERR_PTR(-EINVAL); struct ipv6hdr *ipv6h; const struct net_offload *ops; int proto, err; struct frag_hdr *fptr; unsigned int payload_len; u8 *prevhdr; int offset = 0; bool encap, udpfrag; int nhoff; bool gso_partial; skb_reset_network_header(skb); err = ipv6_hopopt_jumbo_remove(skb); if (err) return ERR_PTR(err); nhoff = skb_network_header(skb) - skb_mac_header(skb); if (unlikely(!pskb_may_pull(skb, sizeof(*ipv6h)))) goto out; encap = SKB_GSO_CB(skb)->encap_level > 0; if (encap) features &= skb->dev->hw_enc_features; SKB_GSO_CB(skb)->encap_level += sizeof(*ipv6h); ipv6h = ipv6_hdr(skb); __skb_pull(skb, sizeof(*ipv6h)); segs = ERR_PTR(-EPROTONOSUPPORT); proto = ipv6_gso_pull_exthdrs(skb, ipv6h->nexthdr); if (skb->encapsulation && skb_shinfo(skb)->gso_type & (SKB_GSO_IPXIP4 | SKB_GSO_IPXIP6)) udpfrag = proto == IPPROTO_UDP && encap && (skb_shinfo(skb)->gso_type & SKB_GSO_UDP); else udpfrag = proto == IPPROTO_UDP && !skb->encapsulation && (skb_shinfo(skb)->gso_type & SKB_GSO_UDP); ops = rcu_dereference(inet6_offloads[proto]); if (likely(ops && ops->callbacks.gso_segment)) { skb_reset_transport_header(skb); segs = ops->callbacks.gso_segment(skb, features); if (!segs) skb->network_header = skb_mac_header(skb) + nhoff - skb->head; } if (IS_ERR_OR_NULL(segs)) goto out; gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL); for (skb = segs; skb; skb = skb->next) { ipv6h = (struct ipv6hdr *)(skb_mac_header(skb) + nhoff); if (gso_partial && skb_is_gso(skb)) payload_len = skb_shinfo(skb)->gso_size + SKB_GSO_CB(skb)->data_offset + skb->head - (unsigned char *)(ipv6h + 1); else payload_len = skb->len - nhoff - sizeof(*ipv6h); ipv6h->payload_len = htons(payload_len); skb->network_header = (u8 *)ipv6h - skb->head; skb_reset_mac_len(skb); if (udpfrag) { int err = ip6_find_1stfragopt(skb, &prevhdr); if (err < 0) { kfree_skb_list(segs); return ERR_PTR(err); } fptr = (struct frag_hdr *)((u8 *)ipv6h + err); fptr->frag_off = htons(offset); if (skb->next) fptr->frag_off |= htons(IP6_MF); offset += (ntohs(ipv6h->payload_len) - sizeof(struct frag_hdr)); } if (encap) skb_reset_inner_headers(skb); } out: return segs; } /* Return the total length of all the extension hdrs, following the same * logic in ipv6_gso_pull_exthdrs() when parsing 
ext-hdrs. */ static int ipv6_exthdrs_len(struct ipv6hdr *iph, const struct net_offload **opps) { struct ipv6_opt_hdr *opth = (void *)iph; int len = 0, proto, optlen = sizeof(*iph); proto = iph->nexthdr; for (;;) { *opps = rcu_dereference(inet6_offloads[proto]); if (unlikely(!(*opps))) break; if (!((*opps)->flags & INET6_PROTO_GSO_EXTHDR)) break; opth = (void *)opth + optlen; optlen = ipv6_optlen(opth); len += optlen; proto = opth->nexthdr; } return len; } INDIRECT_CALLABLE_SCOPE struct sk_buff *ipv6_gro_receive(struct list_head *head, struct sk_buff *skb) { const struct net_offload *ops; struct sk_buff *pp = NULL; struct sk_buff *p; struct ipv6hdr *iph; unsigned int nlen; unsigned int hlen; unsigned int off; u16 flush = 1; int proto; off = skb_gro_offset(skb); hlen = off + sizeof(*iph); iph = skb_gro_header(skb, hlen, off); if (unlikely(!iph)) goto out; NAPI_GRO_CB(skb)->network_offsets[NAPI_GRO_CB(skb)->encap_mark] = off; flush += ntohs(iph->payload_len) != skb->len - hlen; proto = iph->nexthdr; ops = rcu_dereference(inet6_offloads[proto]); if (!ops || !ops->callbacks.gro_receive) { proto = ipv6_gro_pull_exthdrs(skb, hlen, proto); ops = rcu_dereference(inet6_offloads[proto]); if (!ops || !ops->callbacks.gro_receive) goto out; iph = skb_gro_network_header(skb); } else { skb_gro_pull(skb, sizeof(*iph)); } skb_set_transport_header(skb, skb_gro_offset(skb)); NAPI_GRO_CB(skb)->proto = proto; flush--; nlen = skb_gro_offset(skb) - off; list_for_each_entry(p, head, list) { const struct ipv6hdr *iph2; __be32 first_word; /* <Version:4><Traffic_Class:8><Flow_Label:20> */ if (!NAPI_GRO_CB(p)->same_flow) continue; iph2 = (struct ipv6hdr *)(p->data + off); first_word = *(__be32 *)iph ^ *(__be32 *)iph2; /* All fields must match except length and Traffic Class. * XXX skbs on the gro_list have all been parsed and pulled * already so we don't need to compare nlen * (nlen != (sizeof(*iph2) + ipv6_exthdrs_len(iph2, &ops))) * memcmp() alone below is sufficient, right? 
*/ if ((first_word & htonl(0xF00FFFFF)) || !ipv6_addr_equal(&iph->saddr, &iph2->saddr) || !ipv6_addr_equal(&iph->daddr, &iph2->daddr) || iph->nexthdr != iph2->nexthdr) { not_same_flow: NAPI_GRO_CB(p)->same_flow = 0; continue; } if (unlikely(nlen > sizeof(struct ipv6hdr))) { if (memcmp(iph + 1, iph2 + 1, nlen - sizeof(struct ipv6hdr))) goto not_same_flow; } } NAPI_GRO_CB(skb)->flush |= flush; skb_gro_postpull_rcsum(skb, iph, nlen); pp = indirect_call_gro_receive_l4(tcp6_gro_receive, udp6_gro_receive, ops->callbacks.gro_receive, head, skb); out: skb_gro_flush_final(skb, pp, flush); return pp; } static struct sk_buff *sit_ip6ip6_gro_receive(struct list_head *head, struct sk_buff *skb) { /* Common GRO receive for SIT and IP6IP6 */ if (NAPI_GRO_CB(skb)->encap_mark) { NAPI_GRO_CB(skb)->flush = 1; return NULL; } NAPI_GRO_CB(skb)->encap_mark = 1; return ipv6_gro_receive(head, skb); } static struct sk_buff *ip4ip6_gro_receive(struct list_head *head, struct sk_buff *skb) { /* Common GRO receive for SIT and IP6IP6 */ if (NAPI_GRO_CB(skb)->encap_mark) { NAPI_GRO_CB(skb)->flush = 1; return NULL; } NAPI_GRO_CB(skb)->encap_mark = 1; return inet_gro_receive(head, skb); } INDIRECT_CALLABLE_SCOPE int ipv6_gro_complete(struct sk_buff *skb, int nhoff) { const struct net_offload *ops; struct ipv6hdr *iph; int err = -ENOSYS; u32 payload_len; if (skb->encapsulation) { skb_set_inner_protocol(skb, cpu_to_be16(ETH_P_IPV6)); skb_set_inner_network_header(skb, nhoff); } payload_len = skb->len - nhoff - sizeof(*iph); if (unlikely(payload_len > IPV6_MAXPLEN)) { struct hop_jumbo_hdr *hop_jumbo; int hoplen = sizeof(*hop_jumbo); /* Move network header left */ memmove(skb_mac_header(skb) - hoplen, skb_mac_header(skb), skb->transport_header - skb->mac_header); skb->data -= hoplen; skb->len += hoplen; skb->mac_header -= hoplen; skb->network_header -= hoplen; iph = (struct ipv6hdr *)(skb->data + nhoff); hop_jumbo = (struct hop_jumbo_hdr *)(iph + 1); /* Build hop-by-hop options */ hop_jumbo->nexthdr = iph->nexthdr; hop_jumbo->hdrlen = 0; hop_jumbo->tlv_type = IPV6_TLV_JUMBO; hop_jumbo->tlv_len = 4; hop_jumbo->jumbo_payload_len = htonl(payload_len + hoplen); iph->nexthdr = NEXTHDR_HOP; iph->payload_len = 0; } else { iph = (struct ipv6hdr *)(skb->data + nhoff); iph->payload_len = htons(payload_len); } nhoff += sizeof(*iph) + ipv6_exthdrs_len(iph, &ops); if (WARN_ON(!ops || !ops->callbacks.gro_complete)) goto out; err = INDIRECT_CALL_L4(ops->callbacks.gro_complete, tcp6_gro_complete, udp6_gro_complete, skb, nhoff); out: return err; } static int sit_gro_complete(struct sk_buff *skb, int nhoff) { skb->encapsulation = 1; skb_shinfo(skb)->gso_type |= SKB_GSO_IPXIP4; return ipv6_gro_complete(skb, nhoff); } static int ip6ip6_gro_complete(struct sk_buff *skb, int nhoff) { skb->encapsulation = 1; skb_shinfo(skb)->gso_type |= SKB_GSO_IPXIP6; return ipv6_gro_complete(skb, nhoff); } static int ip4ip6_gro_complete(struct sk_buff *skb, int nhoff) { skb->encapsulation = 1; skb_shinfo(skb)->gso_type |= SKB_GSO_IPXIP6; return inet_gro_complete(skb, nhoff); } static struct sk_buff *sit_gso_segment(struct sk_buff *skb, netdev_features_t features) { if (!(skb_shinfo(skb)->gso_type & SKB_GSO_IPXIP4)) return ERR_PTR(-EINVAL); return ipv6_gso_segment(skb, features); } static struct sk_buff *ip4ip6_gso_segment(struct sk_buff *skb, netdev_features_t features) { if (!(skb_shinfo(skb)->gso_type & SKB_GSO_IPXIP6)) return ERR_PTR(-EINVAL); return inet_gso_segment(skb, features); } static struct sk_buff *ip6ip6_gso_segment(struct sk_buff *skb, 
netdev_features_t features) { if (!(skb_shinfo(skb)->gso_type & SKB_GSO_IPXIP6)) return ERR_PTR(-EINVAL); return ipv6_gso_segment(skb, features); } static const struct net_offload sit_offload = { .callbacks = { .gso_segment = sit_gso_segment, .gro_receive = sit_ip6ip6_gro_receive, .gro_complete = sit_gro_complete, }, }; static const struct net_offload ip4ip6_offload = { .callbacks = { .gso_segment = ip4ip6_gso_segment, .gro_receive = ip4ip6_gro_receive, .gro_complete = ip4ip6_gro_complete, }, }; static const struct net_offload ip6ip6_offload = { .callbacks = { .gso_segment = ip6ip6_gso_segment, .gro_receive = sit_ip6ip6_gro_receive, .gro_complete = ip6ip6_gro_complete, }, }; static int __init ipv6_offload_init(void) { if (tcpv6_offload_init() < 0) pr_crit("%s: Cannot add TCP protocol offload\n", __func__); if (ipv6_exthdrs_offload_init() < 0) pr_crit("%s: Cannot add EXTHDRS protocol offload\n", __func__); net_hotdata.ipv6_packet_offload = (struct packet_offload) { .type = cpu_to_be16(ETH_P_IPV6), .callbacks = { .gso_segment = ipv6_gso_segment, .gro_receive = ipv6_gro_receive, .gro_complete = ipv6_gro_complete, }, }; dev_add_offload(&net_hotdata.ipv6_packet_offload); inet_add_offload(&sit_offload, IPPROTO_IPV6); inet6_add_offload(&ip6ip6_offload, IPPROTO_IPV6); inet6_add_offload(&ip4ip6_offload, IPPROTO_IPIP); return 0; } fs_initcall(ipv6_offload_init);
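Editor's sketch: both extension-header walkers above (ipv6_gso_pull_exthdrs() and ipv6_exthdrs_len()) rely on the IPv6 rule that an option header advertises its size in 8-octet units, excluding the first 8 octets, which is what ipv6_optlen() evaluates as (hdrlen + 1) * 8. The tiny standalone illustration below shows that rule; the struct is a stand-in, not the kernel's struct ipv6_opt_hdr.

/* Stand-in for struct ipv6_opt_hdr, for illustration only. */
#include <stdint.h>
#include <stdio.h>

struct opt_hdr_demo {
	uint8_t nexthdr;	/* protocol of the following header */
	uint8_t hdrlen;		/* length in 8-octet units, not counting the first 8 */
	/* option data follows in a real header */
};

static unsigned int optlen_bytes(const struct opt_hdr_demo *opth)
{
	return (opth->hdrlen + 1) * 8;	/* same arithmetic as ipv6_optlen() */
}

int main(void)
{
	struct opt_hdr_demo hbh = { .nexthdr = 6 /* TCP */, .hdrlen = 1 };

	printf("%u\n", optlen_bytes(&hbh));	/* prints 16 */
	return 0;
}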
// SPDX-License-Identifier: GPL-2.0-or-later /* * IPv6 BSD socket options interface * Linux
INET6 implementation * * Authors: * Pedro Roque <roque@di.fc.ul.pt> * * Based on linux/net/ipv4/ip_sockglue.c * * FIXME: Make the setsockopt code POSIX compliant: That is * * o Truncate getsockopt returns * o Return an optlen of the truncated length if need be * * Changes: * David L Stevens <dlstevens@us.ibm.com>: * - added multicast source filtering API for MLDv2 */ #include <linux/module.h> #include <linux/capability.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/in6.h> #include <linux/mroute6.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/init.h> #include <linux/sysctl.h> #include <linux/netfilter.h> #include <linux/slab.h> #include <net/sock.h> #include <net/snmp.h> #include <net/ipv6.h> #include <net/ndisc.h> #include <net/protocol.h> #include <net/transp_v6.h> #include <net/ip6_route.h> #include <net/addrconf.h> #include <net/inet_common.h> #include <net/tcp.h> #include <net/udp.h> #include <net/udplite.h> #include <net/xfrm.h> #include <net/compat.h> #include <net/seg6.h> #include <linux/uaccess.h> struct ip6_ra_chain *ip6_ra_chain; DEFINE_RWLOCK(ip6_ra_lock); DEFINE_STATIC_KEY_FALSE(ip6_min_hopcount); int ip6_ra_control(struct sock *sk, int sel) { struct ip6_ra_chain *ra, *new_ra, **rap; /* RA packet may be delivered ONLY to IPPROTO_RAW socket */ if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num != IPPROTO_RAW) return -ENOPROTOOPT; new_ra = (sel >= 0) ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL; if (sel >= 0 && !new_ra) return -ENOMEM; write_lock_bh(&ip6_ra_lock); for (rap = &ip6_ra_chain; (ra = *rap) != NULL; rap = &ra->next) { if (ra->sk == sk) { if (sel >= 0) { write_unlock_bh(&ip6_ra_lock); kfree(new_ra); return -EADDRINUSE; } *rap = ra->next; write_unlock_bh(&ip6_ra_lock); sock_put(sk); kfree(ra); return 0; } } if (!new_ra) { write_unlock_bh(&ip6_ra_lock); return -ENOBUFS; } new_ra->sk = sk; new_ra->sel = sel; new_ra->next = ra; *rap = new_ra; sock_hold(sk); write_unlock_bh(&ip6_ra_lock); return 0; } struct ipv6_txoptions *ipv6_update_options(struct sock *sk, struct ipv6_txoptions *opt) { if (inet_test_bit(IS_ICSK, sk)) { if (opt && !((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) && inet_sk(sk)->inet_daddr != LOOPBACK4_IPV6) { struct inet_connection_sock *icsk = inet_csk(sk); icsk->icsk_ext_hdr_len = opt->opt_flen + opt->opt_nflen; icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie); } } opt = unrcu_pointer(xchg(&inet6_sk(sk)->opt, RCU_INITIALIZER(opt))); sk_dst_reset(sk); return opt; } static bool setsockopt_needs_rtnl(int optname) { switch (optname) { case IPV6_ADDRFORM: case IPV6_ADD_MEMBERSHIP: case IPV6_DROP_MEMBERSHIP: case IPV6_JOIN_ANYCAST: case IPV6_LEAVE_ANYCAST: case MCAST_JOIN_GROUP: case MCAST_LEAVE_GROUP: case MCAST_JOIN_SOURCE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: case MCAST_MSFILTER: return true; } return false; } static int copy_group_source_from_sockptr(struct group_source_req *greqs, sockptr_t optval, int optlen) { if (in_compat_syscall()) { struct compat_group_source_req gr32; if (optlen < sizeof(gr32)) return -EINVAL; if (copy_from_sockptr(&gr32, optval, sizeof(gr32))) return -EFAULT; greqs->gsr_interface = gr32.gsr_interface; greqs->gsr_group = gr32.gsr_group; greqs->gsr_source = gr32.gsr_source; } else { if (optlen < sizeof(*greqs)) return -EINVAL; if (copy_from_sockptr(greqs, optval, sizeof(*greqs))) return -EFAULT; } return 0; } static int do_ipv6_mcast_group_source(struct 
sock *sk, int optname, sockptr_t optval, int optlen) { struct group_source_req greqs; int omode, add; int ret; ret = copy_group_source_from_sockptr(&greqs, optval, optlen); if (ret) return ret; if (greqs.gsr_group.ss_family != AF_INET6 || greqs.gsr_source.ss_family != AF_INET6) return -EADDRNOTAVAIL; if (optname == MCAST_BLOCK_SOURCE) { omode = MCAST_EXCLUDE; add = 1; } else if (optname == MCAST_UNBLOCK_SOURCE) { omode = MCAST_EXCLUDE; add = 0; } else if (optname == MCAST_JOIN_SOURCE_GROUP) { struct sockaddr_in6 *psin6; int retv; psin6 = (struct sockaddr_in6 *)&greqs.gsr_group; retv = ipv6_sock_mc_join_ssm(sk, greqs.gsr_interface, &psin6->sin6_addr, MCAST_INCLUDE); /* prior join w/ different source is ok */ if (retv && retv != -EADDRINUSE) return retv; omode = MCAST_INCLUDE; add = 1; } else /* MCAST_LEAVE_SOURCE_GROUP */ { omode = MCAST_INCLUDE; add = 0; } return ip6_mc_source(add, omode, sk, &greqs); } static int ipv6_set_mcast_msfilter(struct sock *sk, sockptr_t optval, int optlen) { struct group_filter *gsf; int ret; if (optlen < GROUP_FILTER_SIZE(0)) return -EINVAL; if (optlen > READ_ONCE(sock_net(sk)->core.sysctl_optmem_max)) return -ENOBUFS; gsf = memdup_sockptr(optval, optlen); if (IS_ERR(gsf)) return PTR_ERR(gsf); /* numsrc >= (4G-140)/128 overflow in 32 bits */ ret = -ENOBUFS; if (gsf->gf_numsrc >= 0x1ffffffU || gsf->gf_numsrc > sysctl_mld_max_msf) goto out_free_gsf; ret = -EINVAL; if (GROUP_FILTER_SIZE(gsf->gf_numsrc) > optlen) goto out_free_gsf; ret = ip6_mc_msfilter(sk, gsf, gsf->gf_slist_flex); out_free_gsf: kfree(gsf); return ret; } static int compat_ipv6_set_mcast_msfilter(struct sock *sk, sockptr_t optval, int optlen) { const int size0 = offsetof(struct compat_group_filter, gf_slist_flex); struct compat_group_filter *gf32; void *p; int ret; int n; if (optlen < size0) return -EINVAL; if (optlen > READ_ONCE(sock_net(sk)->core.sysctl_optmem_max) - 4) return -ENOBUFS; p = kmalloc(optlen + 4, GFP_KERNEL); if (!p) return -ENOMEM; gf32 = p + 4; /* we want ->gf_group and ->gf_slist_flex aligned */ ret = -EFAULT; if (copy_from_sockptr(gf32, optval, optlen)) goto out_free_p; /* numsrc >= (4G-140)/128 overflow in 32 bits */ ret = -ENOBUFS; n = gf32->gf_numsrc; if (n >= 0x1ffffffU || n > sysctl_mld_max_msf) goto out_free_p; ret = -EINVAL; if (offsetof(struct compat_group_filter, gf_slist_flex[n]) > optlen) goto out_free_p; ret = ip6_mc_msfilter(sk, &(struct group_filter){ .gf_interface = gf32->gf_interface, .gf_group = gf32->gf_group, .gf_fmode = gf32->gf_fmode, .gf_numsrc = gf32->gf_numsrc}, gf32->gf_slist_flex); out_free_p: kfree(p); return ret; } static int ipv6_mcast_join_leave(struct sock *sk, int optname, sockptr_t optval, int optlen) { struct sockaddr_in6 *psin6; struct group_req greq; if (optlen < sizeof(greq)) return -EINVAL; if (copy_from_sockptr(&greq, optval, sizeof(greq))) return -EFAULT; if (greq.gr_group.ss_family != AF_INET6) return -EADDRNOTAVAIL; psin6 = (struct sockaddr_in6 *)&greq.gr_group; if (optname == MCAST_JOIN_GROUP) return ipv6_sock_mc_join(sk, greq.gr_interface, &psin6->sin6_addr); return ipv6_sock_mc_drop(sk, greq.gr_interface, &psin6->sin6_addr); } static int compat_ipv6_mcast_join_leave(struct sock *sk, int optname, sockptr_t optval, int optlen) { struct compat_group_req gr32; struct sockaddr_in6 *psin6; if (optlen < sizeof(gr32)) return -EINVAL; if (copy_from_sockptr(&gr32, optval, sizeof(gr32))) return -EFAULT; if (gr32.gr_group.ss_family != AF_INET6) return -EADDRNOTAVAIL; psin6 = (struct sockaddr_in6 *)&gr32.gr_group; if (optname == MCAST_JOIN_GROUP) 
return ipv6_sock_mc_join(sk, gr32.gr_interface, &psin6->sin6_addr); return ipv6_sock_mc_drop(sk, gr32.gr_interface, &psin6->sin6_addr); } static int ipv6_set_opt_hdr(struct sock *sk, int optname, sockptr_t optval, int optlen) { struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6_opt_hdr *new = NULL; struct net *net = sock_net(sk); struct ipv6_txoptions *opt; int err; /* hop-by-hop / destination options are privileged option */ if (optname != IPV6_RTHDR && !sockopt_ns_capable(net->user_ns, CAP_NET_RAW)) return -EPERM; /* remove any sticky options header with a zero option * length, per RFC3542. */ if (optlen > 0) { if (sockptr_is_null(optval)) return -EINVAL; if (optlen < sizeof(struct ipv6_opt_hdr) || optlen & 0x7 || optlen > 8 * 255) return -EINVAL; new = memdup_sockptr(optval, optlen); if (IS_ERR(new)) return PTR_ERR(new); if (unlikely(ipv6_optlen(new) > optlen)) { kfree(new); return -EINVAL; } } opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk)); opt = ipv6_renew_options(sk, opt, optname, new); kfree(new); if (IS_ERR(opt)) return PTR_ERR(opt); /* routing header option needs extra check */ err = -EINVAL; if (optname == IPV6_RTHDR && opt && opt->srcrt) { struct ipv6_rt_hdr *rthdr = opt->srcrt; switch (rthdr->type) { #if IS_ENABLED(CONFIG_IPV6_MIP6) case IPV6_SRCRT_TYPE_2: if (rthdr->hdrlen != 2 || rthdr->segments_left != 1) goto sticky_done; break; #endif case IPV6_SRCRT_TYPE_4: { struct ipv6_sr_hdr *srh = (struct ipv6_sr_hdr *)opt->srcrt; if (!seg6_validate_srh(srh, optlen, false)) goto sticky_done; break; } default: goto sticky_done; } } err = 0; opt = ipv6_update_options(sk, opt); sticky_done: if (opt) { atomic_sub(opt->tot_len, &sk->sk_omem_alloc); txopt_put(opt); } return err; } int do_ipv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { struct ipv6_pinfo *np = inet6_sk(sk); struct net *net = sock_net(sk); int val, valbool; int retv = -ENOPROTOOPT; bool needs_rtnl = setsockopt_needs_rtnl(optname); if (sockptr_is_null(optval)) val = 0; else { if (optlen >= sizeof(int)) { if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; } else val = 0; } valbool = (val != 0); if (ip6_mroute_opt(optname)) return ip6_mroute_setsockopt(sk, optname, optval, optlen); /* Handle options that can be set without locking the socket. */ switch (optname) { case IPV6_UNICAST_HOPS: if (optlen < sizeof(int)) return -EINVAL; if (val > 255 || val < -1) return -EINVAL; WRITE_ONCE(np->hop_limit, val); return 0; case IPV6_MULTICAST_LOOP: if (optlen < sizeof(int)) return -EINVAL; if (val != valbool) return -EINVAL; inet6_assign_bit(MC6_LOOP, sk, valbool); return 0; case IPV6_MULTICAST_HOPS: if (sk->sk_type == SOCK_STREAM) return retv; if (optlen < sizeof(int)) return -EINVAL; if (val > 255 || val < -1) return -EINVAL; WRITE_ONCE(np->mcast_hops, val == -1 ? IPV6_DEFAULT_MCASTHOPS : val); return 0; case IPV6_MTU: if (optlen < sizeof(int)) return -EINVAL; if (val && val < IPV6_MIN_MTU) return -EINVAL; WRITE_ONCE(np->frag_size, val); return 0; case IPV6_MINHOPCOUNT: if (optlen < sizeof(int)) return -EINVAL; if (val < 0 || val > 255) return -EINVAL; if (val) static_branch_enable(&ip6_min_hopcount); /* tcp_v6_err() and tcp_v6_rcv() might read min_hopcount * while we are changing it. 
*/ WRITE_ONCE(np->min_hopcount, val); return 0; case IPV6_RECVERR_RFC4884: if (optlen < sizeof(int)) return -EINVAL; if (val < 0 || val > 1) return -EINVAL; inet6_assign_bit(RECVERR6_RFC4884, sk, valbool); return 0; case IPV6_MULTICAST_ALL: if (optlen < sizeof(int)) return -EINVAL; inet6_assign_bit(MC6_ALL, sk, valbool); return 0; case IPV6_AUTOFLOWLABEL: inet6_assign_bit(AUTOFLOWLABEL, sk, valbool); inet6_set_bit(AUTOFLOWLABEL_SET, sk); return 0; case IPV6_DONTFRAG: inet6_assign_bit(DONTFRAG, sk, valbool); return 0; case IPV6_RECVERR: if (optlen < sizeof(int)) return -EINVAL; inet6_assign_bit(RECVERR6, sk, valbool); if (!val) skb_errqueue_purge(&sk->sk_error_queue); return 0; case IPV6_ROUTER_ALERT_ISOLATE: if (optlen < sizeof(int)) return -EINVAL; inet6_assign_bit(RTALERT_ISOLATE, sk, valbool); return 0; case IPV6_MTU_DISCOVER: if (optlen < sizeof(int)) return -EINVAL; if (val < IPV6_PMTUDISC_DONT || val > IPV6_PMTUDISC_OMIT) return -EINVAL; WRITE_ONCE(np->pmtudisc, val); return 0; case IPV6_FLOWINFO_SEND: if (optlen < sizeof(int)) return -EINVAL; inet6_assign_bit(SNDFLOW, sk, valbool); return 0; case IPV6_ADDR_PREFERENCES: if (optlen < sizeof(int)) return -EINVAL; return ip6_sock_set_addr_preferences(sk, val); case IPV6_MULTICAST_IF: if (sk->sk_type == SOCK_STREAM) return -ENOPROTOOPT; if (optlen < sizeof(int)) return -EINVAL; if (val) { struct net_device *dev; int bound_dev_if, midx; rcu_read_lock(); dev = dev_get_by_index_rcu(net, val); if (!dev) { rcu_read_unlock(); return -ENODEV; } midx = l3mdev_master_ifindex_rcu(dev); rcu_read_unlock(); bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); if (bound_dev_if && bound_dev_if != val && (!midx || midx != bound_dev_if)) return -EINVAL; } WRITE_ONCE(np->mcast_oif, val); return 0; case IPV6_UNICAST_IF: { struct net_device *dev; int ifindex; if (optlen != sizeof(int)) return -EINVAL; ifindex = (__force int)ntohl((__force __be32)val); if (!ifindex) { WRITE_ONCE(np->ucast_oif, 0); return 0; } dev = dev_get_by_index(net, ifindex); if (!dev) return -EADDRNOTAVAIL; dev_put(dev); if (READ_ONCE(sk->sk_bound_dev_if)) return -EINVAL; WRITE_ONCE(np->ucast_oif, ifindex); return 0; } } if (needs_rtnl) rtnl_lock(); sockopt_lock_sock(sk); /* Another thread has converted the socket into IPv4 with * IPV6_ADDRFORM concurrently. 
*/ if (unlikely(sk->sk_family != AF_INET6)) goto unlock; switch (optname) { case IPV6_ADDRFORM: if (optlen < sizeof(int)) goto e_inval; if (val == PF_INET) { if (sk->sk_type == SOCK_RAW) break; if (sk->sk_protocol == IPPROTO_UDP || sk->sk_protocol == IPPROTO_UDPLITE) { struct udp_sock *up = udp_sk(sk); if (up->pending == AF_INET6) { retv = -EBUSY; break; } } else if (sk->sk_protocol == IPPROTO_TCP) { if (sk->sk_prot != &tcpv6_prot) { retv = -EBUSY; break; } } else { break; } if (sk->sk_state != TCP_ESTABLISHED) { retv = -ENOTCONN; break; } if (ipv6_only_sock(sk) || !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) { retv = -EADDRNOTAVAIL; break; } __ipv6_sock_mc_close(sk); __ipv6_sock_ac_close(sk); if (sk->sk_protocol == IPPROTO_TCP) { struct inet_connection_sock *icsk = inet_csk(sk); sock_prot_inuse_add(net, sk->sk_prot, -1); sock_prot_inuse_add(net, &tcp_prot, 1); /* Paired with READ_ONCE(sk->sk_prot) in inet6_stream_ops */ WRITE_ONCE(sk->sk_prot, &tcp_prot); /* Paired with READ_ONCE() in tcp_(get|set)sockopt() */ WRITE_ONCE(icsk->icsk_af_ops, &ipv4_specific); WRITE_ONCE(sk->sk_socket->ops, &inet_stream_ops); WRITE_ONCE(sk->sk_family, PF_INET); tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); } else { struct proto *prot = &udp_prot; if (sk->sk_protocol == IPPROTO_UDPLITE) prot = &udplite_prot; sock_prot_inuse_add(net, sk->sk_prot, -1); sock_prot_inuse_add(net, prot, 1); /* Paired with READ_ONCE(sk->sk_prot) in inet6_dgram_ops */ WRITE_ONCE(sk->sk_prot, prot); WRITE_ONCE(sk->sk_socket->ops, &inet_dgram_ops); WRITE_ONCE(sk->sk_family, PF_INET); } /* Disable all options not to allocate memory anymore, * but there is still a race. See the lockless path * in udpv6_sendmsg() and ipv6_local_rxpmtu(). */ np->rxopt.all = 0; inet6_cleanup_sock(sk); module_put(THIS_MODULE); retv = 0; break; } goto e_inval; case IPV6_V6ONLY: if (optlen < sizeof(int) || inet_sk(sk)->inet_num) goto e_inval; sk->sk_ipv6only = valbool; retv = 0; break; case IPV6_RECVPKTINFO: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.rxinfo = valbool; retv = 0; break; case IPV6_2292PKTINFO: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.rxoinfo = valbool; retv = 0; break; case IPV6_RECVHOPLIMIT: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.rxhlim = valbool; retv = 0; break; case IPV6_2292HOPLIMIT: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.rxohlim = valbool; retv = 0; break; case IPV6_RECVRTHDR: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.srcrt = valbool; retv = 0; break; case IPV6_2292RTHDR: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.osrcrt = valbool; retv = 0; break; case IPV6_RECVHOPOPTS: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.hopopts = valbool; retv = 0; break; case IPV6_2292HOPOPTS: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.ohopopts = valbool; retv = 0; break; case IPV6_RECVDSTOPTS: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.dstopts = valbool; retv = 0; break; case IPV6_2292DSTOPTS: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.odstopts = valbool; retv = 0; break; case IPV6_TCLASS: if (optlen < sizeof(int)) goto e_inval; if (val < -1 || val > 0xff) goto e_inval; /* RFC 3542, 6.5: default traffic class of 0x0 */ if (val == -1) val = 0; if (sk->sk_type == SOCK_STREAM) { val &= ~INET_ECN_MASK; val |= np->tclass & INET_ECN_MASK; } if (np->tclass != val) { np->tclass = val; sk_dst_reset(sk); } retv = 0; break; case IPV6_RECVTCLASS: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.rxtclass = valbool; retv = 0; break; case IPV6_FLOWINFO: 
if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.rxflow = valbool; retv = 0; break; case IPV6_RECVPATHMTU: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.rxpmtu = valbool; retv = 0; break; case IPV6_TRANSPARENT: if (valbool && !sockopt_ns_capable(net->user_ns, CAP_NET_RAW) && !sockopt_ns_capable(net->user_ns, CAP_NET_ADMIN)) { retv = -EPERM; break; } if (optlen < sizeof(int)) goto e_inval; /* we don't have a separate transparent bit for IPV6 we use the one in the IPv4 socket */ inet_assign_bit(TRANSPARENT, sk, valbool); retv = 0; break; case IPV6_FREEBIND: if (optlen < sizeof(int)) goto e_inval; /* we also don't have a separate freebind bit for IPV6 */ inet_assign_bit(FREEBIND, sk, valbool); retv = 0; break; case IPV6_RECVORIGDSTADDR: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.rxorigdstaddr = valbool; retv = 0; break; case IPV6_HOPOPTS: case IPV6_RTHDRDSTOPTS: case IPV6_RTHDR: case IPV6_DSTOPTS: retv = ipv6_set_opt_hdr(sk, optname, optval, optlen); break; case IPV6_PKTINFO: { struct in6_pktinfo pkt; if (optlen == 0) goto e_inval; else if (optlen < sizeof(struct in6_pktinfo) || sockptr_is_null(optval)) goto e_inval; if (copy_from_sockptr(&pkt, optval, sizeof(pkt))) { retv = -EFAULT; break; } if (!sk_dev_equal_l3scope(sk, pkt.ipi6_ifindex)) goto e_inval; np->sticky_pktinfo.ipi6_ifindex = pkt.ipi6_ifindex; np->sticky_pktinfo.ipi6_addr = pkt.ipi6_addr; retv = 0; break; } case IPV6_2292PKTOPTIONS: { struct ipv6_txoptions *opt = NULL; struct msghdr msg; struct flowi6 fl6; struct ipcm6_cookie ipc6; memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_oif = sk->sk_bound_dev_if; fl6.flowi6_mark = sk->sk_mark; if (optlen == 0) goto update; /* 1K is probably excessive * 1K is surely not enough, 2K per standard header is 16K. */ retv = -EINVAL; if (optlen > 64*1024) break; opt = sock_kmalloc(sk, sizeof(*opt) + optlen, GFP_KERNEL); retv = -ENOBUFS; if (!opt) break; memset(opt, 0, sizeof(*opt)); refcount_set(&opt->refcnt, 1); opt->tot_len = sizeof(*opt) + optlen; retv = -EFAULT; if (copy_from_sockptr(opt + 1, optval, optlen)) goto done; msg.msg_controllen = optlen; msg.msg_control_is_user = false; msg.msg_control = (void *)(opt+1); ipc6.opt = opt; retv = ip6_datagram_send_ctl(net, sk, &msg, &fl6, &ipc6); if (retv) goto done; update: retv = 0; opt = ipv6_update_options(sk, opt); done: if (opt) { atomic_sub(opt->tot_len, &sk->sk_omem_alloc); txopt_put(opt); } break; } case IPV6_ADD_MEMBERSHIP: case IPV6_DROP_MEMBERSHIP: { struct ipv6_mreq mreq; if (optlen < sizeof(struct ipv6_mreq)) goto e_inval; retv = -EPROTO; if (inet_test_bit(IS_ICSK, sk)) break; retv = -EFAULT; if (copy_from_sockptr(&mreq, optval, sizeof(struct ipv6_mreq))) break; if (optname == IPV6_ADD_MEMBERSHIP) retv = ipv6_sock_mc_join(sk, mreq.ipv6mr_ifindex, &mreq.ipv6mr_multiaddr); else retv = ipv6_sock_mc_drop(sk, mreq.ipv6mr_ifindex, &mreq.ipv6mr_multiaddr); break; } case IPV6_JOIN_ANYCAST: case IPV6_LEAVE_ANYCAST: { struct ipv6_mreq mreq; if (optlen < sizeof(struct ipv6_mreq)) goto e_inval; retv = -EFAULT; if (copy_from_sockptr(&mreq, optval, sizeof(struct ipv6_mreq))) break; if (optname == IPV6_JOIN_ANYCAST) retv = ipv6_sock_ac_join(sk, mreq.ipv6mr_ifindex, &mreq.ipv6mr_acaddr); else retv = ipv6_sock_ac_drop(sk, mreq.ipv6mr_ifindex, &mreq.ipv6mr_acaddr); break; } case MCAST_JOIN_GROUP: case MCAST_LEAVE_GROUP: if (in_compat_syscall()) retv = compat_ipv6_mcast_join_leave(sk, optname, optval, optlen); else retv = ipv6_mcast_join_leave(sk, optname, optval, optlen); break; case MCAST_JOIN_SOURCE_GROUP: case 
MCAST_LEAVE_SOURCE_GROUP: case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: retv = do_ipv6_mcast_group_source(sk, optname, optval, optlen); break; case MCAST_MSFILTER: if (in_compat_syscall()) retv = compat_ipv6_set_mcast_msfilter(sk, optval, optlen); else retv = ipv6_set_mcast_msfilter(sk, optval, optlen); break; case IPV6_ROUTER_ALERT: if (optlen < sizeof(int)) goto e_inval; retv = ip6_ra_control(sk, val); if (retv == 0) inet6_assign_bit(RTALERT, sk, valbool); break; case IPV6_FLOWLABEL_MGR: retv = ipv6_flowlabel_opt(sk, optval, optlen); break; case IPV6_IPSEC_POLICY: case IPV6_XFRM_POLICY: retv = -EPERM; if (!sockopt_ns_capable(net->user_ns, CAP_NET_ADMIN)) break; retv = xfrm_user_policy(sk, optname, optval, optlen); break; case IPV6_RECVFRAGSIZE: np->rxopt.bits.recvfragsize = valbool; retv = 0; break; } unlock: sockopt_release_sock(sk); if (needs_rtnl) rtnl_unlock(); return retv; e_inval: retv = -EINVAL; goto unlock; } int ipv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { int err; if (level == SOL_IP && sk->sk_type != SOCK_RAW) return ip_setsockopt(sk, level, optname, optval, optlen); if (level != SOL_IPV6) return -ENOPROTOOPT; err = do_ipv6_setsockopt(sk, level, optname, optval, optlen); #ifdef CONFIG_NETFILTER /* we need to exclude all possible ENOPROTOOPTs except default case */ if (err == -ENOPROTOOPT && optname != IPV6_IPSEC_POLICY && optname != IPV6_XFRM_POLICY) err = nf_setsockopt(sk, PF_INET6, optname, optval, optlen); #endif return err; } EXPORT_SYMBOL(ipv6_setsockopt); static int ipv6_getsockopt_sticky(struct sock *sk, struct ipv6_txoptions *opt, int optname, sockptr_t optval, int len) { struct ipv6_opt_hdr *hdr; if (!opt) return 0; switch (optname) { case IPV6_HOPOPTS: hdr = opt->hopopt; break; case IPV6_RTHDRDSTOPTS: hdr = opt->dst0opt; break; case IPV6_RTHDR: hdr = (struct ipv6_opt_hdr *)opt->srcrt; break; case IPV6_DSTOPTS: hdr = opt->dst1opt; break; default: return -EINVAL; /* should not happen */ } if (!hdr) return 0; len = min_t(unsigned int, len, ipv6_optlen(hdr)); if (copy_to_sockptr(optval, hdr, len)) return -EFAULT; return len; } static int ipv6_get_msfilter(struct sock *sk, sockptr_t optval, sockptr_t optlen, int len) { const int size0 = offsetof(struct group_filter, gf_slist_flex); struct group_filter gsf; int num; int err; if (len < size0) return -EINVAL; if (copy_from_sockptr(&gsf, optval, size0)) return -EFAULT; if (gsf.gf_group.ss_family != AF_INET6) return -EADDRNOTAVAIL; num = gsf.gf_numsrc; sockopt_lock_sock(sk); err = ip6_mc_msfget(sk, &gsf, optval, size0); if (!err) { if (num > gsf.gf_numsrc) num = gsf.gf_numsrc; len = GROUP_FILTER_SIZE(num); if (copy_to_sockptr(optlen, &len, sizeof(int)) || copy_to_sockptr(optval, &gsf, size0)) err = -EFAULT; } sockopt_release_sock(sk); return err; } static int compat_ipv6_get_msfilter(struct sock *sk, sockptr_t optval, sockptr_t optlen, int len) { const int size0 = offsetof(struct compat_group_filter, gf_slist_flex); struct compat_group_filter gf32; struct group_filter gf; int err; int num; if (len < size0) return -EINVAL; if (copy_from_sockptr(&gf32, optval, size0)) return -EFAULT; gf.gf_interface = gf32.gf_interface; gf.gf_fmode = gf32.gf_fmode; num = gf.gf_numsrc = gf32.gf_numsrc; gf.gf_group = gf32.gf_group; if (gf.gf_group.ss_family != AF_INET6) return -EADDRNOTAVAIL; sockopt_lock_sock(sk); err = ip6_mc_msfget(sk, &gf, optval, size0); sockopt_release_sock(sk); if (err) return err; if (num > gf.gf_numsrc) num = gf.gf_numsrc; len = GROUP_FILTER_SIZE(num) - 
(sizeof(gf)-sizeof(gf32)); if (copy_to_sockptr(optlen, &len, sizeof(int)) || copy_to_sockptr_offset(optval, offsetof(struct compat_group_filter, gf_fmode), &gf.gf_fmode, sizeof(gf32.gf_fmode)) || copy_to_sockptr_offset(optval, offsetof(struct compat_group_filter, gf_numsrc), &gf.gf_numsrc, sizeof(gf32.gf_numsrc))) return -EFAULT; return 0; } int do_ipv6_getsockopt(struct sock *sk, int level, int optname, sockptr_t optval, sockptr_t optlen) { struct ipv6_pinfo *np = inet6_sk(sk); int len; int val; if (ip6_mroute_opt(optname)) return ip6_mroute_getsockopt(sk, optname, optval, optlen); if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; switch (optname) { case IPV6_ADDRFORM: if (sk->sk_protocol != IPPROTO_UDP && sk->sk_protocol != IPPROTO_UDPLITE && sk->sk_protocol != IPPROTO_TCP) return -ENOPROTOOPT; if (sk->sk_state != TCP_ESTABLISHED) return -ENOTCONN; val = sk->sk_family; break; case MCAST_MSFILTER: if (in_compat_syscall()) return compat_ipv6_get_msfilter(sk, optval, optlen, len); return ipv6_get_msfilter(sk, optval, optlen, len); case IPV6_2292PKTOPTIONS: { struct msghdr msg; struct sk_buff *skb; if (sk->sk_type != SOCK_STREAM) return -ENOPROTOOPT; if (optval.is_kernel) { msg.msg_control_is_user = false; msg.msg_control = optval.kernel; } else { msg.msg_control_is_user = true; msg.msg_control_user = optval.user; } msg.msg_controllen = len; msg.msg_flags = 0; sockopt_lock_sock(sk); skb = np->pktoptions; if (skb) ip6_datagram_recv_ctl(sk, &msg, skb); sockopt_release_sock(sk); if (!skb) { if (np->rxopt.bits.rxinfo) { int mcast_oif = READ_ONCE(np->mcast_oif); struct in6_pktinfo src_info; src_info.ipi6_ifindex = mcast_oif ? : np->sticky_pktinfo.ipi6_ifindex; src_info.ipi6_addr = mcast_oif ? sk->sk_v6_daddr : np->sticky_pktinfo.ipi6_addr; put_cmsg(&msg, SOL_IPV6, IPV6_PKTINFO, sizeof(src_info), &src_info); } if (np->rxopt.bits.rxhlim) { int hlim = READ_ONCE(np->mcast_hops); put_cmsg(&msg, SOL_IPV6, IPV6_HOPLIMIT, sizeof(hlim), &hlim); } if (np->rxopt.bits.rxtclass) { int tclass = (int)ip6_tclass(np->rcv_flowinfo); put_cmsg(&msg, SOL_IPV6, IPV6_TCLASS, sizeof(tclass), &tclass); } if (np->rxopt.bits.rxoinfo) { int mcast_oif = READ_ONCE(np->mcast_oif); struct in6_pktinfo src_info; src_info.ipi6_ifindex = mcast_oif ? : np->sticky_pktinfo.ipi6_ifindex; src_info.ipi6_addr = mcast_oif ? 
sk->sk_v6_daddr : np->sticky_pktinfo.ipi6_addr; put_cmsg(&msg, SOL_IPV6, IPV6_2292PKTINFO, sizeof(src_info), &src_info); } if (np->rxopt.bits.rxohlim) { int hlim = READ_ONCE(np->mcast_hops); put_cmsg(&msg, SOL_IPV6, IPV6_2292HOPLIMIT, sizeof(hlim), &hlim); } if (np->rxopt.bits.rxflow) { __be32 flowinfo = np->rcv_flowinfo; put_cmsg(&msg, SOL_IPV6, IPV6_FLOWINFO, sizeof(flowinfo), &flowinfo); } } len -= msg.msg_controllen; return copy_to_sockptr(optlen, &len, sizeof(int)); } case IPV6_MTU: { struct dst_entry *dst; val = 0; rcu_read_lock(); dst = __sk_dst_get(sk); if (dst) val = dst_mtu(dst); rcu_read_unlock(); if (!val) return -ENOTCONN; break; } case IPV6_V6ONLY: val = sk->sk_ipv6only; break; case IPV6_RECVPKTINFO: val = np->rxopt.bits.rxinfo; break; case IPV6_2292PKTINFO: val = np->rxopt.bits.rxoinfo; break; case IPV6_RECVHOPLIMIT: val = np->rxopt.bits.rxhlim; break; case IPV6_2292HOPLIMIT: val = np->rxopt.bits.rxohlim; break; case IPV6_RECVRTHDR: val = np->rxopt.bits.srcrt; break; case IPV6_2292RTHDR: val = np->rxopt.bits.osrcrt; break; case IPV6_HOPOPTS: case IPV6_RTHDRDSTOPTS: case IPV6_RTHDR: case IPV6_DSTOPTS: { struct ipv6_txoptions *opt; sockopt_lock_sock(sk); opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk)); len = ipv6_getsockopt_sticky(sk, opt, optname, optval, len); sockopt_release_sock(sk); /* check if ipv6_getsockopt_sticky() returns err code */ if (len < 0) return len; return copy_to_sockptr(optlen, &len, sizeof(int)); } case IPV6_RECVHOPOPTS: val = np->rxopt.bits.hopopts; break; case IPV6_2292HOPOPTS: val = np->rxopt.bits.ohopopts; break; case IPV6_RECVDSTOPTS: val = np->rxopt.bits.dstopts; break; case IPV6_2292DSTOPTS: val = np->rxopt.bits.odstopts; break; case IPV6_TCLASS: val = np->tclass; break; case IPV6_RECVTCLASS: val = np->rxopt.bits.rxtclass; break; case IPV6_FLOWINFO: val = np->rxopt.bits.rxflow; break; case IPV6_RECVPATHMTU: val = np->rxopt.bits.rxpmtu; break; case IPV6_PATHMTU: { struct dst_entry *dst; struct ip6_mtuinfo mtuinfo; if (len < sizeof(mtuinfo)) return -EINVAL; len = sizeof(mtuinfo); memset(&mtuinfo, 0, sizeof(mtuinfo)); rcu_read_lock(); dst = __sk_dst_get(sk); if (dst) mtuinfo.ip6m_mtu = dst_mtu(dst); rcu_read_unlock(); if (!mtuinfo.ip6m_mtu) return -ENOTCONN; if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; if (copy_to_sockptr(optval, &mtuinfo, len)) return -EFAULT; return 0; } case IPV6_TRANSPARENT: val = inet_test_bit(TRANSPARENT, sk); break; case IPV6_FREEBIND: val = inet_test_bit(FREEBIND, sk); break; case IPV6_RECVORIGDSTADDR: val = np->rxopt.bits.rxorigdstaddr; break; case IPV6_UNICAST_HOPS: case IPV6_MULTICAST_HOPS: { struct dst_entry *dst; if (optname == IPV6_UNICAST_HOPS) val = READ_ONCE(np->hop_limit); else val = READ_ONCE(np->mcast_hops); if (val < 0) { rcu_read_lock(); dst = __sk_dst_get(sk); if (dst) val = ip6_dst_hoplimit(dst); rcu_read_unlock(); } if (val < 0) val = READ_ONCE(sock_net(sk)->ipv6.devconf_all->hop_limit); break; } case IPV6_MULTICAST_LOOP: val = inet6_test_bit(MC6_LOOP, sk); break; case IPV6_MULTICAST_IF: val = READ_ONCE(np->mcast_oif); break; case IPV6_MULTICAST_ALL: val = inet6_test_bit(MC6_ALL, sk); break; case IPV6_UNICAST_IF: val = (__force int)htonl((__u32) READ_ONCE(np->ucast_oif)); break; case IPV6_MTU_DISCOVER: val = READ_ONCE(np->pmtudisc); break; case IPV6_RECVERR: val = inet6_test_bit(RECVERR6, sk); break; case IPV6_FLOWINFO_SEND: val = inet6_test_bit(SNDFLOW, sk); break; case IPV6_FLOWLABEL_MGR: { struct in6_flowlabel_req freq; int flags; if (len < sizeof(freq)) return -EINVAL; 
if (copy_from_sockptr(&freq, optval, sizeof(freq))) return -EFAULT; if (freq.flr_action != IPV6_FL_A_GET) return -EINVAL; len = sizeof(freq); flags = freq.flr_flags; memset(&freq, 0, sizeof(freq)); val = ipv6_flowlabel_opt_get(sk, &freq, flags); if (val < 0) return val; if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; if (copy_to_sockptr(optval, &freq, len)) return -EFAULT; return 0; } case IPV6_ADDR_PREFERENCES: { u8 srcprefs = READ_ONCE(np->srcprefs); val = 0; if (srcprefs & IPV6_PREFER_SRC_TMP) val |= IPV6_PREFER_SRC_TMP; else if (srcprefs & IPV6_PREFER_SRC_PUBLIC) val |= IPV6_PREFER_SRC_PUBLIC; else { /* XXX: should we return system default? */ val |= IPV6_PREFER_SRC_PUBTMP_DEFAULT; } if (srcprefs & IPV6_PREFER_SRC_COA) val |= IPV6_PREFER_SRC_COA; else val |= IPV6_PREFER_SRC_HOME; break; } case IPV6_MINHOPCOUNT: val = READ_ONCE(np->min_hopcount); break; case IPV6_DONTFRAG: val = inet6_test_bit(DONTFRAG, sk); break; case IPV6_AUTOFLOWLABEL: val = ip6_autoflowlabel(sock_net(sk), sk); break; case IPV6_RECVFRAGSIZE: val = np->rxopt.bits.recvfragsize; break; case IPV6_ROUTER_ALERT: val = inet6_test_bit(RTALERT, sk); break; case IPV6_ROUTER_ALERT_ISOLATE: val = inet6_test_bit(RTALERT_ISOLATE, sk); break; case IPV6_RECVERR_RFC4884: val = inet6_test_bit(RECVERR6_RFC4884, sk); break; default: return -ENOPROTOOPT; } len = min_t(unsigned int, sizeof(int), len); if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; if (copy_to_sockptr(optval, &val, len)) return -EFAULT; return 0; } int ipv6_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { int err; if (level == SOL_IP && sk->sk_type != SOCK_RAW) return ip_getsockopt(sk, level, optname, optval, optlen); if (level != SOL_IPV6) return -ENOPROTOOPT; err = do_ipv6_getsockopt(sk, level, optname, USER_SOCKPTR(optval), USER_SOCKPTR(optlen)); #ifdef CONFIG_NETFILTER /* we need to exclude all possible ENOPROTOOPTs except default case */ if (err == -ENOPROTOOPT && optname != IPV6_2292PKTOPTIONS) { int len; if (get_user(len, optlen)) return -EFAULT; err = nf_getsockopt(sk, PF_INET6, optname, optval, &len); if (err >= 0) err = put_user(len, optlen); } #endif return err; } EXPORT_SYMBOL(ipv6_getsockopt);
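Editor's note: the setsockopt and getsockopt paths above are reached from user space through the ordinary socket calls. The following is a minimal, hypothetical user-space sketch (not part of this file) that exercises two of the options handled here, IPV6_RECVPKTINFO and IPV6_UNICAST_HOPS, and reads them back through do_ipv6_getsockopt().

/* Hypothetical user-space sketch; option names as exposed by glibc. */
#include <stdio.h>
#include <unistd.h>
#include <netinet/in.h>
#include <sys/socket.h>

int main(void)
{
        int fd = socket(AF_INET6, SOCK_DGRAM, 0);
        int on = 1, hops = 32, val = 0;
        socklen_t len = sizeof(val);

        if (fd < 0)
                return 1;

        /* Handled by do_ipv6_setsockopt(): np->rxopt.bits.rxinfo = valbool */
        if (setsockopt(fd, IPPROTO_IPV6, IPV6_RECVPKTINFO, &on, sizeof(on)))
                perror("IPV6_RECVPKTINFO");

        /* IPV6_UNICAST_HOPS is handled earlier in do_ipv6_setsockopt() (not shown above) */
        if (setsockopt(fd, IPPROTO_IPV6, IPV6_UNICAST_HOPS, &hops, sizeof(hops)))
                perror("IPV6_UNICAST_HOPS");

        /* Read back through the getsockopt switch above */
        if (!getsockopt(fd, IPPROTO_IPV6, IPV6_RECVPKTINFO, &val, &len))
                printf("IPV6_RECVPKTINFO = %d\n", val);
        len = sizeof(val);
        if (!getsockopt(fd, IPPROTO_IPV6, IPV6_UNICAST_HOPS, &val, &len))
                printf("IPV6_UNICAST_HOPS = %d\n", val);

        close(fd);
        return 0;
}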
#ifndef _TCP_DCTCP_H
#define _TCP_DCTCP_H

static inline void dctcp_ece_ack_cwr(struct sock *sk, u32 ce_state)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (ce_state == 1)
		tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
	else
		tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
}

/* Minimal DCTCP CE state machine:
 *
 * S:	0 <- last pkt was non-CE
 *	1 <- last pkt was CE
 */
static inline void dctcp_ece_ack_update(struct sock *sk, enum tcp_ca_event evt,
					u32 *prior_rcv_nxt, u32 *ce_state)
{
	u32 new_ce_state = (evt == CA_EVENT_ECN_IS_CE) ? 1 : 0;

	if (*ce_state != new_ce_state) {
		/* CE state has changed, force an immediate ACK to
		 * reflect the new CE state. If an ACK was delayed,
		 * send that first to reflect the prior CE state.
		 */
		if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) {
			dctcp_ece_ack_cwr(sk, *ce_state);
			__tcp_send_ack(sk, *prior_rcv_nxt);
		}
		inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
	}
	*prior_rcv_nxt = tcp_sk(sk)->rcv_nxt;
	*ce_state = new_ce_state;
	dctcp_ece_ack_cwr(sk, new_ce_state);
}

#endif
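Editor's note: for readers outside the kernel, the two-state CE machine above can be exercised in isolation. Below is a hypothetical user-space re-expression of the same logic, with kernel types and the delayed-ACK interaction replaced by plain flags, showing when an immediate ACK is forced.

/* Stand-alone sketch of the CE state machine above (not kernel code). */
#include <stdio.h>
#include <stdbool.h>

struct ce_ctx {
        unsigned int ce_state;      /* 0: last pkt non-CE, 1: last pkt CE */
        unsigned int prior_rcv_nxt; /* rcv_nxt snapshot from the previous update */
        bool demand_cwr;            /* stands in for TCP_ECN_DEMAND_CWR */
};

static void ce_ack_update(struct ce_ctx *c, bool pkt_is_ce, unsigned int rcv_nxt)
{
        unsigned int new_state = pkt_is_ce ? 1 : 0;

        if (c->ce_state != new_state)
                /* A real stack would also flush a pending delayed ACK here,
                 * i.e. the __tcp_send_ack(sk, *prior_rcv_nxt) call above. */
                printf("CE %u -> %u: force immediate ACK\n", c->ce_state, new_state);

        c->prior_rcv_nxt = rcv_nxt;
        c->ce_state = new_state;
        c->demand_cwr = new_state; /* dctcp_ece_ack_cwr() equivalent */
}

int main(void)
{
        struct ce_ctx c = { 0 };
        bool trace[] = { false, false, true, true, false };

        for (unsigned int i = 0; i < sizeof(trace) / sizeof(trace[0]); i++)
                ce_ack_update(&c, trace[i], 1000 + i);
        return 0;
}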
8 1 7 1 1 3 2 2 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> * Copyright (c) 2013 Eric Leblond <eric@regit.org> * * Development of this code funded by Astaro AG (http://www.astaro.com/) */ #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nft_reject.h> #include <linux/icmp.h> #include <linux/icmpv6.h> const struct nla_policy nft_reject_policy[NFTA_REJECT_MAX + 1] = { [NFTA_REJECT_TYPE] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_REJECT_ICMP_CODE] = { .type = NLA_U8 }, }; EXPORT_SYMBOL_GPL(nft_reject_policy); int nft_reject_validate(const struct nft_ctx *ctx, const struct nft_expr *expr) { return nft_chain_validate_hooks(ctx->chain, (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD) | (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_PRE_ROUTING)); } EXPORT_SYMBOL_GPL(nft_reject_validate); int nft_reject_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_reject *priv = nft_expr_priv(expr); int icmp_code; if (tb[NFTA_REJECT_TYPE] == NULL) return -EINVAL; priv->type = ntohl(nla_get_be32(tb[NFTA_REJECT_TYPE])); switch (priv->type) { case NFT_REJECT_ICMP_UNREACH: case NFT_REJECT_ICMPX_UNREACH: if (tb[NFTA_REJECT_ICMP_CODE] == NULL) return -EINVAL; icmp_code = nla_get_u8(tb[NFTA_REJECT_ICMP_CODE]); if (priv->type == NFT_REJECT_ICMPX_UNREACH && icmp_code > NFT_REJECT_ICMPX_MAX) return -EINVAL; priv->icmp_code = icmp_code; break; case NFT_REJECT_TCP_RST: break; default: return -EINVAL; } return 0; } EXPORT_SYMBOL_GPL(nft_reject_init); int nft_reject_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct nft_reject *priv = nft_expr_priv(expr); if (nla_put_be32(skb, NFTA_REJECT_TYPE, htonl(priv->type))) goto nla_put_failure; switch (priv->type) { case NFT_REJECT_ICMP_UNREACH: case NFT_REJECT_ICMPX_UNREACH: if (nla_put_u8(skb, NFTA_REJECT_ICMP_CODE, priv->icmp_code)) goto nla_put_failure; break; default: break; } return 0; nla_put_failure: return -1; } EXPORT_SYMBOL_GPL(nft_reject_dump); static u8 icmp_code_v4[NFT_REJECT_ICMPX_MAX + 1] = { [NFT_REJECT_ICMPX_NO_ROUTE] = ICMP_NET_UNREACH, [NFT_REJECT_ICMPX_PORT_UNREACH] = ICMP_PORT_UNREACH, [NFT_REJECT_ICMPX_HOST_UNREACH] = ICMP_HOST_UNREACH, [NFT_REJECT_ICMPX_ADMIN_PROHIBITED] = ICMP_PKT_FILTERED, }; int nft_reject_icmp_code(u8 code) { if (WARN_ON_ONCE(code > NFT_REJECT_ICMPX_MAX)) return ICMP_NET_UNREACH; return icmp_code_v4[code]; } EXPORT_SYMBOL_GPL(nft_reject_icmp_code); static u8 icmp_code_v6[NFT_REJECT_ICMPX_MAX + 1] = { [NFT_REJECT_ICMPX_NO_ROUTE] = ICMPV6_NOROUTE, [NFT_REJECT_ICMPX_PORT_UNREACH] = ICMPV6_PORT_UNREACH, [NFT_REJECT_ICMPX_HOST_UNREACH] = ICMPV6_ADDR_UNREACH, [NFT_REJECT_ICMPX_ADMIN_PROHIBITED] = ICMPV6_ADM_PROHIBITED, }; int nft_reject_icmpv6_code(u8 code) { if (WARN_ON_ONCE(code > NFT_REJECT_ICMPX_MAX)) return ICMPV6_NOROUTE; return icmp_code_v6[code]; } EXPORT_SYMBOL_GPL(nft_reject_icmpv6_code); 
MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); MODULE_DESCRIPTION("Netfilter x_tables over nftables module");
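Editor's note: the two lookup tables above collapse the family-independent NFT_REJECT_ICMPX_* codes into the concrete ICMP or ICMPv6 code used for the rejection. A hypothetical user-space sketch of the same clamp-and-translate pattern, with numeric stand-ins mirroring the usual ICMP/ICMPv6 unreachable codes:

/* Stand-alone sketch of the ICMPX -> per-family code translation above. */
#include <stdio.h>

enum {
        ICMPX_NO_ROUTE,
        ICMPX_PORT_UNREACH,
        ICMPX_HOST_UNREACH,
        ICMPX_ADMIN_PROHIBITED,
        ICMPX_MAX = ICMPX_ADMIN_PROHIBITED,
};

/* Stand-ins for ICMP_NET_UNREACH/ICMP_PORT_UNREACH/... and the ICMPv6 set. */
static const unsigned char code_v4[ICMPX_MAX + 1] = { 0, 3, 1, 13 };
static const unsigned char code_v6[ICMPX_MAX + 1] = { 0, 4, 3, 1 };

static unsigned char icmpx_to_code(unsigned int abstract, int ipv6)
{
        if (abstract > ICMPX_MAX)      /* mirrors the WARN_ON_ONCE() fallback */
                abstract = ICMPX_NO_ROUTE;
        return ipv6 ? code_v6[abstract] : code_v4[abstract];
}

int main(void)
{
        printf("port-unreach: v4=%u v6=%u\n",
               icmpx_to_code(ICMPX_PORT_UNREACH, 0),
               icmpx_to_code(ICMPX_PORT_UNREACH, 1));
        return 0;
}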
1 5 1 24 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 /* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ #ifndef _NET_BATMAN_ADV_ORIGINATOR_H_ #define _NET_BATMAN_ADV_ORIGINATOR_H_ #include "main.h" #include <linux/compiler.h> #include <linux/if_ether.h> #include <linux/jhash.h> #include <linux/kref.h> #include <linux/netlink.h> #include <linux/skbuff.h> #include <linux/types.h> bool batadv_compare_orig(const struct hlist_node *node, const void *data2); int batadv_originator_init(struct batadv_priv *bat_priv); void batadv_originator_free(struct batadv_priv *bat_priv); void batadv_purge_orig_ref(struct batadv_priv *bat_priv); void batadv_orig_node_release(struct kref *ref); struct batadv_orig_node *batadv_orig_node_new(struct batadv_priv *bat_priv, const u8 *addr); struct batadv_hardif_neigh_node * batadv_hardif_neigh_get(const struct batadv_hard_iface *hard_iface, const u8 *neigh_addr); void batadv_hardif_neigh_release(struct kref *ref); struct batadv_neigh_node * batadv_neigh_node_get_or_create(struct batadv_orig_node *orig_node, struct batadv_hard_iface *hard_iface, const u8 *neigh_addr); void batadv_neigh_node_release(struct kref *ref); struct batadv_neigh_node * batadv_orig_router_get(struct batadv_orig_node *orig_node, const struct batadv_hard_iface *if_outgoing); struct batadv_neigh_node * batadv_orig_to_router(struct batadv_priv *bat_priv, u8 *orig_addr, struct batadv_hard_iface *if_outgoing); struct batadv_neigh_ifinfo * batadv_neigh_ifinfo_new(struct batadv_neigh_node *neigh, struct batadv_hard_iface *if_outgoing); struct batadv_neigh_ifinfo * batadv_neigh_ifinfo_get(struct batadv_neigh_node *neigh, struct batadv_hard_iface *if_outgoing); void batadv_neigh_ifinfo_release(struct kref *ref); int batadv_hardif_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb); struct batadv_orig_ifinfo * batadv_orig_ifinfo_get(struct batadv_orig_node *orig_node, struct batadv_hard_iface *if_outgoing); struct batadv_orig_ifinfo * batadv_orig_ifinfo_new(struct batadv_orig_node *orig_node, struct batadv_hard_iface *if_outgoing); void batadv_orig_ifinfo_release(struct kref *ref); int batadv_orig_dump(struct sk_buff *msg, struct netlink_callback *cb); struct batadv_orig_node_vlan * batadv_orig_node_vlan_new(struct batadv_orig_node *orig_node, unsigned short vid); struct batadv_orig_node_vlan * batadv_orig_node_vlan_get(struct batadv_orig_node *orig_node, unsigned short vid); void batadv_orig_node_vlan_release(struct kref *ref); /** * batadv_choose_orig() - Return the index of the orig entry in the hash table * @data: mac address of the originator node * @size: the size of the hash table * * Return: the hash index where the object represented by @data should be * stored at. 
*/ static inline u32 batadv_choose_orig(const void *data, u32 size) { u32 hash = 0; hash = jhash(data, ETH_ALEN, hash); return hash % size; } struct batadv_orig_node * batadv_orig_hash_find(struct batadv_priv *bat_priv, const void *data); /** * batadv_orig_node_vlan_put() - decrement the refcounter and possibly release * the originator-vlan object * @orig_vlan: the originator-vlan object to release */ static inline void batadv_orig_node_vlan_put(struct batadv_orig_node_vlan *orig_vlan) { if (!orig_vlan) return; kref_put(&orig_vlan->refcount, batadv_orig_node_vlan_release); } /** * batadv_neigh_ifinfo_put() - decrement the refcounter and possibly release * the neigh_ifinfo * @neigh_ifinfo: the neigh_ifinfo object to release */ static inline void batadv_neigh_ifinfo_put(struct batadv_neigh_ifinfo *neigh_ifinfo) { if (!neigh_ifinfo) return; kref_put(&neigh_ifinfo->refcount, batadv_neigh_ifinfo_release); } /** * batadv_hardif_neigh_put() - decrement the hardif neighbors refcounter * and possibly release it * @hardif_neigh: hardif neigh neighbor to free */ static inline void batadv_hardif_neigh_put(struct batadv_hardif_neigh_node *hardif_neigh) { if (!hardif_neigh) return; kref_put(&hardif_neigh->refcount, batadv_hardif_neigh_release); } /** * batadv_neigh_node_put() - decrement the neighbors refcounter and possibly * release it * @neigh_node: neigh neighbor to free */ static inline void batadv_neigh_node_put(struct batadv_neigh_node *neigh_node) { if (!neigh_node) return; kref_put(&neigh_node->refcount, batadv_neigh_node_release); } /** * batadv_orig_ifinfo_put() - decrement the refcounter and possibly release * the orig_ifinfo * @orig_ifinfo: the orig_ifinfo object to release */ static inline void batadv_orig_ifinfo_put(struct batadv_orig_ifinfo *orig_ifinfo) { if (!orig_ifinfo) return; kref_put(&orig_ifinfo->refcount, batadv_orig_ifinfo_release); } /** * batadv_orig_node_put() - decrement the orig node refcounter and possibly * release it * @orig_node: the orig node to free */ static inline void batadv_orig_node_put(struct batadv_orig_node *orig_node) { if (!orig_node) return; kref_put(&orig_node->refcount, batadv_orig_node_release); } #endif /* _NET_BATMAN_ADV_ORIGINATOR_H_ */
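Editor's note: every *_put() helper above follows the same shape: tolerate a NULL pointer, then hand the final reference to a release callback via kref_put(). A hypothetical user-space equivalent, with a plain C11 atomic counter standing in for struct kref:

/* Sketch of the NULL-tolerant put/release pattern used above (not kernel code). */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct orig_node {
        atomic_int refcount;
        /* ... payload ... */
};

static void orig_node_release(struct orig_node *node)
{
        printf("last reference dropped, freeing node\n");
        free(node);
}

static void orig_node_put(struct orig_node *node)
{
        if (!node)              /* like batadv_orig_node_put(), NULL is a no-op */
                return;
        if (atomic_fetch_sub(&node->refcount, 1) == 1)
                orig_node_release(node);
}

int main(void)
{
        struct orig_node *node = calloc(1, sizeof(*node));

        atomic_init(&node->refcount, 2); /* e.g. hash table + caller */
        orig_node_put(node);             /* still referenced elsewhere */
        orig_node_put(node);             /* refcount hits zero, released */
        orig_node_put(NULL);             /* harmless */
        return 0;
}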
12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM dccp #if !defined(_TRACE_DCCP_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_DCCP_H #include <net/sock.h> #include "dccp.h" #include "ccids/ccid3.h" #include <linux/tracepoint.h> #include <trace/events/net_probe_common.h> TRACE_EVENT(dccp_probe, TP_PROTO(struct sock *sk, size_t size), TP_ARGS(sk, size), TP_STRUCT__entry( /* sockaddr_in6 is always bigger than sockaddr_in */ __array(__u8, saddr, sizeof(struct sockaddr_in6)) __array(__u8, daddr, sizeof(struct sockaddr_in6)) __field(__u16, sport) __field(__u16, dport) __field(__u16, size) __field(__u16, tx_s) __field(__u32, tx_rtt) __field(__u32, tx_p) __field(__u32, tx_x_calc) __field(__u64, tx_x_recv) __field(__u64, tx_x) __field(__u32, tx_t_ipi) ), TP_fast_assign( const struct inet_sock *inet = inet_sk(sk); struct ccid3_hc_tx_sock *hc = NULL; if (ccid_get_current_tx_ccid(dccp_sk(sk)) == DCCPC_CCID3) hc = ccid3_hc_tx_sk(sk); memset(__entry->saddr, 0, sizeof(struct sockaddr_in6)); memset(__entry->daddr, 0, sizeof(struct sockaddr_in6)); TP_STORE_ADDR_PORTS(__entry, inet, sk); /* For filtering use */ __entry->sport = ntohs(inet->inet_sport); __entry->dport = ntohs(inet->inet_dport); __entry->size = size; if (hc) { __entry->tx_s = hc->tx_s; __entry->tx_rtt = hc->tx_rtt; __entry->tx_p = hc->tx_p; __entry->tx_x_calc = hc->tx_x_calc; __entry->tx_x_recv = hc->tx_x_recv >> 6; __entry->tx_x = hc->tx_x >> 6; __entry->tx_t_ipi = hc->tx_t_ipi; } else { __entry->tx_s = 0; memset_startat(__entry, 0, tx_rtt); } ), TP_printk("src=%pISpc dest=%pISpc size=%d tx_s=%d tx_rtt=%d " "tx_p=%d tx_x_calc=%u tx_x_recv=%llu tx_x=%llu tx_t_ipi=%d", __entry->saddr, __entry->daddr, __entry->size, __entry->tx_s, __entry->tx_rtt, __entry->tx_p, __entry->tx_x_calc, __entry->tx_x_recv, __entry->tx_x, __entry->tx_t_ipi) ); #endif /* _TRACE_TCP_H */ /* This part must be outside protection */ #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH . #undef TRACE_INCLUDE_FILE #define TRACE_INCLUDE_FILE trace #include <trace/define_trace.h>
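Editor's note: the dccp_probe event above is consumed like any other tracepoint: enable it under tracefs and read the trace pipe. A minimal, hypothetical user-space sketch, assuming tracefs is mounted at /sys/kernel/tracing and DCCP support is present:

/* Hypothetical sketch: enable the dccp_probe tracepoint and tail the pipe. */
#include <stdio.h>

int main(void)
{
        const char *enable = "/sys/kernel/tracing/events/dccp/dccp_probe/enable";
        char line[512];
        FILE *f = fopen(enable, "w");

        if (!f) {
                perror(enable);  /* tracefs not mounted, or DCCP not available */
                return 1;
        }
        fputs("1", f);
        fclose(f);

        f = fopen("/sys/kernel/tracing/trace_pipe", "r");
        if (!f)
                return 1;
        while (fgets(line, sizeof(line), f))  /* one line per dccp_probe hit */
                fputs(line, stdout);
        return 0;
}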
18 18 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 // SPDX-License-Identifier: GPL-2.0 /* * tracing clocks * * Copyright (C) 2009 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> * * Implements 3 trace clock variants, with differing scalability/precision * tradeoffs: * * - local: CPU-local trace clock * - medium: scalable global clock with some jitter * - global: globally monotonic, serialized clock * * Tracer plugins will chose a default from these clocks. */ #include <linux/spinlock.h> #include <linux/irqflags.h> #include <linux/hardirq.h> #include <linux/module.h> #include <linux/percpu.h> #include <linux/sched.h> #include <linux/sched/clock.h> #include <linux/ktime.h> #include <linux/trace_clock.h> /* * trace_clock_local(): the simplest and least coherent tracing clock. * * Useful for tracing that does not cross to other CPUs nor * does it go through idle events. */ u64 notrace trace_clock_local(void) { u64 clock; /* * sched_clock() is an architecture implemented, fast, scalable, * lockless clock. It is not guaranteed to be coherent across * CPUs, nor across CPU idle events. */ preempt_disable_notrace(); clock = sched_clock(); preempt_enable_notrace(); return clock; } EXPORT_SYMBOL_GPL(trace_clock_local); /* * trace_clock(): 'between' trace clock. Not completely serialized, * but not completely incorrect when crossing CPUs either. * * This is based on cpu_clock(), which will allow at most ~1 jiffy of * jitter between CPUs. So it's a pretty scalable clock, but there * can be offsets in the trace data. */ u64 notrace trace_clock(void) { return local_clock(); } EXPORT_SYMBOL_GPL(trace_clock); /* * trace_jiffy_clock(): Simply use jiffies as a clock counter. * Note that this use of jiffies_64 is not completely safe on * 32-bit systems. But the window is tiny, and the effect if * we are affected is that we will have an obviously bogus * timestamp on a trace event - i.e. not life threatening. */ u64 notrace trace_clock_jiffies(void) { return jiffies_64_to_clock_t(jiffies_64 - INITIAL_JIFFIES); } EXPORT_SYMBOL_GPL(trace_clock_jiffies); /* * trace_clock_global(): special globally coherent trace clock * * It has higher overhead than the other trace clocks but is still * an order of magnitude faster than GTOD derived hardware clocks. * * Used by plugins that need globally coherent timestamps. */ /* keep prev_time and lock in the same cacheline. */ static struct { u64 prev_time; arch_spinlock_t lock; } trace_clock_struct ____cacheline_aligned_in_smp = { .lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED, }; u64 notrace trace_clock_global(void) { unsigned long flags; int this_cpu; u64 now, prev_time; raw_local_irq_save(flags); this_cpu = raw_smp_processor_id(); /* * The global clock "guarantees" that the events are ordered * between CPUs. But if two events on two different CPUS call * trace_clock_global at roughly the same time, it really does * not matter which one gets the earlier time. Just make sure * that the same CPU will always show a monotonic clock. 
* * Use a read memory barrier to get the latest written * time that was recorded. */ smp_rmb(); prev_time = READ_ONCE(trace_clock_struct.prev_time); now = sched_clock_cpu(this_cpu); /* Make sure that now is always greater than or equal to prev_time */ if ((s64)(now - prev_time) < 0) now = prev_time; /* * If in an NMI context then dont risk lockups and simply return * the current time. */ if (unlikely(in_nmi())) goto out; /* Tracing can cause strange recursion, always use a try lock */ if (arch_spin_trylock(&trace_clock_struct.lock)) { /* Reread prev_time in case it was already updated */ prev_time = READ_ONCE(trace_clock_struct.prev_time); if ((s64)(now - prev_time) < 0) now = prev_time; trace_clock_struct.prev_time = now; /* The unlock acts as the wmb for the above rmb */ arch_spin_unlock(&trace_clock_struct.lock); } out: raw_local_irq_restore(flags); return now; } EXPORT_SYMBOL_GPL(trace_clock_global); static atomic64_t trace_counter; /* * trace_clock_counter(): simply an atomic counter. * Use the trace_counter "counter" for cases where you do not care * about timings, but are interested in strict ordering. */ u64 notrace trace_clock_counter(void) { return atomic64_inc_return(&trace_counter); }
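Editor's note: the key trick in trace_clock_global() is the wrap-safe comparison (s64)(now - prev_time) < 0, which keeps the clock monotonic even when the u64 timestamp wraps or one CPU reads a slightly older sched_clock(). A stand-alone illustration of that clamp:

/* User-space sketch of the wrap-safe monotonic clamp used above. */
#include <stdint.h>
#include <stdio.h>

static uint64_t clamp_monotonic(uint64_t now, uint64_t prev)
{
        /* Interpreting the unsigned difference as signed handles wraparound:
         * a "now" slightly behind "prev" yields a small negative delta rather
         * than a huge positive one. */
        if ((int64_t)(now - prev) < 0)
                now = prev;
        return now;
}

int main(void)
{
        uint64_t prev = 1000;

        printf("%llu\n", (unsigned long long)clamp_monotonic(990, prev));  /* 1000 */
        printf("%llu\n", (unsigned long long)clamp_monotonic(1010, prev)); /* 1010 */

        /* Near the wrap point: prev just below UINT64_MAX, now just past 0. */
        prev = UINT64_MAX - 5;
        printf("%llu\n", (unsigned long long)clamp_monotonic(4, prev));    /* 4 */
        return 0;
}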
660 12 644 377 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef MM_SLAB_H #define MM_SLAB_H #include <linux/reciprocal_div.h> #include <linux/list_lru.h> #include <linux/local_lock.h> #include <linux/random.h> #include <linux/kobject.h> #include <linux/sched/mm.h> #include <linux/memcontrol.h> #include <linux/kfence.h> #include <linux/kasan.h> /* * Internal slab definitions */ #ifdef CONFIG_64BIT # ifdef system_has_cmpxchg128 # define system_has_freelist_aba() system_has_cmpxchg128() # define try_cmpxchg_freelist try_cmpxchg128 # endif #define this_cpu_try_cmpxchg_freelist this_cpu_try_cmpxchg128 typedef u128 freelist_full_t; #else /* CONFIG_64BIT */ # ifdef system_has_cmpxchg64 # define system_has_freelist_aba() system_has_cmpxchg64() # define try_cmpxchg_freelist try_cmpxchg64 # endif #define 
this_cpu_try_cmpxchg_freelist this_cpu_try_cmpxchg64 typedef u64 freelist_full_t; #endif /* CONFIG_64BIT */ #if defined(system_has_freelist_aba) && !defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) #undef system_has_freelist_aba #endif /* * Freelist pointer and counter to cmpxchg together, avoids the typical ABA * problems with cmpxchg of just a pointer. */ typedef union { struct { void *freelist; unsigned long counter; }; freelist_full_t full; } freelist_aba_t; /* Reuses the bits in struct page */ struct slab { unsigned long __page_flags; struct kmem_cache *slab_cache; union { struct { union { struct list_head slab_list; #ifdef CONFIG_SLUB_CPU_PARTIAL struct { struct slab *next; int slabs; /* Nr of slabs left */ }; #endif }; /* Double-word boundary */ union { struct { void *freelist; /* first free object */ union { unsigned long counters; struct { unsigned inuse:16; unsigned objects:15; /* * If slab debugging is enabled then the * frozen bit can be reused to indicate * that the slab was corrupted */ unsigned frozen:1; }; }; }; #ifdef system_has_freelist_aba freelist_aba_t freelist_counter; #endif }; }; struct rcu_head rcu_head; }; unsigned int __page_type; atomic_t __page_refcount; #ifdef CONFIG_SLAB_OBJ_EXT unsigned long obj_exts; #endif }; #define SLAB_MATCH(pg, sl) \ static_assert(offsetof(struct page, pg) == offsetof(struct slab, sl)) SLAB_MATCH(flags, __page_flags); SLAB_MATCH(compound_head, slab_cache); /* Ensure bit 0 is clear */ SLAB_MATCH(_refcount, __page_refcount); #ifdef CONFIG_MEMCG SLAB_MATCH(memcg_data, obj_exts); #elif defined(CONFIG_SLAB_OBJ_EXT) SLAB_MATCH(_unused_slab_obj_exts, obj_exts); #endif #undef SLAB_MATCH static_assert(sizeof(struct slab) <= sizeof(struct page)); #if defined(system_has_freelist_aba) static_assert(IS_ALIGNED(offsetof(struct slab, freelist), sizeof(freelist_aba_t))); #endif /** * folio_slab - Converts from folio to slab. * @folio: The folio. * * Currently struct slab is a different representation of a folio where * folio_test_slab() is true. * * Return: The slab which contains this folio. */ #define folio_slab(folio) (_Generic((folio), \ const struct folio *: (const struct slab *)(folio), \ struct folio *: (struct slab *)(folio))) /** * slab_folio - The folio allocated for a slab * @slab: The slab. * * Slabs are allocated as folios that contain the individual objects and are * using some fields in the first struct page of the folio - those fields are * now accessed by struct slab. It is occasionally necessary to convert back to * a folio in order to communicate with the rest of the mm. Please use this * helper function instead of casting yourself, as the implementation may change * in the future. */ #define slab_folio(s) (_Generic((s), \ const struct slab *: (const struct folio *)s, \ struct slab *: (struct folio *)s)) /** * page_slab - Converts from first struct page to slab. * @p: The first (either head of compound or single) page of slab. * * A temporary wrapper to convert struct page to struct slab in situations where * we know the page is the compound head, or single order-0 page. * * Long-term ideally everything would work with struct slab directly or go * through folio to struct slab. * * Return: The slab which contains this page */ #define page_slab(p) (_Generic((p), \ const struct page *: (const struct slab *)(p), \ struct page *: (struct slab *)(p))) /** * slab_page - The first struct page allocated for a slab * @slab: The slab. 
* * A convenience wrapper for converting slab to the first struct page of the * underlying folio, to communicate with code not yet converted to folio or * struct slab. */ #define slab_page(s) folio_page(slab_folio(s), 0) /* * If network-based swap is enabled, sl*b must keep track of whether pages * were allocated from pfmemalloc reserves. */ static inline bool slab_test_pfmemalloc(const struct slab *slab) { return folio_test_active(slab_folio(slab)); } static inline void slab_set_pfmemalloc(struct slab *slab) { folio_set_active(slab_folio(slab)); } static inline void slab_clear_pfmemalloc(struct slab *slab) { folio_clear_active(slab_folio(slab)); } static inline void __slab_clear_pfmemalloc(struct slab *slab) { __folio_clear_active(slab_folio(slab)); } static inline void *slab_address(const struct slab *slab) { return folio_address(slab_folio(slab)); } static inline int slab_nid(const struct slab *slab) { return folio_nid(slab_folio(slab)); } static inline pg_data_t *slab_pgdat(const struct slab *slab) { return folio_pgdat(slab_folio(slab)); } static inline struct slab *virt_to_slab(const void *addr) { struct folio *folio = virt_to_folio(addr); if (!folio_test_slab(folio)) return NULL; return folio_slab(folio); } static inline int slab_order(const struct slab *slab) { return folio_order(slab_folio(slab)); } static inline size_t slab_size(const struct slab *slab) { return PAGE_SIZE << slab_order(slab); } #ifdef CONFIG_SLUB_CPU_PARTIAL #define slub_percpu_partial(c) ((c)->partial) #define slub_set_percpu_partial(c, p) \ ({ \ slub_percpu_partial(c) = (p)->next; \ }) #define slub_percpu_partial_read_once(c) READ_ONCE(slub_percpu_partial(c)) #else #define slub_percpu_partial(c) NULL #define slub_set_percpu_partial(c, p) #define slub_percpu_partial_read_once(c) NULL #endif // CONFIG_SLUB_CPU_PARTIAL /* * Word size structure that can be atomically updated or read and that * contains both the order and the number of objects that a slab of the * given order would contain. */ struct kmem_cache_order_objects { unsigned int x; }; /* * Slab cache management. */ struct kmem_cache { #ifndef CONFIG_SLUB_TINY struct kmem_cache_cpu __percpu *cpu_slab; #endif /* Used for retrieving partial slabs, etc. */ slab_flags_t flags; unsigned long min_partial; unsigned int size; /* Object size including metadata */ unsigned int object_size; /* Object size without metadata */ struct reciprocal_value reciprocal_size; unsigned int offset; /* Free pointer offset */ #ifdef CONFIG_SLUB_CPU_PARTIAL /* Number of per cpu partial objects to keep around */ unsigned int cpu_partial; /* Number of per cpu partial slabs to keep around */ unsigned int cpu_partial_slabs; #endif struct kmem_cache_order_objects oo; /* Allocation and freeing of slabs */ struct kmem_cache_order_objects min; gfp_t allocflags; /* gfp flags to use on each alloc */ int refcount; /* Refcount for slab cache destroy */ void (*ctor)(void *object); /* Object constructor */ unsigned int inuse; /* Offset to metadata */ unsigned int align; /* Alignment */ unsigned int red_left_pad; /* Left redzone padding size */ const char *name; /* Name (only for display!) */ struct list_head list; /* List of slab caches */ #ifdef CONFIG_SYSFS struct kobject kobj; /* For sysfs */ #endif #ifdef CONFIG_SLAB_FREELIST_HARDENED unsigned long random; #endif #ifdef CONFIG_NUMA /* * Defragmentation by allocating from a remote node. 
*/ unsigned int remote_node_defrag_ratio; #endif #ifdef CONFIG_SLAB_FREELIST_RANDOM unsigned int *random_seq; #endif #ifdef CONFIG_KASAN_GENERIC struct kasan_cache kasan_info; #endif #ifdef CONFIG_HARDENED_USERCOPY unsigned int useroffset; /* Usercopy region offset */ unsigned int usersize; /* Usercopy region size */ #endif struct kmem_cache_node *node[MAX_NUMNODES]; }; #if defined(CONFIG_SYSFS) && !defined(CONFIG_SLUB_TINY) #define SLAB_SUPPORTS_SYSFS 1 void sysfs_slab_unlink(struct kmem_cache *s); void sysfs_slab_release(struct kmem_cache *s); #else static inline void sysfs_slab_unlink(struct kmem_cache *s) { } static inline void sysfs_slab_release(struct kmem_cache *s) { } #endif void *fixup_red_left(struct kmem_cache *s, void *p); static inline void *nearest_obj(struct kmem_cache *cache, const struct slab *slab, void *x) { void *object = x - (x - slab_address(slab)) % cache->size; void *last_object = slab_address(slab) + (slab->objects - 1) * cache->size; void *result = (unlikely(object > last_object)) ? last_object : object; result = fixup_red_left(cache, result); return result; } /* Determine object index from a given position */ static inline unsigned int __obj_to_index(const struct kmem_cache *cache, void *addr, void *obj) { return reciprocal_divide(kasan_reset_tag(obj) - addr, cache->reciprocal_size); } static inline unsigned int obj_to_index(const struct kmem_cache *cache, const struct slab *slab, void *obj) { if (is_kfence_address(obj)) return 0; return __obj_to_index(cache, slab_address(slab), obj); } static inline int objs_per_slab(const struct kmem_cache *cache, const struct slab *slab) { return slab->objects; } /* * State of the slab allocator. * * This is used to describe the states of the allocator during bootup. * Allocators use this to gradually bootstrap themselves. Most allocators * have the problem that the structures used for managing slab caches are * allocated from slab caches themselves. */ enum slab_state { DOWN, /* No slab functionality yet */ PARTIAL, /* SLUB: kmem_cache_node available */ UP, /* Slab caches usable but not all extras yet */ FULL /* Everything is working */ }; extern enum slab_state slab_state; /* The slab cache mutex protects the management structures during changes */ extern struct mutex slab_mutex; /* The list of all slab caches on the system */ extern struct list_head slab_caches; /* The slab cache that manages slab cache information */ extern struct kmem_cache *kmem_cache; /* A table of kmalloc cache names and sizes */ extern const struct kmalloc_info_struct { const char *name[NR_KMALLOC_TYPES]; unsigned int size; } kmalloc_info[]; /* Kmalloc array related functions */ void setup_kmalloc_cache_index_table(void); void create_kmalloc_caches(void); extern u8 kmalloc_size_index[24]; static inline unsigned int size_index_elem(unsigned int bytes) { return (bytes - 1) / 8; } /* * Find the kmem_cache structure that serves a given size of * allocation * * This assumes size is larger than zero and not larger than * KMALLOC_MAX_CACHE_SIZE and the caller must check that. 
*/ static inline struct kmem_cache * kmalloc_slab(size_t size, kmem_buckets *b, gfp_t flags, unsigned long caller) { unsigned int index; if (!b) b = &kmalloc_caches[kmalloc_type(flags, caller)]; if (size <= 192) index = kmalloc_size_index[size_index_elem(size)]; else index = fls(size - 1); return (*b)[index]; } gfp_t kmalloc_fix_flags(gfp_t flags); /* Functions provided by the slab allocators */ int do_kmem_cache_create(struct kmem_cache *s, const char *name, unsigned int size, struct kmem_cache_args *args, slab_flags_t flags); void __init kmem_cache_init(void); extern void create_boot_cache(struct kmem_cache *, const char *name, unsigned int size, slab_flags_t flags, unsigned int useroffset, unsigned int usersize); int slab_unmergeable(struct kmem_cache *s); struct kmem_cache *find_mergeable(unsigned size, unsigned align, slab_flags_t flags, const char *name, void (*ctor)(void *)); struct kmem_cache * __kmem_cache_alias(const char *name, unsigned int size, unsigned int align, slab_flags_t flags, void (*ctor)(void *)); slab_flags_t kmem_cache_flags(slab_flags_t flags, const char *name); static inline bool is_kmalloc_cache(struct kmem_cache *s) { return (s->flags & SLAB_KMALLOC); } static inline bool is_kmalloc_normal(struct kmem_cache *s) { if (!is_kmalloc_cache(s)) return false; return !(s->flags & (SLAB_CACHE_DMA|SLAB_ACCOUNT|SLAB_RECLAIM_ACCOUNT)); } /* Legal flag mask for kmem_cache_create(), for various configurations */ #define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | \ SLAB_CACHE_DMA32 | SLAB_PANIC | \ SLAB_TYPESAFE_BY_RCU | SLAB_DEBUG_OBJECTS ) #ifdef CONFIG_SLUB_DEBUG #define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ SLAB_TRACE | SLAB_CONSISTENCY_CHECKS) #else #define SLAB_DEBUG_FLAGS (0) #endif #define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \ SLAB_TEMPORARY | SLAB_ACCOUNT | \ SLAB_NO_USER_FLAGS | SLAB_KMALLOC | SLAB_NO_MERGE) /* Common flags available with current configuration */ #define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS) /* Common flags permitted for kmem_cache_create */ #define SLAB_FLAGS_PERMITTED (SLAB_CORE_FLAGS | \ SLAB_RED_ZONE | \ SLAB_POISON | \ SLAB_STORE_USER | \ SLAB_TRACE | \ SLAB_CONSISTENCY_CHECKS | \ SLAB_NOLEAKTRACE | \ SLAB_RECLAIM_ACCOUNT | \ SLAB_TEMPORARY | \ SLAB_ACCOUNT | \ SLAB_KMALLOC | \ SLAB_NO_MERGE | \ SLAB_NO_USER_FLAGS) bool __kmem_cache_empty(struct kmem_cache *); int __kmem_cache_shutdown(struct kmem_cache *); void __kmem_cache_release(struct kmem_cache *); int __kmem_cache_shrink(struct kmem_cache *); void slab_kmem_cache_release(struct kmem_cache *); struct seq_file; struct file; struct slabinfo { unsigned long active_objs; unsigned long num_objs; unsigned long active_slabs; unsigned long num_slabs; unsigned long shared_avail; unsigned int limit; unsigned int batchcount; unsigned int shared; unsigned int objects_per_slab; unsigned int cache_order; }; void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo); #ifdef CONFIG_SLUB_DEBUG #ifdef CONFIG_SLUB_DEBUG_ON DECLARE_STATIC_KEY_TRUE(slub_debug_enabled); #else DECLARE_STATIC_KEY_FALSE(slub_debug_enabled); #endif extern void print_tracking(struct kmem_cache *s, void *object); long validate_slab_cache(struct kmem_cache *s); static inline bool __slub_debug_enabled(void) { return static_branch_unlikely(&slub_debug_enabled); } #else static inline void print_tracking(struct kmem_cache *s, void *object) { } static inline bool __slub_debug_enabled(void) { return false; } #endif /* * Returns true if 
any of the specified slab_debug flags is enabled for the * cache. Use only for flags parsed by setup_slub_debug() as it also enables * the static key. */ static inline bool kmem_cache_debug_flags(struct kmem_cache *s, slab_flags_t flags) { if (IS_ENABLED(CONFIG_SLUB_DEBUG)) VM_WARN_ON_ONCE(!(flags & SLAB_DEBUG_FLAGS)); if (__slub_debug_enabled()) return s->flags & flags; return false; } #if IS_ENABLED(CONFIG_SLUB_DEBUG) && IS_ENABLED(CONFIG_KUNIT) bool slab_in_kunit_test(void); #else static inline bool slab_in_kunit_test(void) { return false; } #endif #ifdef CONFIG_SLAB_OBJ_EXT /* * slab_obj_exts - get the pointer to the slab object extension vector * associated with a slab. * @slab: a pointer to the slab struct * * Returns a pointer to the object extension vector associated with the slab, * or NULL if no such vector has been associated yet. */ static inline struct slabobj_ext *slab_obj_exts(struct slab *slab) { unsigned long obj_exts = READ_ONCE(slab->obj_exts); #ifdef CONFIG_MEMCG VM_BUG_ON_PAGE(obj_exts && !(obj_exts & MEMCG_DATA_OBJEXTS), slab_page(slab)); VM_BUG_ON_PAGE(obj_exts & MEMCG_DATA_KMEM, slab_page(slab)); #endif return (struct slabobj_ext *)(obj_exts & ~OBJEXTS_FLAGS_MASK); } int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s, gfp_t gfp, bool new_slab); #else /* CONFIG_SLAB_OBJ_EXT */ static inline struct slabobj_ext *slab_obj_exts(struct slab *slab) { return NULL; } #endif /* CONFIG_SLAB_OBJ_EXT */ static inline enum node_stat_item cache_vmstat_idx(struct kmem_cache *s) { return (s->flags & SLAB_RECLAIM_ACCOUNT) ? NR_SLAB_RECLAIMABLE_B : NR_SLAB_UNRECLAIMABLE_B; } #ifdef CONFIG_MEMCG bool __memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru, gfp_t flags, size_t size, void **p); void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, int objects, struct slabobj_ext *obj_exts); #endif size_t __ksize(const void *objp); static inline size_t slab_ksize(const struct kmem_cache *s) { #ifdef CONFIG_SLUB_DEBUG /* * Debugging requires use of the padding between object * and whatever may come after it. */ if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) return s->object_size; #endif if (s->flags & SLAB_KASAN) return s->object_size; /* * If we have the need to store the freelist pointer * back there or track user information then we can * only use the space before that information. 
*/ if (s->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_STORE_USER)) return s->inuse; /* * Else we can use all the padding etc for the allocation */ return s->size; } #ifdef CONFIG_SLUB_DEBUG void dump_unreclaimable_slab(void); #else static inline void dump_unreclaimable_slab(void) { } #endif void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr); #ifdef CONFIG_SLAB_FREELIST_RANDOM int cache_random_seq_create(struct kmem_cache *cachep, unsigned int count, gfp_t gfp); void cache_random_seq_destroy(struct kmem_cache *cachep); #else static inline int cache_random_seq_create(struct kmem_cache *cachep, unsigned int count, gfp_t gfp) { return 0; } static inline void cache_random_seq_destroy(struct kmem_cache *cachep) { } #endif /* CONFIG_SLAB_FREELIST_RANDOM */ static inline bool slab_want_init_on_alloc(gfp_t flags, struct kmem_cache *c) { if (static_branch_maybe(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, &init_on_alloc)) { if (c->ctor) return false; if (c->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) return flags & __GFP_ZERO; return true; } return flags & __GFP_ZERO; } static inline bool slab_want_init_on_free(struct kmem_cache *c) { if (static_branch_maybe(CONFIG_INIT_ON_FREE_DEFAULT_ON, &init_on_free)) return !(c->ctor || (c->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON))); return false; } #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_SLUB_DEBUG) void debugfs_slab_release(struct kmem_cache *); #else static inline void debugfs_slab_release(struct kmem_cache *s) { } #endif #ifdef CONFIG_PRINTK #define KS_ADDRS_COUNT 16 struct kmem_obj_info { void *kp_ptr; struct slab *kp_slab; void *kp_objp; unsigned long kp_data_offset; struct kmem_cache *kp_slab_cache; void *kp_ret; void *kp_stack[KS_ADDRS_COUNT]; void *kp_free_stack[KS_ADDRS_COUNT]; }; void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab); #endif void __check_heap_object(const void *ptr, unsigned long n, const struct slab *slab, bool to_user); static inline bool slub_debug_orig_size(struct kmem_cache *s) { return (kmem_cache_debug_flags(s, SLAB_STORE_USER) && (s->flags & SLAB_KMALLOC)); } #ifdef CONFIG_SLUB_DEBUG void skip_orig_size_check(struct kmem_cache *s, const void *object); #endif #endif /* MM_SLAB_H */
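Editor's note: kmalloc_slab() above picks a cache index in two regimes: a byte-granular table for requests up to 192 bytes, and fls(size - 1), i.e. the next power of two, beyond that. A hypothetical user-space sketch of the second regime:

/* Sketch of the power-of-two bucket selection used by kmalloc_slab(). */
#include <stdio.h>

/* fls(): index of the highest set bit, 1-based; fls(0) == 0. */
static int fls_(unsigned int x)
{
        int r = 0;

        while (x) {
                x >>= 1;
                r++;
        }
        return r;
}

int main(void)
{
        unsigned int sizes[] = { 193, 256, 257, 4096 };

        for (unsigned int i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
                unsigned int size = sizes[i];
                int index = fls_(size - 1);  /* selects the kmalloc-<2^index> cache */

                printf("size %4u -> kmalloc-%u (index %d)\n",
                       size, 1u << index, index);
        }
        return 0;
}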
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * lib/ts_fsm.c	A naive finite state machine text search approach
 *
 * Authors:	Thomas Graf <tgraf@suug.ch>
 *
 * ==========================================================================
 *
 * A finite state machine consists of n states (struct ts_fsm_token)
 * representing the pattern as a finite automaton. The data is read
 * sequentially on an octet basis. Every state token specifies the number
 * of recurrences and the type of value accepted which can be either a
 * specific character or ctype based set of characters. The available
 * type of recurrences include 1, (0|1), [0 n], and [1 n].
 *
 * The algorithm differs between strict/non-strict mode specifying
 * whether the pattern has to start at the first octet. Strict mode
 * is enabled by default and can be disabled by inserting
 * TS_FSM_HEAD_IGNORE as the first token in the chain.
 *
 * The runtime performance of the algorithm should be around O(n),
 * however while in strict mode the average runtime can be better.
*/ #include <linux/module.h> #include <linux/types.h> #include <linux/string.h> #include <linux/ctype.h> #include <linux/textsearch.h> #include <linux/textsearch_fsm.h> struct ts_fsm { unsigned int ntokens; struct ts_fsm_token tokens[]; }; /* other values derived from ctype.h */ #define _A 0x100 /* ascii */ #define _W 0x200 /* wildcard */ /* Map to _ctype flags and some magic numbers */ static const u16 token_map[TS_FSM_TYPE_MAX+1] = { [TS_FSM_SPECIFIC] = 0, [TS_FSM_WILDCARD] = _W, [TS_FSM_CNTRL] = _C, [TS_FSM_LOWER] = _L, [TS_FSM_UPPER] = _U, [TS_FSM_PUNCT] = _P, [TS_FSM_SPACE] = _S, [TS_FSM_DIGIT] = _D, [TS_FSM_XDIGIT] = _D | _X, [TS_FSM_ALPHA] = _U | _L, [TS_FSM_ALNUM] = _U | _L | _D, [TS_FSM_PRINT] = _P | _U | _L | _D | _SP, [TS_FSM_GRAPH] = _P | _U | _L | _D, [TS_FSM_ASCII] = _A, }; static const u16 token_lookup_tbl[256] = { _W|_A|_C, _W|_A|_C, _W|_A|_C, _W|_A|_C, /* 0- 3 */ _W|_A|_C, _W|_A|_C, _W|_A|_C, _W|_A|_C, /* 4- 7 */ _W|_A|_C, _W|_A|_C|_S, _W|_A|_C|_S, _W|_A|_C|_S, /* 8- 11 */ _W|_A|_C|_S, _W|_A|_C|_S, _W|_A|_C, _W|_A|_C, /* 12- 15 */ _W|_A|_C, _W|_A|_C, _W|_A|_C, _W|_A|_C, /* 16- 19 */ _W|_A|_C, _W|_A|_C, _W|_A|_C, _W|_A|_C, /* 20- 23 */ _W|_A|_C, _W|_A|_C, _W|_A|_C, _W|_A|_C, /* 24- 27 */ _W|_A|_C, _W|_A|_C, _W|_A|_C, _W|_A|_C, /* 28- 31 */ _W|_A|_S|_SP, _W|_A|_P, _W|_A|_P, _W|_A|_P, /* 32- 35 */ _W|_A|_P, _W|_A|_P, _W|_A|_P, _W|_A|_P, /* 36- 39 */ _W|_A|_P, _W|_A|_P, _W|_A|_P, _W|_A|_P, /* 40- 43 */ _W|_A|_P, _W|_A|_P, _W|_A|_P, _W|_A|_P, /* 44- 47 */ _W|_A|_D, _W|_A|_D, _W|_A|_D, _W|_A|_D, /* 48- 51 */ _W|_A|_D, _W|_A|_D, _W|_A|_D, _W|_A|_D, /* 52- 55 */ _W|_A|_D, _W|_A|_D, _W|_A|_P, _W|_A|_P, /* 56- 59 */ _W|_A|_P, _W|_A|_P, _W|_A|_P, _W|_A|_P, /* 60- 63 */ _W|_A|_P, _W|_A|_U|_X, _W|_A|_U|_X, _W|_A|_U|_X, /* 64- 67 */ _W|_A|_U|_X, _W|_A|_U|_X, _W|_A|_U|_X, _W|_A|_U, /* 68- 71 */ _W|_A|_U, _W|_A|_U, _W|_A|_U, _W|_A|_U, /* 72- 75 */ _W|_A|_U, _W|_A|_U, _W|_A|_U, _W|_A|_U, /* 76- 79 */ _W|_A|_U, _W|_A|_U, _W|_A|_U, _W|_A|_U, /* 80- 83 */ _W|_A|_U, _W|_A|_U, _W|_A|_U, _W|_A|_U, /* 84- 87 */ _W|_A|_U, _W|_A|_U, _W|_A|_U, _W|_A|_P, /* 88- 91 */ _W|_A|_P, _W|_A|_P, _W|_A|_P, _W|_A|_P, /* 92- 95 */ _W|_A|_P, _W|_A|_L|_X, _W|_A|_L|_X, _W|_A|_L|_X, /* 96- 99 */ _W|_A|_L|_X, _W|_A|_L|_X, _W|_A|_L|_X, _W|_A|_L, /* 100-103 */ _W|_A|_L, _W|_A|_L, _W|_A|_L, _W|_A|_L, /* 104-107 */ _W|_A|_L, _W|_A|_L, _W|_A|_L, _W|_A|_L, /* 108-111 */ _W|_A|_L, _W|_A|_L, _W|_A|_L, _W|_A|_L, /* 112-115 */ _W|_A|_L, _W|_A|_L, _W|_A|_L, _W|_A|_L, /* 116-119 */ _W|_A|_L, _W|_A|_L, _W|_A|_L, _W|_A|_P, /* 120-123 */ _W|_A|_P, _W|_A|_P, _W|_A|_P, _W|_A|_C, /* 124-127 */ _W, _W, _W, _W, /* 128-131 */ _W, _W, _W, _W, /* 132-135 */ _W, _W, _W, _W, /* 136-139 */ _W, _W, _W, _W, /* 140-143 */ _W, _W, _W, _W, /* 144-147 */ _W, _W, _W, _W, /* 148-151 */ _W, _W, _W, _W, /* 152-155 */ _W, _W, _W, _W, /* 156-159 */ _W|_S|_SP, _W|_P, _W|_P, _W|_P, /* 160-163 */ _W|_P, _W|_P, _W|_P, _W|_P, /* 164-167 */ _W|_P, _W|_P, _W|_P, _W|_P, /* 168-171 */ _W|_P, _W|_P, _W|_P, _W|_P, /* 172-175 */ _W|_P, _W|_P, _W|_P, _W|_P, /* 176-179 */ _W|_P, _W|_P, _W|_P, _W|_P, /* 180-183 */ _W|_P, _W|_P, _W|_P, _W|_P, /* 184-187 */ _W|_P, _W|_P, _W|_P, _W|_P, /* 188-191 */ _W|_U, _W|_U, _W|_U, _W|_U, /* 192-195 */ _W|_U, _W|_U, _W|_U, _W|_U, /* 196-199 */ _W|_U, _W|_U, _W|_U, _W|_U, /* 200-203 */ _W|_U, _W|_U, _W|_U, _W|_U, /* 204-207 */ _W|_U, _W|_U, _W|_U, _W|_U, /* 208-211 */ _W|_U, _W|_U, _W|_U, _W|_P, /* 212-215 */ _W|_U, _W|_U, _W|_U, _W|_U, /* 216-219 */ _W|_U, _W|_U, _W|_U, _W|_L, /* 220-223 */ _W|_L, _W|_L, _W|_L, _W|_L, /* 224-227 */ 
_W|_L, _W|_L, _W|_L, _W|_L, /* 228-231 */ _W|_L, _W|_L, _W|_L, _W|_L, /* 232-235 */ _W|_L, _W|_L, _W|_L, _W|_L, /* 236-239 */ _W|_L, _W|_L, _W|_L, _W|_L, /* 240-243 */ _W|_L, _W|_L, _W|_L, _W|_P, /* 244-247 */ _W|_L, _W|_L, _W|_L, _W|_L, /* 248-251 */ _W|_L, _W|_L, _W|_L, _W|_L}; /* 252-255 */ static inline int match_token(struct ts_fsm_token *t, u8 d) { if (t->type) return (token_lookup_tbl[d] & t->type) != 0; else return t->value == d; } static unsigned int fsm_find(struct ts_config *conf, struct ts_state *state) { struct ts_fsm *fsm = ts_config_priv(conf); struct ts_fsm_token *cur = NULL, *next; unsigned int match_start, block_idx = 0, tok_idx; unsigned block_len = 0, strict, consumed = state->offset; const u8 *data; #define GET_NEXT_BLOCK() \ ({ consumed += block_idx; \ block_idx = 0; \ block_len = conf->get_next_block(consumed, &data, conf, state); }) #define TOKEN_MISMATCH() \ do { \ if (strict) \ goto no_match; \ block_idx++; \ goto startover; \ } while(0) #define end_of_data() unlikely(block_idx >= block_len && !GET_NEXT_BLOCK()) if (end_of_data()) goto no_match; strict = fsm->tokens[0].recur != TS_FSM_HEAD_IGNORE; startover: match_start = consumed + block_idx; for (tok_idx = 0; tok_idx < fsm->ntokens; tok_idx++) { cur = &fsm->tokens[tok_idx]; if (likely(tok_idx < (fsm->ntokens - 1))) next = &fsm->tokens[tok_idx + 1]; else next = NULL; switch (cur->recur) { case TS_FSM_SINGLE: if (end_of_data()) goto no_match; if (!match_token(cur, data[block_idx])) TOKEN_MISMATCH(); break; case TS_FSM_PERHAPS: if (end_of_data() || !match_token(cur, data[block_idx])) continue; break; case TS_FSM_MULTI: if (end_of_data()) goto no_match; if (!match_token(cur, data[block_idx])) TOKEN_MISMATCH(); block_idx++; fallthrough; case TS_FSM_ANY: if (next == NULL) goto found_match; if (end_of_data()) continue; while (!match_token(next, data[block_idx])) { if (!match_token(cur, data[block_idx])) TOKEN_MISMATCH(); block_idx++; if (end_of_data()) goto no_match; } continue; /* * Optimization: Prefer small local loop over jumping * back and forth until garbage at head is munched. */ case TS_FSM_HEAD_IGNORE: if (end_of_data()) continue; while (!match_token(next, data[block_idx])) { /* * Special case, don't start over upon * a mismatch, give the user the * chance to specify the type of data * allowed to be ignored. 
*/ if (!match_token(cur, data[block_idx])) goto no_match; block_idx++; if (end_of_data()) goto no_match; } match_start = consumed + block_idx; continue; } block_idx++; } if (end_of_data()) goto found_match; no_match: return UINT_MAX; found_match: state->offset = consumed + block_idx; return match_start; } static struct ts_config *fsm_init(const void *pattern, unsigned int len, gfp_t gfp_mask, int flags) { int i, err = -EINVAL; struct ts_config *conf; struct ts_fsm *fsm; struct ts_fsm_token *tokens = (struct ts_fsm_token *) pattern; unsigned int ntokens = len / sizeof(*tokens); size_t priv_size = sizeof(*fsm) + len; if (len % sizeof(struct ts_fsm_token) || ntokens < 1) goto errout; if (flags & TS_IGNORECASE) goto errout; for (i = 0; i < ntokens; i++) { struct ts_fsm_token *t = &tokens[i]; if (t->type > TS_FSM_TYPE_MAX || t->recur > TS_FSM_RECUR_MAX) goto errout; if (t->recur == TS_FSM_HEAD_IGNORE && (i != 0 || i == (ntokens - 1))) goto errout; } conf = alloc_ts_config(priv_size, gfp_mask); if (IS_ERR(conf)) return conf; conf->flags = flags; fsm = ts_config_priv(conf); fsm->ntokens = ntokens; memcpy(fsm->tokens, pattern, len); for (i = 0; i < fsm->ntokens; i++) { struct ts_fsm_token *t = &fsm->tokens[i]; t->type = token_map[t->type]; } return conf; errout: return ERR_PTR(err); } static void *fsm_get_pattern(struct ts_config *conf) { struct ts_fsm *fsm = ts_config_priv(conf); return fsm->tokens; } static unsigned int fsm_get_pattern_len(struct ts_config *conf) { struct ts_fsm *fsm = ts_config_priv(conf); return fsm->ntokens * sizeof(struct ts_fsm_token); } static struct ts_ops fsm_ops = { .name = "fsm", .find = fsm_find, .init = fsm_init, .get_pattern = fsm_get_pattern, .get_pattern_len = fsm_get_pattern_len, .owner = THIS_MODULE, .list = LIST_HEAD_INIT(fsm_ops.list) }; static int __init init_fsm(void) { return textsearch_register(&fsm_ops); } static void __exit exit_fsm(void) { textsearch_unregister(&fsm_ops); } MODULE_DESCRIPTION("naive finite state machine text search"); MODULE_LICENSE("GPL"); module_init(init_fsm); module_exit(exit_fsm);
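Editor's note: the engine above is configured through the generic textsearch API, with an array of struct ts_fsm_token as its "pattern". The following is a hypothetical in-kernel usage sketch (error handling trimmed) that finds the first run of one or more digits anywhere in a buffer; the token type/recur names come from the code above and from linux/textsearch_fsm.h.

/* Hypothetical in-kernel usage sketch for the "fsm" textsearch engine. */
#include <linux/err.h>
#include <linux/gfp.h>
#include <linux/limits.h>
#include <linux/string.h>
#include <linux/textsearch.h>
#include <linux/textsearch_fsm.h>

static unsigned int find_first_digit_run(const void *buf, unsigned int len)
{
	struct ts_fsm_token tokens[] = {
		/* skip any leading non-matching bytes (non-strict mode) */
		{ .type = TS_FSM_WILDCARD, .recur = TS_FSM_HEAD_IGNORE },
		/* then require one or more digits */
		{ .type = TS_FSM_DIGIT,    .recur = TS_FSM_MULTI },
	};
	struct ts_config *conf;
	struct ts_state state;
	unsigned int pos;

	conf = textsearch_prepare("fsm", tokens, sizeof(tokens),
				  GFP_KERNEL, TS_AUTOLOAD);
	if (IS_ERR(conf))
		return UINT_MAX;

	memset(&state, 0, sizeof(state));
	pos = textsearch_find_continuous(conf, &state, buf, len);
	textsearch_destroy(conf);

	return pos;	/* offset of the first digit, or UINT_MAX if none */
}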
// SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2007-2012 Siemens AG * * Written by: * Dmitry Eremin-Solenikov <dbaryshkov@gmail.com> * Sergey Lapin <slapin@ossfans.org> * Maxim Gorbachyov <maxim.gorbachev@siemens.com> * Alexander Smirnov <alex.bluesman.smirnov@gmail.com> */ #include <linux/netdevice.h> #include <linux/module.h> #include <linux/if_arp.h> #include <linux/ieee802154.h> #include <net/nl802154.h> #include <net/mac802154.h> #include <net/ieee802154_netdev.h> #include <net/cfg802154.h> #include "ieee802154_i.h" #include "driver-ops.h" int mac802154_wpan_update_llsec(struct net_device
*dev) { struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); struct ieee802154_mlme_ops *ops = ieee802154_mlme_ops(dev); struct wpan_dev *wpan_dev = &sdata->wpan_dev; int rc = 0; if (ops->llsec) { struct ieee802154_llsec_params params; int changed = 0; params.pan_id = wpan_dev->pan_id; changed |= IEEE802154_LLSEC_PARAM_PAN_ID; params.hwaddr = wpan_dev->extended_addr; changed |= IEEE802154_LLSEC_PARAM_HWADDR; rc = ops->llsec->set_params(dev, &params, changed); } return rc; } static int mac802154_wpan_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); struct wpan_dev *wpan_dev = &sdata->wpan_dev; struct sockaddr_ieee802154 *sa = (struct sockaddr_ieee802154 *)&ifr->ifr_addr; int err = -ENOIOCTLCMD; if (cmd != SIOCGIFADDR && cmd != SIOCSIFADDR) return err; rtnl_lock(); switch (cmd) { case SIOCGIFADDR: { u16 pan_id, short_addr; pan_id = le16_to_cpu(wpan_dev->pan_id); short_addr = le16_to_cpu(wpan_dev->short_addr); if (pan_id == IEEE802154_PANID_BROADCAST || short_addr == IEEE802154_ADDR_BROADCAST) { err = -EADDRNOTAVAIL; break; } sa->family = AF_IEEE802154; sa->addr.addr_type = IEEE802154_ADDR_SHORT; sa->addr.pan_id = pan_id; sa->addr.short_addr = short_addr; err = 0; break; } case SIOCSIFADDR: if (netif_running(dev)) { rtnl_unlock(); return -EBUSY; } dev_warn(&dev->dev, "Using DEBUGing ioctl SIOCSIFADDR isn't recommended!\n"); if (sa->family != AF_IEEE802154 || sa->addr.addr_type != IEEE802154_ADDR_SHORT || sa->addr.pan_id == IEEE802154_PANID_BROADCAST || sa->addr.short_addr == IEEE802154_ADDR_BROADCAST || sa->addr.short_addr == IEEE802154_ADDR_UNDEF) { err = -EINVAL; break; } wpan_dev->pan_id = cpu_to_le16(sa->addr.pan_id); wpan_dev->short_addr = cpu_to_le16(sa->addr.short_addr); err = mac802154_wpan_update_llsec(dev); break; } rtnl_unlock(); return err; } static int mac802154_wpan_mac_addr(struct net_device *dev, void *p) { struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); struct sockaddr *addr = p; __le64 extended_addr; if (netif_running(dev)) return -EBUSY; /* lowpan need to be down for update * SLAAC address after ifup */ if (sdata->wpan_dev.lowpan_dev) { if (netif_running(sdata->wpan_dev.lowpan_dev)) return -EBUSY; } ieee802154_be64_to_le64(&extended_addr, addr->sa_data); if (!ieee802154_is_valid_extended_unicast_addr(extended_addr)) return -EINVAL; dev_addr_set(dev, addr->sa_data); sdata->wpan_dev.extended_addr = extended_addr; /* update lowpan interface mac address when * wpan mac has been changed */ if (sdata->wpan_dev.lowpan_dev) dev_addr_set(sdata->wpan_dev.lowpan_dev, dev->dev_addr); return mac802154_wpan_update_llsec(dev); } static int ieee802154_setup_hw(struct ieee802154_sub_if_data *sdata) { struct ieee802154_local *local = sdata->local; struct wpan_dev *wpan_dev = &sdata->wpan_dev; int ret; sdata->required_filtering = sdata->iface_default_filtering; if (local->hw.flags & IEEE802154_HW_AFILT) { local->addr_filt.pan_id = wpan_dev->pan_id; local->addr_filt.ieee_addr = wpan_dev->extended_addr; local->addr_filt.short_addr = wpan_dev->short_addr; } if (local->hw.flags & IEEE802154_HW_LBT) { ret = drv_set_lbt_mode(local, wpan_dev->lbt); if (ret < 0) return ret; } if (local->hw.flags & IEEE802154_HW_CSMA_PARAMS) { ret = drv_set_csma_params(local, wpan_dev->min_be, wpan_dev->max_be, wpan_dev->csma_retries); if (ret < 0) return ret; } if (local->hw.flags & IEEE802154_HW_FRAME_RETRIES) { ret = drv_set_max_frame_retries(local, wpan_dev->frame_retries); if (ret < 0) return 
ret; } return 0; } static int mac802154_slave_open(struct net_device *dev) { struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); struct ieee802154_local *local = sdata->local; int res; ASSERT_RTNL(); set_bit(SDATA_STATE_RUNNING, &sdata->state); if (!local->open_count) { res = ieee802154_setup_hw(sdata); if (res) goto err; res = drv_start(local, sdata->required_filtering, &local->addr_filt); if (res) goto err; } local->open_count++; netif_start_queue(dev); return 0; err: /* might already be clear but that doesn't matter */ clear_bit(SDATA_STATE_RUNNING, &sdata->state); return res; } static int ieee802154_check_mac_settings(struct ieee802154_local *local, struct ieee802154_sub_if_data *sdata, struct ieee802154_sub_if_data *nsdata) { struct wpan_dev *nwpan_dev = &nsdata->wpan_dev; struct wpan_dev *wpan_dev = &sdata->wpan_dev; ASSERT_RTNL(); if (sdata->iface_default_filtering != nsdata->iface_default_filtering) return -EBUSY; if (local->hw.flags & IEEE802154_HW_AFILT) { if (wpan_dev->pan_id != nwpan_dev->pan_id || wpan_dev->short_addr != nwpan_dev->short_addr || wpan_dev->extended_addr != nwpan_dev->extended_addr) return -EBUSY; } if (local->hw.flags & IEEE802154_HW_CSMA_PARAMS) { if (wpan_dev->min_be != nwpan_dev->min_be || wpan_dev->max_be != nwpan_dev->max_be || wpan_dev->csma_retries != nwpan_dev->csma_retries) return -EBUSY; } if (local->hw.flags & IEEE802154_HW_FRAME_RETRIES) { if (wpan_dev->frame_retries != nwpan_dev->frame_retries) return -EBUSY; } if (local->hw.flags & IEEE802154_HW_LBT) { if (wpan_dev->lbt != nwpan_dev->lbt) return -EBUSY; } return 0; } static int ieee802154_check_concurrent_iface(struct ieee802154_sub_if_data *sdata, enum nl802154_iftype iftype) { struct ieee802154_local *local = sdata->local; struct ieee802154_sub_if_data *nsdata; /* we hold the RTNL here so can safely walk the list */ list_for_each_entry(nsdata, &local->interfaces, list) { if (nsdata != sdata && ieee802154_sdata_running(nsdata)) { int ret; /* TODO currently we don't support multiple node/coord * types we need to run skb_clone at rx path. Check if * there exist really an use case if we need to support * multiple node/coord types at the same time. */ if (sdata->wpan_dev.iftype != NL802154_IFTYPE_MONITOR && nsdata->wpan_dev.iftype != NL802154_IFTYPE_MONITOR) return -EBUSY; /* check all phy mac sublayer settings are the same. * We have only one phy, different values makes trouble. 
*/ ret = ieee802154_check_mac_settings(local, sdata, nsdata); if (ret < 0) return ret; } } return 0; } static int mac802154_wpan_open(struct net_device *dev) { int rc; struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); struct wpan_dev *wpan_dev = &sdata->wpan_dev; rc = ieee802154_check_concurrent_iface(sdata, wpan_dev->iftype); if (rc < 0) return rc; return mac802154_slave_open(dev); } static int mac802154_slave_close(struct net_device *dev) { struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); struct ieee802154_local *local = sdata->local; ASSERT_RTNL(); if (mac802154_is_scanning(local)) mac802154_abort_scan_locked(local, sdata); if (mac802154_is_beaconing(local)) mac802154_stop_beacons_locked(local, sdata); netif_stop_queue(dev); local->open_count--; clear_bit(SDATA_STATE_RUNNING, &sdata->state); if (!local->open_count) ieee802154_stop_device(local); return 0; } static int mac802154_set_header_security(struct ieee802154_sub_if_data *sdata, struct ieee802154_hdr *hdr, const struct ieee802154_mac_cb *cb) { struct ieee802154_llsec_params params; u8 level; mac802154_llsec_get_params(&sdata->sec, &params); if (!params.enabled && cb->secen_override && cb->secen) return -EINVAL; if (!params.enabled || (cb->secen_override && !cb->secen) || !params.out_level) return 0; if (cb->seclevel_override && !cb->seclevel) return -EINVAL; level = cb->seclevel_override ? cb->seclevel : params.out_level; hdr->fc.security_enabled = 1; hdr->sec.level = level; hdr->sec.key_id_mode = params.out_key.mode; if (params.out_key.mode == IEEE802154_SCF_KEY_SHORT_INDEX) hdr->sec.short_src = params.out_key.short_source; else if (params.out_key.mode == IEEE802154_SCF_KEY_HW_INDEX) hdr->sec.extended_src = params.out_key.extended_source; hdr->sec.key_id = params.out_key.id; return 0; } static int ieee802154_header_create(struct sk_buff *skb, struct net_device *dev, const struct ieee802154_addr *daddr, const struct ieee802154_addr *saddr, unsigned len) { struct ieee802154_hdr hdr; struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); struct wpan_dev *wpan_dev = &sdata->wpan_dev; struct ieee802154_mac_cb *cb = mac_cb(skb); int hlen; if (!daddr) return -EINVAL; memset(&hdr.fc, 0, sizeof(hdr.fc)); hdr.fc.type = cb->type; hdr.fc.security_enabled = cb->secen; hdr.fc.ack_request = cb->ackreq; hdr.seq = atomic_inc_return(&dev->ieee802154_ptr->dsn) & 0xFF; if (mac802154_set_header_security(sdata, &hdr, cb) < 0) return -EINVAL; if (!saddr) { if (wpan_dev->short_addr == cpu_to_le16(IEEE802154_ADDR_BROADCAST) || wpan_dev->short_addr == cpu_to_le16(IEEE802154_ADDR_UNDEF) || wpan_dev->pan_id == cpu_to_le16(IEEE802154_PANID_BROADCAST)) { hdr.source.mode = IEEE802154_ADDR_LONG; hdr.source.extended_addr = wpan_dev->extended_addr; } else { hdr.source.mode = IEEE802154_ADDR_SHORT; hdr.source.short_addr = wpan_dev->short_addr; } hdr.source.pan_id = wpan_dev->pan_id; } else { hdr.source = *(const struct ieee802154_addr *)saddr; } hdr.dest = *(const struct ieee802154_addr *)daddr; hlen = ieee802154_hdr_push(skb, &hdr); if (hlen < 0) return -EINVAL; skb_reset_mac_header(skb); skb->mac_len = hlen; if (len > ieee802154_max_payload(&hdr)) return -EMSGSIZE; return hlen; } static const struct wpan_dev_header_ops ieee802154_header_ops = { .create = ieee802154_header_create, }; /* This header create functionality assumes a 8 byte array for * source and destination pointer at maximum. To adapt this for * the 802.15.4 dataframe header we use extended address handling * here only and intra pan connection. 
fc fields are mostly fallback * handling. For provide dev_hard_header for dgram sockets. */ static int mac802154_header_create(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, unsigned len) { struct ieee802154_hdr hdr; struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); struct wpan_dev *wpan_dev = &sdata->wpan_dev; struct ieee802154_mac_cb cb = { }; int hlen; if (!daddr) return -EINVAL; memset(&hdr.fc, 0, sizeof(hdr.fc)); hdr.fc.type = IEEE802154_FC_TYPE_DATA; hdr.fc.ack_request = wpan_dev->ackreq; hdr.seq = atomic_inc_return(&dev->ieee802154_ptr->dsn) & 0xFF; /* TODO currently a workaround to give zero cb block to set * security parameters defaults according MIB. */ if (mac802154_set_header_security(sdata, &hdr, &cb) < 0) return -EINVAL; hdr.dest.pan_id = wpan_dev->pan_id; hdr.dest.mode = IEEE802154_ADDR_LONG; ieee802154_be64_to_le64(&hdr.dest.extended_addr, daddr); hdr.source.pan_id = hdr.dest.pan_id; hdr.source.mode = IEEE802154_ADDR_LONG; if (!saddr) hdr.source.extended_addr = wpan_dev->extended_addr; else ieee802154_be64_to_le64(&hdr.source.extended_addr, saddr); hlen = ieee802154_hdr_push(skb, &hdr); if (hlen < 0) return -EINVAL; skb_reset_mac_header(skb); skb->mac_len = hlen; if (len > ieee802154_max_payload(&hdr)) return -EMSGSIZE; return hlen; } static int mac802154_header_parse(const struct sk_buff *skb, unsigned char *haddr) { struct ieee802154_hdr hdr; if (ieee802154_hdr_peek_addrs(skb, &hdr) < 0) { pr_debug("malformed packet\n"); return 0; } if (hdr.source.mode == IEEE802154_ADDR_LONG) { ieee802154_le64_to_be64(haddr, &hdr.source.extended_addr); return IEEE802154_EXTENDED_ADDR_LEN; } return 0; } static const struct header_ops mac802154_header_ops = { .create = mac802154_header_create, .parse = mac802154_header_parse, }; static const struct net_device_ops mac802154_wpan_ops = { .ndo_open = mac802154_wpan_open, .ndo_stop = mac802154_slave_close, .ndo_start_xmit = ieee802154_subif_start_xmit, .ndo_do_ioctl = mac802154_wpan_ioctl, .ndo_set_mac_address = mac802154_wpan_mac_addr, }; static const struct net_device_ops mac802154_monitor_ops = { .ndo_open = mac802154_wpan_open, .ndo_stop = mac802154_slave_close, .ndo_start_xmit = ieee802154_monitor_start_xmit, }; static void mac802154_wpan_free(struct net_device *dev) { struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); mac802154_llsec_destroy(&sdata->sec); } static void ieee802154_if_setup(struct net_device *dev) { dev->addr_len = IEEE802154_EXTENDED_ADDR_LEN; memset(dev->broadcast, 0xff, IEEE802154_EXTENDED_ADDR_LEN); /* Let hard_header_len set to IEEE802154_MIN_HEADER_LEN. AF_PACKET * will not send frames without any payload, but ack frames * has no payload, so substract one that we can send a 3 bytes * frame. The xmit callback assumes at least a hard header where two * bytes fc and sequence field are set. */ dev->hard_header_len = IEEE802154_MIN_HEADER_LEN - 1; /* The auth_tag header is for security and places in private payload * room of mac frame which stucks between payload and FCS field. */ dev->needed_tailroom = IEEE802154_MAX_AUTH_TAG_LEN + IEEE802154_FCS_LEN; /* The mtu size is the payload without mac header in this case. * We have a dynamic length header with a minimum header length * which is hard_header_len. In this case we let mtu to the size * of maximum payload which is IEEE802154_MTU - IEEE802154_FCS_LEN - * hard_header_len. 
The FCS which is set by hardware or ndo_start_xmit * and the minimum mac header which can be evaluated inside driver * layer. The rest of mac header will be part of payload if greater * than hard_header_len. */ dev->mtu = IEEE802154_MTU - IEEE802154_FCS_LEN - dev->hard_header_len; dev->tx_queue_len = 300; dev->flags = IFF_NOARP | IFF_BROADCAST; } static int ieee802154_setup_sdata(struct ieee802154_sub_if_data *sdata, enum nl802154_iftype type) { struct wpan_dev *wpan_dev = &sdata->wpan_dev; int ret; u8 tmp; /* set some type-dependent values */ sdata->wpan_dev.iftype = type; get_random_bytes(&tmp, sizeof(tmp)); atomic_set(&wpan_dev->bsn, tmp); get_random_bytes(&tmp, sizeof(tmp)); atomic_set(&wpan_dev->dsn, tmp); /* defaults per 802.15.4-2011 */ wpan_dev->min_be = 3; wpan_dev->max_be = 5; wpan_dev->csma_retries = 4; wpan_dev->frame_retries = 3; wpan_dev->pan_id = cpu_to_le16(IEEE802154_PANID_BROADCAST); wpan_dev->short_addr = cpu_to_le16(IEEE802154_ADDR_BROADCAST); switch (type) { case NL802154_IFTYPE_COORD: case NL802154_IFTYPE_NODE: ieee802154_be64_to_le64(&wpan_dev->extended_addr, sdata->dev->dev_addr); sdata->dev->header_ops = &mac802154_header_ops; sdata->dev->needs_free_netdev = true; sdata->dev->priv_destructor = mac802154_wpan_free; sdata->dev->netdev_ops = &mac802154_wpan_ops; sdata->dev->ml_priv = &mac802154_mlme_wpan; sdata->iface_default_filtering = IEEE802154_FILTERING_4_FRAME_FIELDS; wpan_dev->header_ops = &ieee802154_header_ops; mutex_init(&sdata->sec_mtx); mac802154_llsec_init(&sdata->sec); ret = mac802154_wpan_update_llsec(sdata->dev); if (ret < 0) return ret; break; case NL802154_IFTYPE_MONITOR: sdata->dev->needs_free_netdev = true; sdata->dev->netdev_ops = &mac802154_monitor_ops; sdata->iface_default_filtering = IEEE802154_FILTERING_NONE; break; default: BUG(); } return 0; } struct net_device * ieee802154_if_add(struct ieee802154_local *local, const char *name, unsigned char name_assign_type, enum nl802154_iftype type, __le64 extended_addr) { u8 addr[IEEE802154_EXTENDED_ADDR_LEN]; struct net_device *ndev = NULL; struct ieee802154_sub_if_data *sdata = NULL; int ret; ASSERT_RTNL(); ndev = alloc_netdev(sizeof(*sdata), name, name_assign_type, ieee802154_if_setup); if (!ndev) return ERR_PTR(-ENOMEM); ndev->needed_headroom = local->hw.extra_tx_headroom + IEEE802154_MAX_HEADER_LEN; ret = dev_alloc_name(ndev, ndev->name); if (ret < 0) goto err; ieee802154_le64_to_be64(ndev->perm_addr, &local->hw.phy->perm_extended_addr); switch (type) { case NL802154_IFTYPE_COORD: case NL802154_IFTYPE_NODE: ndev->type = ARPHRD_IEEE802154; if (ieee802154_is_valid_extended_unicast_addr(extended_addr)) { ieee802154_le64_to_be64(addr, &extended_addr); dev_addr_set(ndev, addr); } else { dev_addr_set(ndev, ndev->perm_addr); } break; case NL802154_IFTYPE_MONITOR: ndev->type = ARPHRD_IEEE802154_MONITOR; break; default: ret = -EINVAL; goto err; } /* TODO check this */ SET_NETDEV_DEV(ndev, &local->phy->dev); dev_net_set(ndev, wpan_phy_net(local->hw.phy)); sdata = netdev_priv(ndev); ndev->ieee802154_ptr = &sdata->wpan_dev; memcpy(sdata->name, ndev->name, IFNAMSIZ); sdata->dev = ndev; sdata->wpan_dev.wpan_phy = local->hw.phy; sdata->local = local; INIT_LIST_HEAD(&sdata->wpan_dev.list); /* setup type-dependent data */ ret = ieee802154_setup_sdata(sdata, type); if (ret) goto err; ret = register_netdevice(ndev); if (ret < 0) goto err; mutex_lock(&local->iflist_mtx); list_add_tail_rcu(&sdata->list, &local->interfaces); mutex_unlock(&local->iflist_mtx); return ndev; err: free_netdev(ndev); return ERR_PTR(ret); } 
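/*
 * Editor's note (illustrative helper, not in this file): the header builders
 * above stamp each outgoing frame with atomic_inc_return(&...->dsn) & 0xFF.
 * Because ieee802154_setup_sdata() seeds the counter from get_random_bytes(),
 * data sequence numbers start at a random value and simply wrap modulo 256
 * on the air; the beacon sequence number (bsn) is handled the same way.
 */
static u8 example_next_dsn(struct wpan_dev *wpan_dev)
{
	/* identical to the expression used by the header_ops above */
	return atomic_inc_return(&wpan_dev->dsn) & 0xFF;
}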
void ieee802154_if_remove(struct ieee802154_sub_if_data *sdata) { ASSERT_RTNL(); mutex_lock(&sdata->local->iflist_mtx); list_del_rcu(&sdata->list); mutex_unlock(&sdata->local->iflist_mtx); synchronize_rcu(); unregister_netdevice(sdata->dev); } void ieee802154_remove_interfaces(struct ieee802154_local *local) { struct ieee802154_sub_if_data *sdata, *tmp; mutex_lock(&local->iflist_mtx); list_for_each_entry_safe(sdata, tmp, &local->interfaces, list) { list_del(&sdata->list); unregister_netdevice(sdata->dev); } mutex_unlock(&local->iflist_mtx); } static int netdev_notify(struct notifier_block *nb, unsigned long state, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct ieee802154_sub_if_data *sdata; if (state != NETDEV_CHANGENAME) return NOTIFY_DONE; if (!dev->ieee802154_ptr || !dev->ieee802154_ptr->wpan_phy) return NOTIFY_DONE; if (dev->ieee802154_ptr->wpan_phy->privid != mac802154_wpan_phy_privid) return NOTIFY_DONE; sdata = IEEE802154_DEV_TO_SUB_IF(dev); memcpy(sdata->name, dev->name, IFNAMSIZ); return NOTIFY_OK; } static struct notifier_block mac802154_netdev_notifier = { .notifier_call = netdev_notify, }; int ieee802154_iface_init(void) { return register_netdevice_notifier(&mac802154_netdev_notifier); } void ieee802154_iface_exit(void) { unregister_netdevice_notifier(&mac802154_netdev_notifier); }
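/*
 * Editor's sketch (hypothetical helper, not part of mac802154): one way the
 * dgram-style header_ops above get exercised.  mac802154_header_create()
 * expects an 8-byte big-endian EUI-64 for daddr/saddr and always builds a
 * DATA frame, so the `type` argument is effectively ignored; passing a NULL
 * saddr lets the interface fill in its own extended address.  This is a
 * simplified illustration only; in-tree users go through the socket layer.
 */
#include <linux/if_ether.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>

static int example_send_dgram(struct net_device *dev, const u8 dest_eui64[8],
			      const void *payload, unsigned int len)
{
	struct sk_buff *skb;
	int err;

	skb = alloc_skb(dev->needed_headroom + len + dev->needed_tailroom,
			GFP_KERNEL);
	if (!skb)
		return -ENOMEM;

	skb_reserve(skb, dev->needed_headroom);
	skb_put_data(skb, payload, len);

	err = dev_hard_header(skb, dev, ETH_P_IEEE802154, dest_eui64, NULL, len);
	if (err < 0) {
		kfree_skb(skb);
		return err;
	}

	skb->dev = dev;
	return dev_queue_xmit(skb);
}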
// SPDX-License-Identifier: GPL-2.0-or-later /* * Glue Code for x86_64/AVX2 assembler optimized version of Serpent * * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> */ #include <linux/module.h> #include <linux/types.h> #include <linux/crypto.h> #include <linux/err.h> #include <crypto/algapi.h> #include <crypto/internal/simd.h> #include <crypto/serpent.h> #include "serpent-avx.h" #include "ecb_cbc_helpers.h" #define SERPENT_AVX2_PARALLEL_BLOCKS 16 /* 16-way AVX2 parallel cipher functions */ asmlinkage void serpent_ecb_enc_16way(const void *ctx, u8 *dst, const u8 *src); asmlinkage void serpent_ecb_dec_16way(const void *ctx, u8 *dst, const u8 *src); asmlinkage void serpent_cbc_dec_16way(const void *ctx, u8 *dst, const u8 *src); static int serpent_setkey_skcipher(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) { return __serpent_setkey(crypto_skcipher_ctx(tfm), key, keylen); } static int ecb_encrypt(struct skcipher_request *req) { ECB_WALK_START(req, SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS); ECB_BLOCK(SERPENT_AVX2_PARALLEL_BLOCKS, serpent_ecb_enc_16way); ECB_BLOCK(SERPENT_PARALLEL_BLOCKS, serpent_ecb_enc_8way_avx); ECB_BLOCK(1, __serpent_encrypt); ECB_WALK_END(); } static int ecb_decrypt(struct skcipher_request *req) { ECB_WALK_START(req, SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS); ECB_BLOCK(SERPENT_AVX2_PARALLEL_BLOCKS, serpent_ecb_dec_16way); ECB_BLOCK(SERPENT_PARALLEL_BLOCKS, serpent_ecb_dec_8way_avx); ECB_BLOCK(1, __serpent_decrypt); ECB_WALK_END(); } static int cbc_encrypt(struct skcipher_request *req) { CBC_WALK_START(req, SERPENT_BLOCK_SIZE, -1); CBC_ENC_BLOCK(__serpent_encrypt); CBC_WALK_END(); } static int cbc_decrypt(struct skcipher_request *req) { CBC_WALK_START(req, SERPENT_BLOCK_SIZE, SERPENT_PARALLEL_BLOCKS); CBC_DEC_BLOCK(SERPENT_AVX2_PARALLEL_BLOCKS, serpent_cbc_dec_16way); CBC_DEC_BLOCK(SERPENT_PARALLEL_BLOCKS, serpent_cbc_dec_8way_avx); CBC_DEC_BLOCK(1, __serpent_decrypt); CBC_WALK_END(); } static struct skcipher_alg serpent_algs[] = { { .base.cra_name = "__ecb(serpent)", .base.cra_driver_name = "__ecb-serpent-avx2", .base.cra_priority = 600, .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = SERPENT_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct serpent_ctx), .base.cra_module = THIS_MODULE, .min_keysize = SERPENT_MIN_KEY_SIZE, .max_keysize = SERPENT_MAX_KEY_SIZE, .setkey = serpent_setkey_skcipher, .encrypt = ecb_encrypt, .decrypt = ecb_decrypt, }, { .base.cra_name = "__cbc(serpent)", .base.cra_driver_name = "__cbc-serpent-avx2", .base.cra_priority = 600, .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = SERPENT_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct serpent_ctx), .base.cra_module = THIS_MODULE, .min_keysize = SERPENT_MIN_KEY_SIZE, .max_keysize = SERPENT_MAX_KEY_SIZE, .ivsize = SERPENT_BLOCK_SIZE, .setkey = serpent_setkey_skcipher, .encrypt = cbc_encrypt, .decrypt = cbc_decrypt, }, }; static struct simd_skcipher_alg *serpent_simd_algs[ARRAY_SIZE(serpent_algs)]; static int __init serpent_avx2_init(void) { const char *feature_name; if (!boot_cpu_has(X86_FEATURE_AVX2)
|| !boot_cpu_has(X86_FEATURE_OSXSAVE)) { pr_info("AVX2 instructions are not detected.\n"); return -ENODEV; } if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, &feature_name)) { pr_info("CPU feature '%s' is not supported.\n", feature_name); return -ENODEV; } return simd_register_skciphers_compat(serpent_algs, ARRAY_SIZE(serpent_algs), serpent_simd_algs); } static void __exit serpent_avx2_fini(void) { simd_unregister_skciphers(serpent_algs, ARRAY_SIZE(serpent_algs), serpent_simd_algs); } module_init(serpent_avx2_init); module_exit(serpent_avx2_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Serpent Cipher Algorithm, AVX2 optimized"); MODULE_ALIAS_CRYPTO("serpent"); MODULE_ALIAS_CRYPTO("serpent-asm");
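/*
 * Editor's sketch (hypothetical helper): the CRYPTO_ALG_INTERNAL algorithms
 * above are wrapped by simd_register_skciphers_compat() and become reachable
 * under the generic names "ecb(serpent)" / "cbc(serpent)".  A kernel user
 * would drive them through the skcipher API roughly as below; buflen must be
 * a multiple of SERPENT_BLOCK_SIZE (16), the buffer must not live on the
 * stack because it is mapped through a scatterlist, and crypto_wait_req()
 * handles asynchronous completion of the simd (cryptd-backed) transform.
 */
#include <crypto/skcipher.h>
#include <linux/crypto.h>
#include <linux/err.h>
#include <linux/scatterlist.h>

static int example_ecb_serpent(const u8 *key, unsigned int keylen,
			       u8 *buf, unsigned int buflen)
{
	struct crypto_skcipher *tfm;
	struct skcipher_request *req;
	DECLARE_CRYPTO_WAIT(wait);
	struct scatterlist sg;
	int err;

	tfm = crypto_alloc_skcipher("ecb(serpent)", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	err = crypto_skcipher_setkey(tfm, key, keylen);
	if (err)
		goto out_free_tfm;

	req = skcipher_request_alloc(tfm, GFP_KERNEL);
	if (!req) {
		err = -ENOMEM;
		goto out_free_tfm;
	}

	sg_init_one(&sg, buf, buflen);
	skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG |
				      CRYPTO_TFM_REQ_MAY_SLEEP,
				      crypto_req_done, &wait);
	skcipher_request_set_crypt(req, &sg, &sg, buflen, NULL);

	err = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);

	skcipher_request_free(req);
out_free_tfm:
	crypto_free_skcipher(tfm);
	return err;
}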
// SPDX-License-Identifier: GPL-2.0-only #include "netlink.h" #include "common.h" #include "bitset.h" struct wol_req_info { struct ethnl_req_info base; }; struct wol_reply_data { struct ethnl_reply_data base; struct ethtool_wolinfo wol; bool show_sopass; }; #define WOL_REPDATA(__reply_base) \ container_of(__reply_base, struct wol_reply_data, base) const struct nla_policy ethnl_wol_get_policy[] = { [ETHTOOL_A_WOL_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), }; static int wol_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, const struct genl_info *info) { struct wol_reply_data *data = WOL_REPDATA(reply_base); struct net_device *dev = reply_base->dev; int ret; if (!dev->ethtool_ops->get_wol) return -EOPNOTSUPP; ret = ethnl_ops_begin(dev); if (ret < 0) return ret; dev->ethtool_ops->get_wol(dev, &data->wol); ethnl_ops_complete(dev); /* do not include password in notifications */ data->show_sopass = !genl_info_is_ntf(info) && (data->wol.supported & WAKE_MAGICSECURE); return 0; } static int wol_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; const struct wol_reply_data *data = WOL_REPDATA(reply_base); int len; len = ethnl_bitset32_size(&data->wol.wolopts, &data->wol.supported, WOL_MODE_COUNT, wol_mode_names, compact); if (len < 0) return len; if (data->show_sopass) len += nla_total_size(sizeof(data->wol.sopass)); return len; } static int wol_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; const struct wol_reply_data *data = WOL_REPDATA(reply_base); int ret; ret = ethnl_put_bitset32(skb, ETHTOOL_A_WOL_MODES, &data->wol.wolopts, &data->wol.supported, WOL_MODE_COUNT, wol_mode_names, compact); if (ret < 0) return ret; if (data->show_sopass && nla_put(skb, ETHTOOL_A_WOL_SOPASS, sizeof(data->wol.sopass), data->wol.sopass)) return -EMSGSIZE; return 0; } /* WOL_SET */ const struct nla_policy ethnl_wol_set_policy[] = { [ETHTOOL_A_WOL_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), [ETHTOOL_A_WOL_MODES] = { .type = NLA_NESTED }, [ETHTOOL_A_WOL_SOPASS] = { .type = NLA_BINARY, .len = SOPASS_MAX }, }; static int ethnl_set_wol_validate(struct ethnl_req_info *req_info, struct genl_info *info) { const struct ethtool_ops *ops = req_info->dev->ethtool_ops; return ops->get_wol && ops->set_wol ?
1 : -EOPNOTSUPP; } static int ethnl_set_wol(struct ethnl_req_info *req_info, struct genl_info *info) { struct ethtool_wolinfo wol = { .cmd = ETHTOOL_GWOL }; struct net_device *dev = req_info->dev; struct nlattr **tb = info->attrs; bool mod = false; int ret; dev->ethtool_ops->get_wol(dev, &wol); ret = ethnl_update_bitset32(&wol.wolopts, WOL_MODE_COUNT, tb[ETHTOOL_A_WOL_MODES], wol_mode_names, info->extack, &mod); if (ret < 0) return ret; if (wol.wolopts & ~wol.supported) { NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_WOL_MODES], "cannot enable unsupported WoL mode"); return -EINVAL; } if (tb[ETHTOOL_A_WOL_SOPASS]) { if (!(wol.supported & WAKE_MAGICSECURE)) { NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_WOL_SOPASS], "magicsecure not supported, cannot set password"); return -EINVAL; } ethnl_update_binary(wol.sopass, sizeof(wol.sopass), tb[ETHTOOL_A_WOL_SOPASS], &mod); } if (!mod) return 0; ret = dev->ethtool_ops->set_wol(dev, &wol); if (ret) return ret; dev->ethtool->wol_enabled = !!wol.wolopts; return 1; } const struct ethnl_request_ops ethnl_wol_request_ops = { .request_cmd = ETHTOOL_MSG_WOL_GET, .reply_cmd = ETHTOOL_MSG_WOL_GET_REPLY, .hdr_attr = ETHTOOL_A_WOL_HEADER, .req_info_size = sizeof(struct wol_req_info), .reply_data_size = sizeof(struct wol_reply_data), .prepare_data = wol_prepare_data, .reply_size = wol_reply_size, .fill_reply = wol_fill_reply, .set_validate = ethnl_set_wol_validate, .set = ethnl_set_wol, .set_ntf_cmd = ETHTOOL_MSG_WOL_NTF, };
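/*
 * Editor's sketch (hypothetical driver code, "mydrv" names are illustrative):
 * the ethnl code above only forwards to a driver's ethtool_ops.  A driver
 * that supports magic-packet wake-up, with or without a SecureOn password,
 * would implement roughly the pair below; note that ethnl_set_wol() calls
 * ->set_wol() only when something actually changed and then records
 * wol_enabled for the power-management core.
 */
#include <linux/ethtool.h>
#include <linux/netdevice.h>
#include <linux/string.h>

struct mydrv_priv {
	u32 wolopts;
	u8 sopass[SOPASS_MAX];
};

static void mydrv_get_wol(struct net_device *dev, struct ethtool_wolinfo *wol)
{
	struct mydrv_priv *priv = netdev_priv(dev);

	wol->supported = WAKE_MAGIC | WAKE_MAGICSECURE;
	wol->wolopts = priv->wolopts;
	memcpy(wol->sopass, priv->sopass, sizeof(wol->sopass));
}

static int mydrv_set_wol(struct net_device *dev, struct ethtool_wolinfo *wol)
{
	struct mydrv_priv *priv = netdev_priv(dev);

	if (wol->wolopts & ~(WAKE_MAGIC | WAKE_MAGICSECURE))
		return -EOPNOTSUPP;

	priv->wolopts = wol->wolopts;
	memcpy(priv->sopass, wol->sopass, sizeof(priv->sopass));
	/* a real driver would program the MAC/PHY wake-up filters here */
	return 0;
}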
// SPDX-License-Identifier: GPL-2.0-only /* * This is a module which is used for setting the MSS option in TCP packets. * * Copyright (C) 2000 Marc Boucher <marc@mbsi.ca> * Copyright (C) 2007 Patrick McHardy <kaber@trash.net> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/skbuff.h> #include <linux/ip.h> #include <linux/gfp.h> #include <linux/ipv6.h> #include <linux/tcp.h> #include <net/dst.h> #include <net/flow.h> #include <net/ipv6.h> #include <net/route.h> #include <net/tcp.h> #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netfilter_ipv6/ip6_tables.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_tcpudp.h> #include <linux/netfilter/xt_TCPMSS.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>"); MODULE_DESCRIPTION("Xtables: TCP Maximum Segment Size (MSS) adjustment"); MODULE_ALIAS("ipt_TCPMSS"); MODULE_ALIAS("ip6t_TCPMSS"); static inline unsigned int optlen(const u_int8_t *opt, unsigned int offset) { /* Beware zero-length options: make finite progress */ if (opt[offset] <= TCPOPT_NOP || opt[offset+1] == 0) return 1; else return opt[offset+1]; } static u_int32_t tcpmss_reverse_mtu(struct net *net, const struct sk_buff *skb, unsigned int family) { struct flowi fl; struct rtable *rt = NULL; u_int32_t mtu = ~0U; if (family == PF_INET) { struct flowi4 *fl4 = &fl.u.ip4; memset(fl4, 0, sizeof(*fl4)); fl4->daddr = ip_hdr(skb)->saddr; } else { struct flowi6 *fl6 = &fl.u.ip6; memset(fl6, 0, sizeof(*fl6)); fl6->daddr = ipv6_hdr(skb)->saddr; } nf_route(net, (struct dst_entry **)&rt, &fl, false, family); if (rt != NULL) { mtu = dst_mtu(&rt->dst); dst_release(&rt->dst); } return mtu; } static int tcpmss_mangle_packet(struct sk_buff *skb, const struct xt_action_param *par, unsigned int family, unsigned int tcphoff, unsigned int minlen) { const struct xt_tcpmss_info *info = par->targinfo; struct tcphdr *tcph; int len, tcp_hdrlen; unsigned int i; __be16 oldval; u16 newmss; u8 *opt; /* This is a fragment, no TCP header is available */ if (par->fragoff != 0) return 0; if (skb_ensure_writable(skb, skb->len)) return -1; len = skb->len - tcphoff; if (len < (int)sizeof(struct tcphdr)) return -1; tcph = (struct tcphdr *)(skb_network_header(skb)
+ tcphoff); tcp_hdrlen = tcph->doff * 4; if (len < tcp_hdrlen || tcp_hdrlen < sizeof(struct tcphdr)) return -1; if (info->mss == XT_TCPMSS_CLAMP_PMTU) { struct net *net = xt_net(par); unsigned int in_mtu = tcpmss_reverse_mtu(net, skb, family); unsigned int min_mtu = min(dst_mtu(skb_dst(skb)), in_mtu); if (min_mtu <= minlen) { net_err_ratelimited("unknown or invalid path-MTU (%u)\n", min_mtu); return -1; } newmss = min_mtu - minlen; } else newmss = info->mss; opt = (u_int8_t *)tcph; for (i = sizeof(struct tcphdr); i <= tcp_hdrlen - TCPOLEN_MSS; i += optlen(opt, i)) { if (opt[i] == TCPOPT_MSS && opt[i+1] == TCPOLEN_MSS) { u_int16_t oldmss; oldmss = (opt[i+2] << 8) | opt[i+3]; /* Never increase MSS, even when setting it, as * doing so results in problems for hosts that rely * on MSS being set correctly. */ if (oldmss <= newmss) return 0; opt[i+2] = (newmss & 0xff00) >> 8; opt[i+3] = newmss & 0x00ff; inet_proto_csum_replace2(&tcph->check, skb, htons(oldmss), htons(newmss), false); return 0; } } /* There is data after the header so the option can't be added * without moving it, and doing so may make the SYN packet * itself too large. Accept the packet unmodified instead. */ if (len > tcp_hdrlen) return 0; /* tcph->doff has 4 bits, do not wrap it to 0 */ if (tcp_hdrlen >= 15 * 4) return 0; /* * MSS Option not found ?! add it.. */ if (skb_tailroom(skb) < TCPOLEN_MSS) { if (pskb_expand_head(skb, 0, TCPOLEN_MSS - skb_tailroom(skb), GFP_ATOMIC)) return -1; tcph = (struct tcphdr *)(skb_network_header(skb) + tcphoff); } skb_put(skb, TCPOLEN_MSS); /* * IPv4: RFC 1122 states "If an MSS option is not received at * connection setup, TCP MUST assume a default send MSS of 536". * IPv6: RFC 2460 states IPv6 has a minimum MTU of 1280 and a minimum * length IPv6 header of 60, ergo the default MSS value is 1220 * Since no MSS was provided, we must use the default values */ if (xt_family(par) == NFPROTO_IPV4) newmss = min(newmss, (u16)536); else newmss = min(newmss, (u16)1220); opt = (u_int8_t *)tcph + sizeof(struct tcphdr); memmove(opt + TCPOLEN_MSS, opt, len - sizeof(struct tcphdr)); inet_proto_csum_replace2(&tcph->check, skb, htons(len), htons(len + TCPOLEN_MSS), true); opt[0] = TCPOPT_MSS; opt[1] = TCPOLEN_MSS; opt[2] = (newmss & 0xff00) >> 8; opt[3] = newmss & 0x00ff; inet_proto_csum_replace4(&tcph->check, skb, 0, *((__be32 *)opt), false); oldval = ((__be16 *)tcph)[6]; tcph->doff += TCPOLEN_MSS/4; inet_proto_csum_replace2(&tcph->check, skb, oldval, ((__be16 *)tcph)[6], false); return TCPOLEN_MSS; } static unsigned int tcpmss_tg4(struct sk_buff *skb, const struct xt_action_param *par) { struct iphdr *iph = ip_hdr(skb); __be16 newlen; int ret; ret = tcpmss_mangle_packet(skb, par, PF_INET, iph->ihl * 4, sizeof(*iph) + sizeof(struct tcphdr)); if (ret < 0) return NF_DROP; if (ret > 0) { iph = ip_hdr(skb); newlen = htons(ntohs(iph->tot_len) + ret); csum_replace2(&iph->check, iph->tot_len, newlen); iph->tot_len = newlen; } return XT_CONTINUE; } #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) static unsigned int tcpmss_tg6(struct sk_buff *skb, const struct xt_action_param *par) { struct ipv6hdr *ipv6h = ipv6_hdr(skb); u8 nexthdr; __be16 frag_off, oldlen, newlen; int tcphoff; int ret; nexthdr = ipv6h->nexthdr; tcphoff = ipv6_skip_exthdr(skb, sizeof(*ipv6h), &nexthdr, &frag_off); if (tcphoff < 0) return NF_DROP; ret = tcpmss_mangle_packet(skb, par, PF_INET6, tcphoff, sizeof(*ipv6h) + sizeof(struct tcphdr)); if (ret < 0) return NF_DROP; if (ret > 0) { ipv6h = ipv6_hdr(skb); oldlen = ipv6h->payload_len; newlen = 
htons(ntohs(oldlen) + ret); if (skb->ip_summed == CHECKSUM_COMPLETE) skb->csum = csum_add(csum_sub(skb->csum, (__force __wsum)oldlen), (__force __wsum)newlen); ipv6h->payload_len = newlen; } return XT_CONTINUE; } #endif /* Must specify -p tcp --syn */ static inline bool find_syn_match(const struct xt_entry_match *m) { const struct xt_tcp *tcpinfo = (const struct xt_tcp *)m->data; if (strcmp(m->u.kernel.match->name, "tcp") == 0 && tcpinfo->flg_cmp & TCPHDR_SYN && !(tcpinfo->invflags & XT_TCP_INV_FLAGS)) return true; return false; } static int tcpmss_tg4_check(const struct xt_tgchk_param *par) { const struct xt_tcpmss_info *info = par->targinfo; const struct ipt_entry *e = par->entryinfo; const struct xt_entry_match *ematch; if (info->mss == XT_TCPMSS_CLAMP_PMTU && (par->hook_mask & ~((1 << NF_INET_FORWARD) | (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_POST_ROUTING))) != 0) { pr_info_ratelimited("path-MTU clamping only supported in FORWARD, OUTPUT and POSTROUTING hooks\n"); return -EINVAL; } if (par->nft_compat) return 0; xt_ematch_foreach(ematch, e) if (find_syn_match(ematch)) return 0; pr_info_ratelimited("Only works on TCP SYN packets\n"); return -EINVAL; } #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) static int tcpmss_tg6_check(const struct xt_tgchk_param *par) { const struct xt_tcpmss_info *info = par->targinfo; const struct ip6t_entry *e = par->entryinfo; const struct xt_entry_match *ematch; if (info->mss == XT_TCPMSS_CLAMP_PMTU && (par->hook_mask & ~((1 << NF_INET_FORWARD) | (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_POST_ROUTING))) != 0) { pr_info_ratelimited("path-MTU clamping only supported in FORWARD, OUTPUT and POSTROUTING hooks\n"); return -EINVAL; } if (par->nft_compat) return 0; xt_ematch_foreach(ematch, e) if (find_syn_match(ematch)) return 0; pr_info_ratelimited("Only works on TCP SYN packets\n"); return -EINVAL; } #endif static struct xt_target tcpmss_tg_reg[] __read_mostly = { { .family = NFPROTO_IPV4, .name = "TCPMSS", .checkentry = tcpmss_tg4_check, .target = tcpmss_tg4, .targetsize = sizeof(struct xt_tcpmss_info), .proto = IPPROTO_TCP, .me = THIS_MODULE, }, #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) { .family = NFPROTO_IPV6, .name = "TCPMSS", .checkentry = tcpmss_tg6_check, .target = tcpmss_tg6, .targetsize = sizeof(struct xt_tcpmss_info), .proto = IPPROTO_TCP, .me = THIS_MODULE, }, #endif }; static int __init tcpmss_tg_init(void) { return xt_register_targets(tcpmss_tg_reg, ARRAY_SIZE(tcpmss_tg_reg)); } static void __exit tcpmss_tg_exit(void) { xt_unregister_targets(tcpmss_tg_reg, ARRAY_SIZE(tcpmss_tg_reg)); } module_init(tcpmss_tg_init); module_exit(tcpmss_tg_exit);
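/*
 * Editor's sketch (hypothetical helper): with --clamp-mss-to-pmtu,
 * tcpmss_mangle_packet() above rewrites the SYN's MSS option to the smaller
 * of the forward and reverse path MTUs minus the minimal header overhead
 * passed in as `minlen` (IPv4: 20 + 20 = 40, IPv6: 40 + 20 = 60), and it
 * never raises an MSS the peer already advertised lower.  For a 1492-byte
 * PPPoE uplink in front of a 1500-byte LAN this gives 1492 - 40 = 1452
 * for IPv4.
 */
#include <linux/minmax.h>

static unsigned int example_clamped_mss(unsigned int fwd_mtu,
					unsigned int rev_mtu,
					unsigned int minlen)
{
	unsigned int min_mtu = min(fwd_mtu, rev_mtu);

	/* tcpmss_mangle_packet() treats min_mtu <= minlen as invalid */
	return min_mtu > minlen ? min_mtu - minlen : 0;
}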
// SPDX-License-Identifier: GPL-2.0 /* * Wireless utility functions * * Copyright 2007-2009 Johannes Berg
<johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2017 Intel Deutschland GmbH * Copyright (C) 2018-2023 Intel Corporation */ #include <linux/export.h> #include <linux/bitops.h> #include <linux/etherdevice.h> #include <linux/slab.h> #include <linux/ieee80211.h> #include <net/cfg80211.h> #include <net/ip.h> #include <net/dsfield.h> #include <linux/if_vlan.h> #include <linux/mpls.h> #include <linux/gcd.h> #include <linux/bitfield.h> #include <linux/nospec.h> #include "core.h" #include "rdev-ops.h" const struct ieee80211_rate * ieee80211_get_response_rate(struct ieee80211_supported_band *sband, u32 basic_rates, int bitrate) { struct ieee80211_rate *result = &sband->bitrates[0]; int i; for (i = 0; i < sband->n_bitrates; i++) { if (!(basic_rates & BIT(i))) continue; if (sband->bitrates[i].bitrate > bitrate) continue; result = &sband->bitrates[i]; } return result; } EXPORT_SYMBOL(ieee80211_get_response_rate); u32 ieee80211_mandatory_rates(struct ieee80211_supported_band *sband) { struct ieee80211_rate *bitrates; u32 mandatory_rates = 0; enum ieee80211_rate_flags mandatory_flag; int i; if (WARN_ON(!sband)) return 1; if (sband->band == NL80211_BAND_2GHZ) mandatory_flag = IEEE80211_RATE_MANDATORY_B; else mandatory_flag = IEEE80211_RATE_MANDATORY_A; bitrates = sband->bitrates; for (i = 0; i < sband->n_bitrates; i++) if (bitrates[i].flags & mandatory_flag) mandatory_rates |= BIT(i); return mandatory_rates; } EXPORT_SYMBOL(ieee80211_mandatory_rates); u32 ieee80211_channel_to_freq_khz(int chan, enum nl80211_band band) { /* see 802.11 17.3.8.3.2 and Annex J * there are overlapping channel numbers in 5GHz and 2GHz bands */ if (chan <= 0) return 0; /* not supported */ switch (band) { case NL80211_BAND_2GHZ: case NL80211_BAND_LC: if (chan == 14) return MHZ_TO_KHZ(2484); else if (chan < 14) return MHZ_TO_KHZ(2407 + chan * 5); break; case NL80211_BAND_5GHZ: if (chan >= 182 && chan <= 196) return MHZ_TO_KHZ(4000 + chan * 5); else return MHZ_TO_KHZ(5000 + chan * 5); break; case NL80211_BAND_6GHZ: /* see 802.11ax D6.1 27.3.23.2 */ if (chan == 2) return MHZ_TO_KHZ(5935); if (chan <= 233) return MHZ_TO_KHZ(5950 + chan * 5); break; case NL80211_BAND_60GHZ: if (chan < 7) return MHZ_TO_KHZ(56160 + chan * 2160); break; case NL80211_BAND_S1GHZ: return 902000 + chan * 500; default: ; } return 0; /* not supported */ } EXPORT_SYMBOL(ieee80211_channel_to_freq_khz); enum nl80211_chan_width ieee80211_s1g_channel_width(const struct ieee80211_channel *chan) { if (WARN_ON(!chan || chan->band != NL80211_BAND_S1GHZ)) return NL80211_CHAN_WIDTH_20_NOHT; /*S1G defines a single allowed channel width per channel. * Extract that width here. 
*/ if (chan->flags & IEEE80211_CHAN_1MHZ) return NL80211_CHAN_WIDTH_1; else if (chan->flags & IEEE80211_CHAN_2MHZ) return NL80211_CHAN_WIDTH_2; else if (chan->flags & IEEE80211_CHAN_4MHZ) return NL80211_CHAN_WIDTH_4; else if (chan->flags & IEEE80211_CHAN_8MHZ) return NL80211_CHAN_WIDTH_8; else if (chan->flags & IEEE80211_CHAN_16MHZ) return NL80211_CHAN_WIDTH_16; pr_err("unknown channel width for channel at %dKHz?\n", ieee80211_channel_to_khz(chan)); return NL80211_CHAN_WIDTH_1; } EXPORT_SYMBOL(ieee80211_s1g_channel_width); int ieee80211_freq_khz_to_channel(u32 freq) { /* TODO: just handle MHz for now */ freq = KHZ_TO_MHZ(freq); /* see 802.11 17.3.8.3.2 and Annex J */ if (freq == 2484) return 14; else if (freq < 2484) return (freq - 2407) / 5; else if (freq >= 4910 && freq <= 4980) return (freq - 4000) / 5; else if (freq < 5925) return (freq - 5000) / 5; else if (freq == 5935) return 2; else if (freq <= 45000) /* DMG band lower limit */ /* see 802.11ax D6.1 27.3.22.2 */ return (freq - 5950) / 5; else if (freq >= 58320 && freq <= 70200) return (freq - 56160) / 2160; else return 0; } EXPORT_SYMBOL(ieee80211_freq_khz_to_channel); struct ieee80211_channel *ieee80211_get_channel_khz(struct wiphy *wiphy, u32 freq) { enum nl80211_band band; struct ieee80211_supported_band *sband; int i; for (band = 0; band < NUM_NL80211_BANDS; band++) { sband = wiphy->bands[band]; if (!sband) continue; for (i = 0; i < sband->n_channels; i++) { struct ieee80211_channel *chan = &sband->channels[i]; if (ieee80211_channel_to_khz(chan) == freq) return chan; } } return NULL; } EXPORT_SYMBOL(ieee80211_get_channel_khz); static void set_mandatory_flags_band(struct ieee80211_supported_band *sband) { int i, want; switch (sband->band) { case NL80211_BAND_5GHZ: case NL80211_BAND_6GHZ: want = 3; for (i = 0; i < sband->n_bitrates; i++) { if (sband->bitrates[i].bitrate == 60 || sband->bitrates[i].bitrate == 120 || sband->bitrates[i].bitrate == 240) { sband->bitrates[i].flags |= IEEE80211_RATE_MANDATORY_A; want--; } } WARN_ON(want); break; case NL80211_BAND_2GHZ: case NL80211_BAND_LC: want = 7; for (i = 0; i < sband->n_bitrates; i++) { switch (sband->bitrates[i].bitrate) { case 10: case 20: case 55: case 110: sband->bitrates[i].flags |= IEEE80211_RATE_MANDATORY_B | IEEE80211_RATE_MANDATORY_G; want--; break; case 60: case 120: case 240: sband->bitrates[i].flags |= IEEE80211_RATE_MANDATORY_G; want--; fallthrough; default: sband->bitrates[i].flags |= IEEE80211_RATE_ERP_G; break; } } WARN_ON(want != 0 && want != 3); break; case NL80211_BAND_60GHZ: /* check for mandatory HT MCS 1..4 */ WARN_ON(!sband->ht_cap.ht_supported); WARN_ON((sband->ht_cap.mcs.rx_mask[0] & 0x1e) != 0x1e); break; case NL80211_BAND_S1GHZ: /* Figure 9-589bd: 3 means unsupported, so != 3 means at least * mandatory is ok. 
*/ WARN_ON((sband->s1g_cap.nss_mcs[0] & 0x3) == 0x3); break; case NUM_NL80211_BANDS: default: WARN_ON(1); break; } } void ieee80211_set_bitrate_flags(struct wiphy *wiphy) { enum nl80211_band band; for (band = 0; band < NUM_NL80211_BANDS; band++) if (wiphy->bands[band]) set_mandatory_flags_band(wiphy->bands[band]); } bool cfg80211_supported_cipher_suite(struct wiphy *wiphy, u32 cipher) { int i; for (i = 0; i < wiphy->n_cipher_suites; i++) if (cipher == wiphy->cipher_suites[i]) return true; return false; } static bool cfg80211_igtk_cipher_supported(struct cfg80211_registered_device *rdev) { struct wiphy *wiphy = &rdev->wiphy; int i; for (i = 0; i < wiphy->n_cipher_suites; i++) { switch (wiphy->cipher_suites[i]) { case WLAN_CIPHER_SUITE_AES_CMAC: case WLAN_CIPHER_SUITE_BIP_CMAC_256: case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: return true; } } return false; } bool cfg80211_valid_key_idx(struct cfg80211_registered_device *rdev, int key_idx, bool pairwise) { int max_key_idx; if (pairwise) max_key_idx = 3; else if (wiphy_ext_feature_isset(&rdev->wiphy, NL80211_EXT_FEATURE_BEACON_PROTECTION) || wiphy_ext_feature_isset(&rdev->wiphy, NL80211_EXT_FEATURE_BEACON_PROTECTION_CLIENT)) max_key_idx = 7; else if (cfg80211_igtk_cipher_supported(rdev)) max_key_idx = 5; else max_key_idx = 3; if (key_idx < 0 || key_idx > max_key_idx) return false; return true; } int cfg80211_validate_key_settings(struct cfg80211_registered_device *rdev, struct key_params *params, int key_idx, bool pairwise, const u8 *mac_addr) { if (!cfg80211_valid_key_idx(rdev, key_idx, pairwise)) return -EINVAL; if (!pairwise && mac_addr && !(rdev->wiphy.flags & WIPHY_FLAG_IBSS_RSN)) return -EINVAL; if (pairwise && !mac_addr) return -EINVAL; switch (params->cipher) { case WLAN_CIPHER_SUITE_TKIP: /* Extended Key ID can only be used with CCMP/GCMP ciphers */ if ((pairwise && key_idx) || params->mode != NL80211_KEY_RX_TX) return -EINVAL; break; case WLAN_CIPHER_SUITE_CCMP: case WLAN_CIPHER_SUITE_CCMP_256: case WLAN_CIPHER_SUITE_GCMP: case WLAN_CIPHER_SUITE_GCMP_256: /* IEEE802.11-2016 allows only 0 and - when supporting * Extended Key ID - 1 as index for pairwise keys. * @NL80211_KEY_NO_TX is only allowed for pairwise keys when * the driver supports Extended Key ID. * @NL80211_KEY_SET_TX can't be set when installing and * validating a key. 
*/ if ((params->mode == NL80211_KEY_NO_TX && !pairwise) || params->mode == NL80211_KEY_SET_TX) return -EINVAL; if (wiphy_ext_feature_isset(&rdev->wiphy, NL80211_EXT_FEATURE_EXT_KEY_ID)) { if (pairwise && (key_idx < 0 || key_idx > 1)) return -EINVAL; } else if (pairwise && key_idx) { return -EINVAL; } break; case WLAN_CIPHER_SUITE_AES_CMAC: case WLAN_CIPHER_SUITE_BIP_CMAC_256: case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: /* Disallow BIP (group-only) cipher as pairwise cipher */ if (pairwise) return -EINVAL; if (key_idx < 4) return -EINVAL; break; case WLAN_CIPHER_SUITE_WEP40: case WLAN_CIPHER_SUITE_WEP104: if (key_idx > 3) return -EINVAL; break; default: break; } switch (params->cipher) { case WLAN_CIPHER_SUITE_WEP40: if (params->key_len != WLAN_KEY_LEN_WEP40) return -EINVAL; break; case WLAN_CIPHER_SUITE_TKIP: if (params->key_len != WLAN_KEY_LEN_TKIP) return -EINVAL; break; case WLAN_CIPHER_SUITE_CCMP: if (params->key_len != WLAN_KEY_LEN_CCMP) return -EINVAL; break; case WLAN_CIPHER_SUITE_CCMP_256: if (params->key_len != WLAN_KEY_LEN_CCMP_256) return -EINVAL; break; case WLAN_CIPHER_SUITE_GCMP: if (params->key_len != WLAN_KEY_LEN_GCMP) return -EINVAL; break; case WLAN_CIPHER_SUITE_GCMP_256: if (params->key_len != WLAN_KEY_LEN_GCMP_256) return -EINVAL; break; case WLAN_CIPHER_SUITE_WEP104: if (params->key_len != WLAN_KEY_LEN_WEP104) return -EINVAL; break; case WLAN_CIPHER_SUITE_AES_CMAC: if (params->key_len != WLAN_KEY_LEN_AES_CMAC) return -EINVAL; break; case WLAN_CIPHER_SUITE_BIP_CMAC_256: if (params->key_len != WLAN_KEY_LEN_BIP_CMAC_256) return -EINVAL; break; case WLAN_CIPHER_SUITE_BIP_GMAC_128: if (params->key_len != WLAN_KEY_LEN_BIP_GMAC_128) return -EINVAL; break; case WLAN_CIPHER_SUITE_BIP_GMAC_256: if (params->key_len != WLAN_KEY_LEN_BIP_GMAC_256) return -EINVAL; break; default: /* * We don't know anything about this algorithm, * allow using it -- but the driver must check * all parameters! We still check below whether * or not the driver supports this algorithm, * of course. */ break; } if (params->seq) { switch (params->cipher) { case WLAN_CIPHER_SUITE_WEP40: case WLAN_CIPHER_SUITE_WEP104: /* These ciphers do not use key sequence */ return -EINVAL; case WLAN_CIPHER_SUITE_TKIP: case WLAN_CIPHER_SUITE_CCMP: case WLAN_CIPHER_SUITE_CCMP_256: case WLAN_CIPHER_SUITE_GCMP: case WLAN_CIPHER_SUITE_GCMP_256: case WLAN_CIPHER_SUITE_AES_CMAC: case WLAN_CIPHER_SUITE_BIP_CMAC_256: case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: if (params->seq_len != 6) return -EINVAL; break; } } if (!cfg80211_supported_cipher_suite(&rdev->wiphy, params->cipher)) return -EINVAL; return 0; } unsigned int __attribute_const__ ieee80211_hdrlen(__le16 fc) { unsigned int hdrlen = 24; if (ieee80211_is_ext(fc)) { hdrlen = 4; goto out; } if (ieee80211_is_data(fc)) { if (ieee80211_has_a4(fc)) hdrlen = 30; if (ieee80211_is_data_qos(fc)) { hdrlen += IEEE80211_QOS_CTL_LEN; if (ieee80211_has_order(fc)) hdrlen += IEEE80211_HT_CTL_LEN; } goto out; } if (ieee80211_is_mgmt(fc)) { if (ieee80211_has_order(fc)) hdrlen += IEEE80211_HT_CTL_LEN; goto out; } if (ieee80211_is_ctl(fc)) { /* * ACK and CTS are 10 bytes, all others 16. 
To see how * to get this condition consider * subtype mask: 0b0000000011110000 (0x00F0) * ACK subtype: 0b0000000011010000 (0x00D0) * CTS subtype: 0b0000000011000000 (0x00C0) * bits that matter: ^^^ (0x00E0) * value of those: 0b0000000011000000 (0x00C0) */ if ((fc & cpu_to_le16(0x00E0)) == cpu_to_le16(0x00C0)) hdrlen = 10; else hdrlen = 16; } out: return hdrlen; } EXPORT_SYMBOL(ieee80211_hdrlen); unsigned int ieee80211_get_hdrlen_from_skb(const struct sk_buff *skb) { const struct ieee80211_hdr *hdr = (const struct ieee80211_hdr *)skb->data; unsigned int hdrlen; if (unlikely(skb->len < 10)) return 0; hdrlen = ieee80211_hdrlen(hdr->frame_control); if (unlikely(hdrlen > skb->len)) return 0; return hdrlen; } EXPORT_SYMBOL(ieee80211_get_hdrlen_from_skb); static unsigned int __ieee80211_get_mesh_hdrlen(u8 flags) { int ae = flags & MESH_FLAGS_AE; /* 802.11-2012, 8.2.4.7.3 */ switch (ae) { default: case 0: return 6; case MESH_FLAGS_AE_A4: return 12; case MESH_FLAGS_AE_A5_A6: return 18; } } unsigned int ieee80211_get_mesh_hdrlen(struct ieee80211s_hdr *meshhdr) { return __ieee80211_get_mesh_hdrlen(meshhdr->flags); } EXPORT_SYMBOL(ieee80211_get_mesh_hdrlen); bool ieee80211_get_8023_tunnel_proto(const void *hdr, __be16 *proto) { const __be16 *hdr_proto = hdr + ETH_ALEN; if (!(ether_addr_equal(hdr, rfc1042_header) && *hdr_proto != htons(ETH_P_AARP) && *hdr_proto != htons(ETH_P_IPX)) && !ether_addr_equal(hdr, bridge_tunnel_header)) return false; *proto = *hdr_proto; return true; } EXPORT_SYMBOL(ieee80211_get_8023_tunnel_proto); int ieee80211_strip_8023_mesh_hdr(struct sk_buff *skb) { const void *mesh_addr; struct { struct ethhdr eth; u8 flags; } payload; int hdrlen; int ret; ret = skb_copy_bits(skb, 0, &payload, sizeof(payload)); if (ret) return ret; hdrlen = sizeof(payload.eth) + __ieee80211_get_mesh_hdrlen(payload.flags); if (likely(pskb_may_pull(skb, hdrlen + 8) && ieee80211_get_8023_tunnel_proto(skb->data + hdrlen, &payload.eth.h_proto))) hdrlen += ETH_ALEN + 2; else if (!pskb_may_pull(skb, hdrlen)) return -EINVAL; else payload.eth.h_proto = htons(skb->len - hdrlen); mesh_addr = skb->data + sizeof(payload.eth) + ETH_ALEN; switch (payload.flags & MESH_FLAGS_AE) { case MESH_FLAGS_AE_A4: memcpy(&payload.eth.h_source, mesh_addr, ETH_ALEN); break; case MESH_FLAGS_AE_A5_A6: memcpy(&payload.eth, mesh_addr, 2 * ETH_ALEN); break; default: break; } pskb_pull(skb, hdrlen - sizeof(payload.eth)); memcpy(skb->data, &payload.eth, sizeof(payload.eth)); return 0; } EXPORT_SYMBOL(ieee80211_strip_8023_mesh_hdr); int ieee80211_data_to_8023_exthdr(struct sk_buff *skb, struct ethhdr *ehdr, const u8 *addr, enum nl80211_iftype iftype, u8 data_offset, bool is_amsdu) { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; struct { u8 hdr[ETH_ALEN] __aligned(2); __be16 proto; } payload; struct ethhdr tmp; u16 hdrlen; if (unlikely(!ieee80211_is_data_present(hdr->frame_control))) return -1; hdrlen = ieee80211_hdrlen(hdr->frame_control) + data_offset; if (skb->len < hdrlen) return -1; /* convert IEEE 802.11 header + possible LLC headers into Ethernet * header * IEEE 802.11 address fields: * ToDS FromDS Addr1 Addr2 Addr3 Addr4 * 0 0 DA SA BSSID n/a * 0 1 DA BSSID SA n/a * 1 0 BSSID SA DA n/a * 1 1 RA TA DA SA */ memcpy(tmp.h_dest, ieee80211_get_DA(hdr), ETH_ALEN); memcpy(tmp.h_source, ieee80211_get_SA(hdr), ETH_ALEN); switch (hdr->frame_control & cpu_to_le16(IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS)) { case cpu_to_le16(IEEE80211_FCTL_TODS): if (unlikely(iftype != NL80211_IFTYPE_AP && iftype != NL80211_IFTYPE_AP_VLAN 
&& iftype != NL80211_IFTYPE_P2P_GO)) return -1; break; case cpu_to_le16(IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS): if (unlikely(iftype != NL80211_IFTYPE_MESH_POINT && iftype != NL80211_IFTYPE_AP_VLAN && iftype != NL80211_IFTYPE_STATION)) return -1; break; case cpu_to_le16(IEEE80211_FCTL_FROMDS): if ((iftype != NL80211_IFTYPE_STATION && iftype != NL80211_IFTYPE_P2P_CLIENT && iftype != NL80211_IFTYPE_MESH_POINT) || (is_multicast_ether_addr(tmp.h_dest) && ether_addr_equal(tmp.h_source, addr))) return -1; break; case cpu_to_le16(0): if (iftype != NL80211_IFTYPE_ADHOC && iftype != NL80211_IFTYPE_STATION && iftype != NL80211_IFTYPE_OCB) return -1; break; } if (likely(!is_amsdu && iftype != NL80211_IFTYPE_MESH_POINT && skb_copy_bits(skb, hdrlen, &payload, sizeof(payload)) == 0 && ieee80211_get_8023_tunnel_proto(&payload, &tmp.h_proto))) { /* remove RFC1042 or Bridge-Tunnel encapsulation */ hdrlen += ETH_ALEN + 2; skb_postpull_rcsum(skb, &payload, ETH_ALEN + 2); } else { tmp.h_proto = htons(skb->len - hdrlen); } pskb_pull(skb, hdrlen); if (!ehdr) ehdr = skb_push(skb, sizeof(struct ethhdr)); memcpy(ehdr, &tmp, sizeof(tmp)); return 0; } EXPORT_SYMBOL(ieee80211_data_to_8023_exthdr); static void __frame_add_frag(struct sk_buff *skb, struct page *page, void *ptr, int len, int size) { struct skb_shared_info *sh = skb_shinfo(skb); int page_offset; get_page(page); page_offset = ptr - page_address(page); skb_add_rx_frag(skb, sh->nr_frags, page, page_offset, len, size); } static void __ieee80211_amsdu_copy_frag(struct sk_buff *skb, struct sk_buff *frame, int offset, int len) { struct skb_shared_info *sh = skb_shinfo(skb); const skb_frag_t *frag = &sh->frags[0]; struct page *frag_page; void *frag_ptr; int frag_len, frag_size; int head_size = skb->len - skb->data_len; int cur_len; frag_page = virt_to_head_page(skb->head); frag_ptr = skb->data; frag_size = head_size; while (offset >= frag_size) { offset -= frag_size; frag_page = skb_frag_page(frag); frag_ptr = skb_frag_address(frag); frag_size = skb_frag_size(frag); frag++; } frag_ptr += offset; frag_len = frag_size - offset; cur_len = min(len, frag_len); __frame_add_frag(frame, frag_page, frag_ptr, cur_len, frag_size); len -= cur_len; while (len > 0) { frag_len = skb_frag_size(frag); cur_len = min(len, frag_len); __frame_add_frag(frame, skb_frag_page(frag), skb_frag_address(frag), cur_len, frag_len); len -= cur_len; frag++; } } static struct sk_buff * __ieee80211_amsdu_copy(struct sk_buff *skb, unsigned int hlen, int offset, int len, bool reuse_frag, int min_len) { struct sk_buff *frame; int cur_len = len; if (skb->len - offset < len) return NULL; /* * When reusing fragments, copy some data to the head to simplify * ethernet header handling and speed up protocol header processing * in the stack later. */ if (reuse_frag) cur_len = min_t(int, len, min_len); /* * Allocate and reserve two bytes more for payload * alignment since sizeof(struct ethhdr) is 14. 
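 * (14 + 2 = 16, so once the Ethernet header is pushed in front, the
 * payload that follows it starts on a 4-byte boundary.)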
*/ frame = dev_alloc_skb(hlen + sizeof(struct ethhdr) + 2 + cur_len); if (!frame) return NULL; frame->priority = skb->priority; skb_reserve(frame, hlen + sizeof(struct ethhdr) + 2); skb_copy_bits(skb, offset, skb_put(frame, cur_len), cur_len); len -= cur_len; if (!len) return frame; offset += cur_len; __ieee80211_amsdu_copy_frag(skb, frame, offset, len); return frame; } static u16 ieee80211_amsdu_subframe_length(void *field, u8 mesh_flags, u8 hdr_type) { __le16 *field_le = field; __be16 *field_be = field; u16 len; if (hdr_type >= 2) len = le16_to_cpu(*field_le); else len = be16_to_cpu(*field_be); if (hdr_type) len += __ieee80211_get_mesh_hdrlen(mesh_flags); return len; } bool ieee80211_is_valid_amsdu(struct sk_buff *skb, u8 mesh_hdr) { int offset = 0, subframe_len, padding; for (offset = 0; offset < skb->len; offset += subframe_len + padding) { int remaining = skb->len - offset; struct { __be16 len; u8 mesh_flags; } hdr; u16 len; if (sizeof(hdr) > remaining) return false; if (skb_copy_bits(skb, offset + 2 * ETH_ALEN, &hdr, sizeof(hdr)) < 0) return false; len = ieee80211_amsdu_subframe_length(&hdr.len, hdr.mesh_flags, mesh_hdr); subframe_len = sizeof(struct ethhdr) + len; padding = (4 - subframe_len) & 0x3; if (subframe_len > remaining) return false; } return true; } EXPORT_SYMBOL(ieee80211_is_valid_amsdu); void ieee80211_amsdu_to_8023s(struct sk_buff *skb, struct sk_buff_head *list, const u8 *addr, enum nl80211_iftype iftype, const unsigned int extra_headroom, const u8 *check_da, const u8 *check_sa, u8 mesh_control) { unsigned int hlen = ALIGN(extra_headroom, 4); struct sk_buff *frame = NULL; int offset = 0; struct { struct ethhdr eth; uint8_t flags; } hdr; bool reuse_frag = skb->head_frag && !skb_has_frag_list(skb); bool reuse_skb = false; bool last = false; int copy_len = sizeof(hdr.eth); if (iftype == NL80211_IFTYPE_MESH_POINT) copy_len = sizeof(hdr); while (!last) { int remaining = skb->len - offset; unsigned int subframe_len; int len, mesh_len = 0; u8 padding; if (copy_len > remaining) goto purge; skb_copy_bits(skb, offset, &hdr, copy_len); if (iftype == NL80211_IFTYPE_MESH_POINT) mesh_len = __ieee80211_get_mesh_hdrlen(hdr.flags); len = ieee80211_amsdu_subframe_length(&hdr.eth.h_proto, hdr.flags, mesh_control); subframe_len = sizeof(struct ethhdr) + len; padding = (4 - subframe_len) & 0x3; /* the last MSDU has no padding */ if (subframe_len > remaining) goto purge; /* mitigate A-MSDU aggregation injection attacks */ if (ether_addr_equal(hdr.eth.h_dest, rfc1042_header)) goto purge; offset += sizeof(struct ethhdr); last = remaining <= subframe_len + padding; /* FIXME: should we really accept multicast DA? 
*/ if ((check_da && !is_multicast_ether_addr(hdr.eth.h_dest) && !ether_addr_equal(check_da, hdr.eth.h_dest)) || (check_sa && !ether_addr_equal(check_sa, hdr.eth.h_source))) { offset += len + padding; continue; } /* reuse skb for the last subframe */ if (!skb_is_nonlinear(skb) && !reuse_frag && last) { skb_pull(skb, offset); frame = skb; reuse_skb = true; } else { frame = __ieee80211_amsdu_copy(skb, hlen, offset, len, reuse_frag, 32 + mesh_len); if (!frame) goto purge; offset += len + padding; } skb_reset_network_header(frame); frame->dev = skb->dev; frame->priority = skb->priority; if (likely(iftype != NL80211_IFTYPE_MESH_POINT && ieee80211_get_8023_tunnel_proto(frame->data, &hdr.eth.h_proto))) skb_pull(frame, ETH_ALEN + 2); memcpy(skb_push(frame, sizeof(hdr.eth)), &hdr.eth, sizeof(hdr.eth)); __skb_queue_tail(list, frame); } if (!reuse_skb) dev_kfree_skb(skb); return; purge: __skb_queue_purge(list); dev_kfree_skb(skb); } EXPORT_SYMBOL(ieee80211_amsdu_to_8023s); /* Given a data frame determine the 802.1p/1d tag to use. */ unsigned int cfg80211_classify8021d(struct sk_buff *skb, struct cfg80211_qos_map *qos_map) { unsigned int dscp; unsigned char vlan_priority; unsigned int ret; /* skb->priority values from 256->263 are magic values to * directly indicate a specific 802.1d priority. This is used * to allow 802.1d priority to be passed directly in from VLAN * tags, etc. */ if (skb->priority >= 256 && skb->priority <= 263) { ret = skb->priority - 256; goto out; } if (skb_vlan_tag_present(skb)) { vlan_priority = (skb_vlan_tag_get(skb) & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; if (vlan_priority > 0) { ret = vlan_priority; goto out; } } switch (skb->protocol) { case htons(ETH_P_IP): dscp = ipv4_get_dsfield(ip_hdr(skb)) & 0xfc; break; case htons(ETH_P_IPV6): dscp = ipv6_get_dsfield(ipv6_hdr(skb)) & 0xfc; break; case htons(ETH_P_MPLS_UC): case htons(ETH_P_MPLS_MC): { struct mpls_label mpls_tmp, *mpls; mpls = skb_header_pointer(skb, sizeof(struct ethhdr), sizeof(*mpls), &mpls_tmp); if (!mpls) return 0; ret = (ntohl(mpls->entry) & MPLS_LS_TC_MASK) >> MPLS_LS_TC_SHIFT; goto out; } case htons(ETH_P_80221): /* 802.21 is always network control traffic */ return 7; default: return 0; } if (qos_map) { unsigned int i, tmp_dscp = dscp >> 2; for (i = 0; i < qos_map->num_des; i++) { if (tmp_dscp == qos_map->dscp_exception[i].dscp) { ret = qos_map->dscp_exception[i].up; goto out; } } for (i = 0; i < 8; i++) { if (tmp_dscp >= qos_map->up[i].low && tmp_dscp <= qos_map->up[i].high) { ret = i; goto out; } } } /* The default mapping as defined Section 2.3 in RFC8325: The three * Most Significant Bits (MSBs) of the DSCP are used as the * corresponding L2 markings. */ ret = dscp >> 5; /* Handle specific DSCP values for which the default mapping (as * described above) doesn't adhere to the intended usage of the DSCP * value. See section 4 in RFC8325. 
Specifically, for the following * Diffserv Service Classes no update is needed: * - Standard: DF * - Low Priority Data: CS1 * - Multimedia Conferencing: AF41, AF42, AF43 * - Network Control Traffic: CS7 * - Real-Time Interactive: CS4 * - Signaling: CS5 */ switch (dscp >> 2) { case 10: case 12: case 14: /* High throughput data: AF11, AF12, AF13 */ ret = 0; break; case 16: /* Operations, Administration, and Maintenance and Provisioning: * CS2 */ ret = 0; break; case 18: case 20: case 22: /* Low latency data: AF21, AF22, AF23 */ ret = 3; break; case 24: /* Broadcasting video: CS3 */ ret = 4; break; case 26: case 28: case 30: /* Multimedia Streaming: AF31, AF32, AF33 */ ret = 4; break; case 44: /* Voice Admit: VA */ ret = 6; break; case 46: /* Telephony traffic: EF */ ret = 6; break; case 48: /* Network Control Traffic: CS6 */ ret = 7; break; } out: return array_index_nospec(ret, IEEE80211_NUM_TIDS); } EXPORT_SYMBOL(cfg80211_classify8021d); const struct element *ieee80211_bss_get_elem(struct cfg80211_bss *bss, u8 id) { const struct cfg80211_bss_ies *ies; ies = rcu_dereference(bss->ies); if (!ies) return NULL; return cfg80211_find_elem(id, ies->data, ies->len); } EXPORT_SYMBOL(ieee80211_bss_get_elem); void cfg80211_upload_connect_keys(struct wireless_dev *wdev) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct net_device *dev = wdev->netdev; int i; if (!wdev->connect_keys) return; for (i = 0; i < 4; i++) { if (!wdev->connect_keys->params[i].cipher) continue; if (rdev_add_key(rdev, dev, -1, i, false, NULL, &wdev->connect_keys->params[i])) { netdev_err(dev, "failed to set key %d\n", i); continue; } if (wdev->connect_keys->def == i && rdev_set_default_key(rdev, dev, -1, i, true, true)) { netdev_err(dev, "failed to set defkey %d\n", i); continue; } } kfree_sensitive(wdev->connect_keys); wdev->connect_keys = NULL; } void cfg80211_process_wdev_events(struct wireless_dev *wdev) { struct cfg80211_event *ev; unsigned long flags; spin_lock_irqsave(&wdev->event_lock, flags); while (!list_empty(&wdev->event_list)) { ev = list_first_entry(&wdev->event_list, struct cfg80211_event, list); list_del(&ev->list); spin_unlock_irqrestore(&wdev->event_lock, flags); switch (ev->type) { case EVENT_CONNECT_RESULT: __cfg80211_connect_result( wdev->netdev, &ev->cr, ev->cr.status == WLAN_STATUS_SUCCESS); break; case EVENT_ROAMED: __cfg80211_roamed(wdev, &ev->rm); break; case EVENT_DISCONNECTED: __cfg80211_disconnected(wdev->netdev, ev->dc.ie, ev->dc.ie_len, ev->dc.reason, !ev->dc.locally_generated); break; case EVENT_IBSS_JOINED: __cfg80211_ibss_joined(wdev->netdev, ev->ij.bssid, ev->ij.channel); break; case EVENT_STOPPED: cfg80211_leave(wiphy_to_rdev(wdev->wiphy), wdev); break; case EVENT_PORT_AUTHORIZED: __cfg80211_port_authorized(wdev, ev->pa.peer_addr, ev->pa.td_bitmap, ev->pa.td_bitmap_len); break; } kfree(ev); spin_lock_irqsave(&wdev->event_lock, flags); } spin_unlock_irqrestore(&wdev->event_lock, flags); } void cfg80211_process_rdev_events(struct cfg80211_registered_device *rdev) { struct wireless_dev *wdev; lockdep_assert_held(&rdev->wiphy.mtx); list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) cfg80211_process_wdev_events(wdev); } int cfg80211_change_iface(struct cfg80211_registered_device *rdev, struct net_device *dev, enum nl80211_iftype ntype, struct vif_params *params) { int err; enum nl80211_iftype otype = dev->ieee80211_ptr->iftype; lockdep_assert_held(&rdev->wiphy.mtx); /* don't support changing VLANs, you just re-create them */ if (otype == NL80211_IFTYPE_AP_VLAN) return 
-EOPNOTSUPP; /* cannot change into P2P device or NAN */ if (ntype == NL80211_IFTYPE_P2P_DEVICE || ntype == NL80211_IFTYPE_NAN) return -EOPNOTSUPP; if (!rdev->ops->change_virtual_intf || !(rdev->wiphy.interface_modes & (1 << ntype))) return -EOPNOTSUPP; if (ntype != otype) { /* if it's part of a bridge, reject changing type to station/ibss */ if (netif_is_bridge_port(dev) && (ntype == NL80211_IFTYPE_ADHOC || ntype == NL80211_IFTYPE_STATION || ntype == NL80211_IFTYPE_P2P_CLIENT)) return -EBUSY; dev->ieee80211_ptr->use_4addr = false; rdev_set_qos_map(rdev, dev, NULL); switch (otype) { case NL80211_IFTYPE_AP: case NL80211_IFTYPE_P2P_GO: cfg80211_stop_ap(rdev, dev, -1, true); break; case NL80211_IFTYPE_ADHOC: cfg80211_leave_ibss(rdev, dev, false); break; case NL80211_IFTYPE_STATION: case NL80211_IFTYPE_P2P_CLIENT: cfg80211_disconnect(rdev, dev, WLAN_REASON_DEAUTH_LEAVING, true); break; case NL80211_IFTYPE_MESH_POINT: /* mesh should be handled? */ break; case NL80211_IFTYPE_OCB: cfg80211_leave_ocb(rdev, dev); break; default: break; } cfg80211_process_rdev_events(rdev); cfg80211_mlme_purge_registrations(dev->ieee80211_ptr); memset(&dev->ieee80211_ptr->u, 0, sizeof(dev->ieee80211_ptr->u)); memset(&dev->ieee80211_ptr->links, 0, sizeof(dev->ieee80211_ptr->links)); } err = rdev_change_virtual_intf(rdev, dev, ntype, params); WARN_ON(!err && dev->ieee80211_ptr->iftype != ntype); if (!err && params && params->use_4addr != -1) dev->ieee80211_ptr->use_4addr = params->use_4addr; if (!err) { dev->priv_flags &= ~IFF_DONT_BRIDGE; switch (ntype) { case NL80211_IFTYPE_STATION: if (dev->ieee80211_ptr->use_4addr) break; fallthrough; case NL80211_IFTYPE_OCB: case NL80211_IFTYPE_P2P_CLIENT: case NL80211_IFTYPE_ADHOC: dev->priv_flags |= IFF_DONT_BRIDGE; break; case NL80211_IFTYPE_P2P_GO: case NL80211_IFTYPE_AP: case NL80211_IFTYPE_AP_VLAN: case NL80211_IFTYPE_MESH_POINT: /* bridging OK */ break; case NL80211_IFTYPE_MONITOR: /* monitor can't bridge anyway */ break; case NL80211_IFTYPE_UNSPECIFIED: case NUM_NL80211_IFTYPES: /* not happening */ break; case NL80211_IFTYPE_P2P_DEVICE: case NL80211_IFTYPE_WDS: case NL80211_IFTYPE_NAN: WARN_ON(1); break; } } if (!err && ntype != otype && netif_running(dev)) { cfg80211_update_iface_num(rdev, ntype, 1); cfg80211_update_iface_num(rdev, otype, -1); } return err; } static u32 cfg80211_calculate_bitrate_ht(struct rate_info *rate) { int modulation, streams, bitrate; /* the formula below does only work for MCS values smaller than 32 */ if (WARN_ON_ONCE(rate->mcs >= 32)) return 0; modulation = rate->mcs & 7; streams = (rate->mcs >> 3) + 1; bitrate = (rate->bw == RATE_INFO_BW_40) ? 
13500000 : 6500000; if (modulation < 4) bitrate *= (modulation + 1); else if (modulation == 4) bitrate *= (modulation + 2); else bitrate *= (modulation + 3); bitrate *= streams; if (rate->flags & RATE_INFO_FLAGS_SHORT_GI) bitrate = (bitrate / 9) * 10; /* do NOT round down here */ return (bitrate + 50000) / 100000; } static u32 cfg80211_calculate_bitrate_dmg(struct rate_info *rate) { static const u32 __mcs2bitrate[] = { /* control PHY */ [0] = 275, /* SC PHY */ [1] = 3850, [2] = 7700, [3] = 9625, [4] = 11550, [5] = 12512, /* 1251.25 mbps */ [6] = 15400, [7] = 19250, [8] = 23100, [9] = 25025, [10] = 30800, [11] = 38500, [12] = 46200, /* OFDM PHY */ [13] = 6930, [14] = 8662, /* 866.25 mbps */ [15] = 13860, [16] = 17325, [17] = 20790, [18] = 27720, [19] = 34650, [20] = 41580, [21] = 45045, [22] = 51975, [23] = 62370, [24] = 67568, /* 6756.75 mbps */ /* LP-SC PHY */ [25] = 6260, [26] = 8340, [27] = 11120, [28] = 12510, [29] = 16680, [30] = 22240, [31] = 25030, }; if (WARN_ON_ONCE(rate->mcs >= ARRAY_SIZE(__mcs2bitrate))) return 0; return __mcs2bitrate[rate->mcs]; } static u32 cfg80211_calculate_bitrate_extended_sc_dmg(struct rate_info *rate) { static const u32 __mcs2bitrate[] = { [6 - 6] = 26950, /* MCS 9.1 : 2695.0 mbps */ [7 - 6] = 50050, /* MCS 12.1 */ [8 - 6] = 53900, [9 - 6] = 57750, [10 - 6] = 63900, [11 - 6] = 75075, [12 - 6] = 80850, }; /* Extended SC MCS not defined for base MCS below 6 or above 12 */ if (WARN_ON_ONCE(rate->mcs < 6 || rate->mcs > 12)) return 0; return __mcs2bitrate[rate->mcs - 6]; } static u32 cfg80211_calculate_bitrate_edmg(struct rate_info *rate) { static const u32 __mcs2bitrate[] = { /* control PHY */ [0] = 275, /* SC PHY */ [1] = 3850, [2] = 7700, [3] = 9625, [4] = 11550, [5] = 12512, /* 1251.25 mbps */ [6] = 13475, [7] = 15400, [8] = 19250, [9] = 23100, [10] = 25025, [11] = 26950, [12] = 30800, [13] = 38500, [14] = 46200, [15] = 50050, [16] = 53900, [17] = 57750, [18] = 69300, [19] = 75075, [20] = 80850, }; if (WARN_ON_ONCE(rate->mcs >= ARRAY_SIZE(__mcs2bitrate))) return 0; return __mcs2bitrate[rate->mcs] * rate->n_bonded_ch; } static u32 cfg80211_calculate_bitrate_vht(struct rate_info *rate) { static const u32 base[4][12] = { { 6500000, 13000000, 19500000, 26000000, 39000000, 52000000, 58500000, 65000000, 78000000, /* not in the spec, but some devices use this: */ 86700000, 97500000, 108300000, }, { 13500000, 27000000, 40500000, 54000000, 81000000, 108000000, 121500000, 135000000, 162000000, 180000000, 202500000, 225000000, }, { 29300000, 58500000, 87800000, 117000000, 175500000, 234000000, 263300000, 292500000, 351000000, 390000000, 438800000, 487500000, }, { 58500000, 117000000, 175500000, 234000000, 351000000, 468000000, 526500000, 585000000, 702000000, 780000000, 877500000, 975000000, }, }; u32 bitrate; int idx; if (rate->mcs > 11) goto warn; switch (rate->bw) { case RATE_INFO_BW_160: idx = 3; break; case RATE_INFO_BW_80: idx = 2; break; case RATE_INFO_BW_40: idx = 1; break; case RATE_INFO_BW_5: case RATE_INFO_BW_10: default: goto warn; case RATE_INFO_BW_20: idx = 0; } bitrate = base[idx][rate->mcs]; bitrate *= rate->nss; if (rate->flags & RATE_INFO_FLAGS_SHORT_GI) bitrate = (bitrate / 9) * 10; /* do NOT round down here */ return (bitrate + 50000) / 100000; warn: WARN_ONCE(1, "invalid rate bw=%d, mcs=%d, nss=%d\n", rate->bw, rate->mcs, rate->nss); return 0; } static u32 cfg80211_calculate_bitrate_he(struct rate_info *rate) { #define SCALE 6144 u32 mcs_divisors[14] = { 102399, /* 16.666666... */ 51201, /* 8.333333... */ 34134, /* 5.555555... 
*/ 25599, /* 4.166666... */ 17067, /* 2.777777... */ 12801, /* 2.083333... */ 11377, /* 1.851725... */ 10239, /* 1.666666... */ 8532, /* 1.388888... */ 7680, /* 1.250000... */ 6828, /* 1.111111... */ 6144, /* 1.000000... */ 5690, /* 0.926106... */ 5120, /* 0.833333... */ }; u32 rates_160M[3] = { 960777777, 907400000, 816666666 }; u32 rates_996[3] = { 480388888, 453700000, 408333333 }; u32 rates_484[3] = { 229411111, 216666666, 195000000 }; u32 rates_242[3] = { 114711111, 108333333, 97500000 }; u32 rates_106[3] = { 40000000, 37777777, 34000000 }; u32 rates_52[3] = { 18820000, 17777777, 16000000 }; u32 rates_26[3] = { 9411111, 8888888, 8000000 }; u64 tmp; u32 result; if (WARN_ON_ONCE(rate->mcs > 13)) return 0; if (WARN_ON_ONCE(rate->he_gi > NL80211_RATE_INFO_HE_GI_3_2)) return 0; if (WARN_ON_ONCE(rate->he_ru_alloc > NL80211_RATE_INFO_HE_RU_ALLOC_2x996)) return 0; if (WARN_ON_ONCE(rate->nss < 1 || rate->nss > 8)) return 0; if (rate->bw == RATE_INFO_BW_160 || (rate->bw == RATE_INFO_BW_HE_RU && rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_2x996)) result = rates_160M[rate->he_gi]; else if (rate->bw == RATE_INFO_BW_80 || (rate->bw == RATE_INFO_BW_HE_RU && rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_996)) result = rates_996[rate->he_gi]; else if (rate->bw == RATE_INFO_BW_40 || (rate->bw == RATE_INFO_BW_HE_RU && rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_484)) result = rates_484[rate->he_gi]; else if (rate->bw == RATE_INFO_BW_20 || (rate->bw == RATE_INFO_BW_HE_RU && rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_242)) result = rates_242[rate->he_gi]; else if (rate->bw == RATE_INFO_BW_HE_RU && rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_106) result = rates_106[rate->he_gi]; else if (rate->bw == RATE_INFO_BW_HE_RU && rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_52) result = rates_52[rate->he_gi]; else if (rate->bw == RATE_INFO_BW_HE_RU && rate->he_ru_alloc == NL80211_RATE_INFO_HE_RU_ALLOC_26) result = rates_26[rate->he_gi]; else { WARN(1, "invalid HE MCS: bw:%d, ru:%d\n", rate->bw, rate->he_ru_alloc); return 0; } /* now scale to the appropriate MCS */ tmp = result; tmp *= SCALE; do_div(tmp, mcs_divisors[rate->mcs]); result = tmp; /* and take NSS, DCM into account */ result = (result * rate->nss) / 8; if (rate->he_dcm) result /= 2; return result / 10000; } static u32 cfg80211_calculate_bitrate_eht(struct rate_info *rate) { #define SCALE 6144 static const u32 mcs_divisors[16] = { 102399, /* 16.666666... */ 51201, /* 8.333333... */ 34134, /* 5.555555... */ 25599, /* 4.166666... */ 17067, /* 2.777777... */ 12801, /* 2.083333... */ 11377, /* 1.851725... */ 10239, /* 1.666666... */ 8532, /* 1.388888... */ 7680, /* 1.250000... */ 6828, /* 1.111111... */ 6144, /* 1.000000... */ 5690, /* 0.926106... */ 5120, /* 0.833333... */ 409600, /* 66.666666... */ 204800, /* 33.333333... 
*/ }; static const u32 rates_996[3] = { 480388888, 453700000, 408333333 }; static const u32 rates_484[3] = { 229411111, 216666666, 195000000 }; static const u32 rates_242[3] = { 114711111, 108333333, 97500000 }; static const u32 rates_106[3] = { 40000000, 37777777, 34000000 }; static const u32 rates_52[3] = { 18820000, 17777777, 16000000 }; static const u32 rates_26[3] = { 9411111, 8888888, 8000000 }; u64 tmp; u32 result; if (WARN_ON_ONCE(rate->mcs > 15)) return 0; if (WARN_ON_ONCE(rate->eht_gi > NL80211_RATE_INFO_EHT_GI_3_2)) return 0; if (WARN_ON_ONCE(rate->eht_ru_alloc > NL80211_RATE_INFO_EHT_RU_ALLOC_4x996)) return 0; if (WARN_ON_ONCE(rate->nss < 1 || rate->nss > 8)) return 0; /* Bandwidth checks for MCS 14 */ if (rate->mcs == 14) { if ((rate->bw != RATE_INFO_BW_EHT_RU && rate->bw != RATE_INFO_BW_80 && rate->bw != RATE_INFO_BW_160 && rate->bw != RATE_INFO_BW_320) || (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc != NL80211_RATE_INFO_EHT_RU_ALLOC_996 && rate->eht_ru_alloc != NL80211_RATE_INFO_EHT_RU_ALLOC_2x996 && rate->eht_ru_alloc != NL80211_RATE_INFO_EHT_RU_ALLOC_4x996)) { WARN(1, "invalid EHT BW for MCS 14: bw:%d, ru:%d\n", rate->bw, rate->eht_ru_alloc); return 0; } } if (rate->bw == RATE_INFO_BW_320 || (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_4x996)) result = 4 * rates_996[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_3x996P484) result = 3 * rates_996[rate->eht_gi] + rates_484[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_3x996) result = 3 * rates_996[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_2x996P484) result = 2 * rates_996[rate->eht_gi] + rates_484[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_160 || (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_2x996)) result = 2 * rates_996[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_996P484P242) result = rates_996[rate->eht_gi] + rates_484[rate->eht_gi] + rates_242[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_996P484) result = rates_996[rate->eht_gi] + rates_484[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_80 || (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_996)) result = rates_996[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_484P242) result = rates_484[rate->eht_gi] + rates_242[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_40 || (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_484)) result = rates_484[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_20 || (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_242)) result = rates_242[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_106P26) result = rates_106[rate->eht_gi] + rates_26[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_106) result = rates_106[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_52P26) result = rates_52[rate->eht_gi] + rates_26[rate->eht_gi]; else if (rate->bw == 
RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_52) result = rates_52[rate->eht_gi]; else if (rate->bw == RATE_INFO_BW_EHT_RU && rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_26) result = rates_26[rate->eht_gi]; else { WARN(1, "invalid EHT MCS: bw:%d, ru:%d\n", rate->bw, rate->eht_ru_alloc); return 0; } /* now scale to the appropriate MCS */ tmp = result; tmp *= SCALE; do_div(tmp, mcs_divisors[rate->mcs]); /* and take NSS */ tmp *= rate->nss; do_div(tmp, 8); result = tmp; return result / 10000; } static u32 cfg80211_calculate_bitrate_s1g(struct rate_info *rate) { /* For 1, 2, 4, 8 and 16 MHz channels */ static const u32 base[5][11] = { { 300000, 600000, 900000, 1200000, 1800000, 2400000, 2700000, 3000000, 3600000, 4000000, /* MCS 10 supported in 1 MHz only */ 150000, }, { 650000, 1300000, 1950000, 2600000, 3900000, 5200000, 5850000, 6500000, 7800000, /* MCS 9 not valid */ }, { 1350000, 2700000, 4050000, 5400000, 8100000, 10800000, 12150000, 13500000, 16200000, 18000000, }, { 2925000, 5850000, 8775000, 11700000, 17550000, 23400000, 26325000, 29250000, 35100000, 39000000, }, { 8580000, 11700000, 17550000, 23400000, 35100000, 46800000, 52650000, 58500000, 70200000, 78000000, }, }; u32 bitrate; /* default is 1 MHz index */ int idx = 0; if (rate->mcs >= 11) goto warn; switch (rate->bw) { case RATE_INFO_BW_16: idx = 4; break; case RATE_INFO_BW_8: idx = 3; break; case RATE_INFO_BW_4: idx = 2; break; case RATE_INFO_BW_2: idx = 1; break; case RATE_INFO_BW_1: idx = 0; break; case RATE_INFO_BW_5: case RATE_INFO_BW_10: case RATE_INFO_BW_20: case RATE_INFO_BW_40: case RATE_INFO_BW_80: case RATE_INFO_BW_160: default: goto warn; } bitrate = base[idx][rate->mcs]; bitrate *= rate->nss; if (rate->flags & RATE_INFO_FLAGS_SHORT_GI) bitrate = (bitrate / 9) * 10; /* do NOT round down here */ return (bitrate + 50000) / 100000; warn: WARN_ONCE(1, "invalid rate bw=%d, mcs=%d, nss=%d\n", rate->bw, rate->mcs, rate->nss); return 0; } u32 cfg80211_calculate_bitrate(struct rate_info *rate) { if (rate->flags & RATE_INFO_FLAGS_MCS) return cfg80211_calculate_bitrate_ht(rate); if (rate->flags & RATE_INFO_FLAGS_DMG) return cfg80211_calculate_bitrate_dmg(rate); if (rate->flags & RATE_INFO_FLAGS_EXTENDED_SC_DMG) return cfg80211_calculate_bitrate_extended_sc_dmg(rate); if (rate->flags & RATE_INFO_FLAGS_EDMG) return cfg80211_calculate_bitrate_edmg(rate); if (rate->flags & RATE_INFO_FLAGS_VHT_MCS) return cfg80211_calculate_bitrate_vht(rate); if (rate->flags & RATE_INFO_FLAGS_HE_MCS) return cfg80211_calculate_bitrate_he(rate); if (rate->flags & RATE_INFO_FLAGS_EHT_MCS) return cfg80211_calculate_bitrate_eht(rate); if (rate->flags & RATE_INFO_FLAGS_S1G_MCS) return cfg80211_calculate_bitrate_s1g(rate); return rate->legacy; } EXPORT_SYMBOL(cfg80211_calculate_bitrate); int cfg80211_get_p2p_attr(const u8 *ies, unsigned int len, enum ieee80211_p2p_attr_id attr, u8 *buf, unsigned int bufsize) { u8 *out = buf; u16 attr_remaining = 0; bool desired_attr = false; u16 desired_len = 0; while (len > 0) { unsigned int iedatalen; unsigned int copy; const u8 *iedata; if (len < 2) return -EILSEQ; iedatalen = ies[1]; if (iedatalen + 2 > len) return -EILSEQ; if (ies[0] != WLAN_EID_VENDOR_SPECIFIC) goto cont; if (iedatalen < 4) goto cont; iedata = ies + 2; /* check WFA OUI, P2P subtype */ if (iedata[0] != 0x50 || iedata[1] != 0x6f || iedata[2] != 0x9a || iedata[3] != 0x09) goto cont; iedatalen -= 4; iedata += 4; /* check attribute continuation into this IE */ copy = min_t(unsigned int, attr_remaining, iedatalen); if 
(copy && desired_attr) { desired_len += copy; if (out) { memcpy(out, iedata, min(bufsize, copy)); out += min(bufsize, copy); bufsize -= min(bufsize, copy); } if (copy == attr_remaining) return desired_len; } attr_remaining -= copy; if (attr_remaining) goto cont; iedatalen -= copy; iedata += copy; while (iedatalen > 0) { u16 attr_len; /* P2P attribute ID & size must fit */ if (iedatalen < 3) return -EILSEQ; desired_attr = iedata[0] == attr; attr_len = get_unaligned_le16(iedata + 1); iedatalen -= 3; iedata += 3; copy = min_t(unsigned int, attr_len, iedatalen); if (desired_attr) { desired_len += copy; if (out) { memcpy(out, iedata, min(bufsize, copy)); out += min(bufsize, copy); bufsize -= min(bufsize, copy); } if (copy == attr_len) return desired_len; } iedata += copy; iedatalen -= copy; attr_remaining = attr_len - copy; } cont: len -= ies[1] + 2; ies += ies[1] + 2; } if (attr_remaining && desired_attr) return -EILSEQ; return -ENOENT; } EXPORT_SYMBOL(cfg80211_get_p2p_attr); static bool ieee80211_id_in_list(const u8 *ids, int n_ids, u8 id, bool id_ext) { int i; /* Make sure array values are legal */ if (WARN_ON(ids[n_ids - 1] == WLAN_EID_EXTENSION)) return false; i = 0; while (i < n_ids) { if (ids[i] == WLAN_EID_EXTENSION) { if (id_ext && (ids[i + 1] == id)) return true; i += 2; continue; } if (ids[i] == id && !id_ext) return true; i++; } return false; } static size_t skip_ie(const u8 *ies, size_t ielen, size_t pos) { /* we assume a validly formed IEs buffer */ u8 len = ies[pos + 1]; pos += 2 + len; /* the IE itself must have 255 bytes for fragments to follow */ if (len < 255) return pos; while (pos < ielen && ies[pos] == WLAN_EID_FRAGMENT) { len = ies[pos + 1]; pos += 2 + len; } return pos; } size_t ieee80211_ie_split_ric(const u8 *ies, size_t ielen, const u8 *ids, int n_ids, const u8 *after_ric, int n_after_ric, size_t offset) { size_t pos = offset; while (pos < ielen) { u8 ext = 0; if (ies[pos] == WLAN_EID_EXTENSION) ext = 2; if ((pos + ext) >= ielen) break; if (!ieee80211_id_in_list(ids, n_ids, ies[pos + ext], ies[pos] == WLAN_EID_EXTENSION)) break; if (ies[pos] == WLAN_EID_RIC_DATA && n_after_ric) { pos = skip_ie(ies, ielen, pos); while (pos < ielen) { if (ies[pos] == WLAN_EID_EXTENSION) ext = 2; else ext = 0; if ((pos + ext) >= ielen) break; if (!ieee80211_id_in_list(after_ric, n_after_ric, ies[pos + ext], ext == 2)) pos = skip_ie(ies, ielen, pos); else break; } } else { pos = skip_ie(ies, ielen, pos); } } return pos; } EXPORT_SYMBOL(ieee80211_ie_split_ric); void ieee80211_fragment_element(struct sk_buff *skb, u8 *len_pos, u8 frag_id) { unsigned int elem_len; if (!len_pos) return; elem_len = skb->data + skb->len - len_pos - 1; while (elem_len > 255) { /* this one is 255 */ *len_pos = 255; /* remaining data gets smaller */ elem_len -= 255; /* make space for the fragment ID/len in SKB */ skb_put(skb, 2); /* shift back the remaining data to place fragment ID/len */ memmove(len_pos + 255 + 3, len_pos + 255 + 1, elem_len); /* place the fragment ID */ len_pos += 255 + 1; *len_pos = frag_id; /* and point to fragment length to update later */ len_pos++; } *len_pos = elem_len; } EXPORT_SYMBOL(ieee80211_fragment_element); bool ieee80211_operating_class_to_band(u8 operating_class, enum nl80211_band *band) { switch (operating_class) { case 112: case 115 ... 127: case 128 ... 130: *band = NL80211_BAND_5GHZ; return true; case 131 ... 
135: case 137: *band = NL80211_BAND_6GHZ; return true; case 81: case 82: case 83: case 84: *band = NL80211_BAND_2GHZ; return true; case 180: *band = NL80211_BAND_60GHZ; return true; } return false; } EXPORT_SYMBOL(ieee80211_operating_class_to_band); bool ieee80211_operating_class_to_chandef(u8 operating_class, struct ieee80211_channel *chan, struct cfg80211_chan_def *chandef) { u32 control_freq, offset = 0; enum nl80211_band band; if (!ieee80211_operating_class_to_band(operating_class, &band) || !chan || band != chan->band) return false; control_freq = chan->center_freq; chandef->chan = chan; if (control_freq >= 5955) offset = control_freq - 5955; else if (control_freq >= 5745) offset = control_freq - 5745; else if (control_freq >= 5180) offset = control_freq - 5180; offset /= 20; switch (operating_class) { case 81: /* 2 GHz band; 20 MHz; channels 1..13 */ case 82: /* 2 GHz band; 20 MHz; channel 14 */ case 115: /* 5 GHz band; 20 MHz; channels 36,40,44,48 */ case 118: /* 5 GHz band; 20 MHz; channels 52,56,60,64 */ case 121: /* 5 GHz band; 20 MHz; channels 100..144 */ case 124: /* 5 GHz band; 20 MHz; channels 149,153,157,161 */ case 125: /* 5 GHz band; 20 MHz; channels 149..177 */ case 131: /* 6 GHz band; 20 MHz; channels 1..233*/ case 136: /* 6 GHz band; 20 MHz; channel 2 */ chandef->center_freq1 = control_freq; chandef->width = NL80211_CHAN_WIDTH_20; return true; case 83: /* 2 GHz band; 40 MHz; channels 1..9 */ case 116: /* 5 GHz band; 40 MHz; channels 36,44 */ case 119: /* 5 GHz band; 40 MHz; channels 52,60 */ case 122: /* 5 GHz band; 40 MHz; channels 100,108,116,124,132,140 */ case 126: /* 5 GHz band; 40 MHz; channels 149,157,165,173 */ chandef->center_freq1 = control_freq + 10; chandef->width = NL80211_CHAN_WIDTH_40; return true; case 84: /* 2 GHz band; 40 MHz; channels 5..13 */ case 117: /* 5 GHz band; 40 MHz; channels 40,48 */ case 120: /* 5 GHz band; 40 MHz; channels 56,64 */ case 123: /* 5 GHz band; 40 MHz; channels 104,112,120,128,136,144 */ case 127: /* 5 GHz band; 40 MHz; channels 153,161,169,177 */ chandef->center_freq1 = control_freq - 10; chandef->width = NL80211_CHAN_WIDTH_40; return true; case 132: /* 6 GHz band; 40 MHz; channels 1,5,..,229*/ chandef->center_freq1 = control_freq + 10 - (offset & 1) * 20; chandef->width = NL80211_CHAN_WIDTH_40; return true; case 128: /* 5 GHz band; 80 MHz; channels 36..64,100..144,149..177 */ case 133: /* 6 GHz band; 80 MHz; channels 1,5,..,229 */ chandef->center_freq1 = control_freq + 30 - (offset & 3) * 20; chandef->width = NL80211_CHAN_WIDTH_80; return true; case 129: /* 5 GHz band; 160 MHz; channels 36..64,100..144,149..177 */ case 134: /* 6 GHz band; 160 MHz; channels 1,5,..,229 */ chandef->center_freq1 = control_freq + 70 - (offset & 7) * 20; chandef->width = NL80211_CHAN_WIDTH_160; return true; case 130: /* 5 GHz band; 80+80 MHz; channels 36..64,100..144,149..177 */ case 135: /* 6 GHz band; 80+80 MHz; channels 1,5,..,229 */ /* The center_freq2 of 80+80 MHz is unknown */ case 137: /* 6 GHz band; 320 MHz; channels 1,5,..,229 */ /* 320-1 or 320-2 channelization is unknown */ default: return false; } } EXPORT_SYMBOL(ieee80211_operating_class_to_chandef); bool ieee80211_chandef_to_operating_class(struct cfg80211_chan_def *chandef, u8 *op_class) { u8 vht_opclass; u32 freq = chandef->center_freq1; if (freq >= 2412 && freq <= 2472) { if (chandef->width > NL80211_CHAN_WIDTH_40) return false; /* 2.407 GHz, channels 1..13 */ if (chandef->width == NL80211_CHAN_WIDTH_40) { if (freq > chandef->chan->center_freq) *op_class = 83; /* HT40+ */ else 
*op_class = 84; /* HT40- */ } else { *op_class = 81; } return true; } if (freq == 2484) { /* channel 14 is only for IEEE 802.11b */ if (chandef->width != NL80211_CHAN_WIDTH_20_NOHT) return false; *op_class = 82; /* channel 14 */ return true; } switch (chandef->width) { case NL80211_CHAN_WIDTH_80: vht_opclass = 128; break; case NL80211_CHAN_WIDTH_160: vht_opclass = 129; break; case NL80211_CHAN_WIDTH_80P80: vht_opclass = 130; break; case NL80211_CHAN_WIDTH_10: case NL80211_CHAN_WIDTH_5: return false; /* unsupported for now */ default: vht_opclass = 0; break; } /* 5 GHz, channels 36..48 */ if (freq >= 5180 && freq <= 5240) { if (vht_opclass) { *op_class = vht_opclass; } else if (chandef->width == NL80211_CHAN_WIDTH_40) { if (freq > chandef->chan->center_freq) *op_class = 116; else *op_class = 117; } else { *op_class = 115; } return true; } /* 5 GHz, channels 52..64 */ if (freq >= 5260 && freq <= 5320) { if (vht_opclass) { *op_class = vht_opclass; } else if (chandef->width == NL80211_CHAN_WIDTH_40) { if (freq > chandef->chan->center_freq) *op_class = 119; else *op_class = 120; } else { *op_class = 118; } return true; } /* 5 GHz, channels 100..144 */ if (freq >= 5500 && freq <= 5720) { if (vht_opclass) { *op_class = vht_opclass; } else if (chandef->width == NL80211_CHAN_WIDTH_40) { if (freq > chandef->chan->center_freq) *op_class = 122; else *op_class = 123; } else { *op_class = 121; } return true; } /* 5 GHz, channels 149..169 */ if (freq >= 5745 && freq <= 5845) { if (vht_opclass) { *op_class = vht_opclass; } else if (chandef->width == NL80211_CHAN_WIDTH_40) { if (freq > chandef->chan->center_freq) *op_class = 126; else *op_class = 127; } else if (freq <= 5805) { *op_class = 124; } else { *op_class = 125; } return true; } /* 56.16 GHz, channel 1..4 */ if (freq >= 56160 + 2160 * 1 && freq <= 56160 + 2160 * 6) { if (chandef->width >= NL80211_CHAN_WIDTH_40) return false; *op_class = 180; return true; } /* not supported yet */ return false; } EXPORT_SYMBOL(ieee80211_chandef_to_operating_class); static int cfg80211_wdev_bi(struct wireless_dev *wdev) { switch (wdev->iftype) { case NL80211_IFTYPE_AP: case NL80211_IFTYPE_P2P_GO: WARN_ON(wdev->valid_links); return wdev->links[0].ap.beacon_interval; case NL80211_IFTYPE_MESH_POINT: return wdev->u.mesh.beacon_interval; case NL80211_IFTYPE_ADHOC: return wdev->u.ibss.beacon_interval; default: break; } return 0; } static void cfg80211_calculate_bi_data(struct wiphy *wiphy, u32 new_beacon_int, u32 *beacon_int_gcd, bool *beacon_int_different, int radio_idx) { struct cfg80211_registered_device *rdev; struct wireless_dev *wdev; *beacon_int_gcd = 0; *beacon_int_different = false; rdev = wiphy_to_rdev(wiphy); list_for_each_entry(wdev, &wiphy->wdev_list, list) { int wdev_bi; /* this feature isn't supported with MLO */ if (wdev->valid_links) continue; /* skip wdevs not active on the given wiphy radio */ if (radio_idx >= 0 && !(rdev_get_radio_mask(rdev, wdev->netdev) & BIT(radio_idx))) continue; wdev_bi = cfg80211_wdev_bi(wdev); if (!wdev_bi) continue; if (!*beacon_int_gcd) { *beacon_int_gcd = wdev_bi; continue; } if (wdev_bi == *beacon_int_gcd) continue; *beacon_int_different = true; *beacon_int_gcd = gcd(*beacon_int_gcd, wdev_bi); } if (new_beacon_int && *beacon_int_gcd != new_beacon_int) { if (*beacon_int_gcd) *beacon_int_different = true; *beacon_int_gcd = gcd(*beacon_int_gcd, new_beacon_int); } } int cfg80211_validate_beacon_int(struct cfg80211_registered_device *rdev, enum nl80211_iftype iftype, u32 beacon_int) { /* * This is just a basic pre-condition check; 
if interface combinations * are possible the driver must already be checking those with a call * to cfg80211_check_combinations(), in which case we'll validate more * through the cfg80211_calculate_bi_data() call and code in * cfg80211_iter_combinations(). */ if (beacon_int < 10 || beacon_int > 10000) return -EINVAL; return 0; } int cfg80211_iter_combinations(struct wiphy *wiphy, struct iface_combination_params *params, void (*iter)(const struct ieee80211_iface_combination *c, void *data), void *data) { const struct wiphy_radio *radio = NULL; const struct ieee80211_iface_combination *c, *cs; const struct ieee80211_regdomain *regdom; enum nl80211_dfs_regions region = 0; int i, j, n, iftype; int num_interfaces = 0; u32 used_iftypes = 0; u32 beacon_int_gcd; bool beacon_int_different; if (params->radio_idx >= 0) radio = &wiphy->radio[params->radio_idx]; /* * This is a bit strange, since the iteration used to rely only on * the data given by the driver, but here it now relies on context, * in form of the currently operating interfaces. * This is OK for all current users, and saves us from having to * push the GCD calculations into all the drivers. * In the future, this should probably rely more on data that's in * cfg80211 already - the only thing not would appear to be any new * interfaces (while being brought up) and channel/radar data. */ cfg80211_calculate_bi_data(wiphy, params->new_beacon_int, &beacon_int_gcd, &beacon_int_different, params->radio_idx); if (params->radar_detect) { rcu_read_lock(); regdom = rcu_dereference(cfg80211_regdomain); if (regdom) region = regdom->dfs_region; rcu_read_unlock(); } for (iftype = 0; iftype < NUM_NL80211_IFTYPES; iftype++) { num_interfaces += params->iftype_num[iftype]; if (params->iftype_num[iftype] > 0 && !cfg80211_iftype_allowed(wiphy, iftype, 0, 1)) used_iftypes |= BIT(iftype); } if (radio) { cs = radio->iface_combinations; n = radio->n_iface_combinations; } else { cs = wiphy->iface_combinations; n = wiphy->n_iface_combinations; } for (i = 0; i < n; i++) { struct ieee80211_iface_limit *limits; u32 all_iftypes = 0; c = &cs[i]; if (num_interfaces > c->max_interfaces) continue; if (params->num_different_channels > c->num_different_channels) continue; limits = kmemdup_array(c->limits, c->n_limits, sizeof(*limits), GFP_KERNEL); if (!limits) return -ENOMEM; for (iftype = 0; iftype < NUM_NL80211_IFTYPES; iftype++) { if (cfg80211_iftype_allowed(wiphy, iftype, 0, 1)) continue; for (j = 0; j < c->n_limits; j++) { all_iftypes |= limits[j].types; if (!(limits[j].types & BIT(iftype))) continue; if (limits[j].max < params->iftype_num[iftype]) goto cont; limits[j].max -= params->iftype_num[iftype]; } } if (params->radar_detect != (c->radar_detect_widths & params->radar_detect)) goto cont; if (params->radar_detect && c->radar_detect_regions && !(c->radar_detect_regions & BIT(region))) goto cont; /* Finally check that all iftypes that we're currently * using are actually part of this combination. If they * aren't then we can't use this combination and have * to continue to the next. */ if ((all_iftypes & used_iftypes) != used_iftypes) goto cont; if (beacon_int_gcd) { if (c->beacon_int_min_gcd && beacon_int_gcd < c->beacon_int_min_gcd) goto cont; if (!c->beacon_int_min_gcd && beacon_int_different) goto cont; } /* This combination covered all interface types and * supported the requested numbers, so we're good. 
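 * Each matching combination is reported through the callback below;
 * cfg80211_check_combinations(), for instance, just counts how many
 * times it fires and fails with -EBUSY if the count stays at zero.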
*/ (*iter)(c, data); cont: kfree(limits); } return 0; } EXPORT_SYMBOL(cfg80211_iter_combinations); static void cfg80211_iter_sum_ifcombs(const struct ieee80211_iface_combination *c, void *data) { int *num = data; (*num)++; } int cfg80211_check_combinations(struct wiphy *wiphy, struct iface_combination_params *params) { int err, num = 0; err = cfg80211_iter_combinations(wiphy, params, cfg80211_iter_sum_ifcombs, &num); if (err) return err; if (num == 0) return -EBUSY; return 0; } EXPORT_SYMBOL(cfg80211_check_combinations); int ieee80211_get_ratemask(struct ieee80211_supported_band *sband, const u8 *rates, unsigned int n_rates, u32 *mask) { int i, j; if (!sband) return -EINVAL; if (n_rates == 0 || n_rates > NL80211_MAX_SUPP_RATES) return -EINVAL; *mask = 0; for (i = 0; i < n_rates; i++) { int rate = (rates[i] & 0x7f) * 5; bool found = false; for (j = 0; j < sband->n_bitrates; j++) { if (sband->bitrates[j].bitrate == rate) { found = true; *mask |= BIT(j); break; } } if (!found) return -EINVAL; } /* * mask must have at least one bit set here since we * didn't accept a 0-length rates array nor allowed * entries in the array that didn't exist */ return 0; } unsigned int ieee80211_get_num_supported_channels(struct wiphy *wiphy) { enum nl80211_band band; unsigned int n_channels = 0; for (band = 0; band < NUM_NL80211_BANDS; band++) if (wiphy->bands[band]) n_channels += wiphy->bands[band]->n_channels; return n_channels; } EXPORT_SYMBOL(ieee80211_get_num_supported_channels); int cfg80211_get_station(struct net_device *dev, const u8 *mac_addr, struct station_info *sinfo) { struct cfg80211_registered_device *rdev; struct wireless_dev *wdev; int ret; wdev = dev->ieee80211_ptr; if (!wdev) return -EOPNOTSUPP; rdev = wiphy_to_rdev(wdev->wiphy); if (!rdev->ops->get_station) return -EOPNOTSUPP; memset(sinfo, 0, sizeof(*sinfo)); wiphy_lock(&rdev->wiphy); ret = rdev_get_station(rdev, dev, mac_addr, sinfo); wiphy_unlock(&rdev->wiphy); return ret; } EXPORT_SYMBOL(cfg80211_get_station); void cfg80211_free_nan_func(struct cfg80211_nan_func *f) { int i; if (!f) return; kfree(f->serv_spec_info); kfree(f->srf_bf); kfree(f->srf_macs); for (i = 0; i < f->num_rx_filters; i++) kfree(f->rx_filters[i].filter); for (i = 0; i < f->num_tx_filters; i++) kfree(f->tx_filters[i].filter); kfree(f->rx_filters); kfree(f->tx_filters); kfree(f); } EXPORT_SYMBOL(cfg80211_free_nan_func); bool cfg80211_does_bw_fit_range(const struct ieee80211_freq_range *freq_range, u32 center_freq_khz, u32 bw_khz) { u32 start_freq_khz, end_freq_khz; start_freq_khz = center_freq_khz - (bw_khz / 2); end_freq_khz = center_freq_khz + (bw_khz / 2); if (start_freq_khz >= freq_range->start_freq_khz && end_freq_khz <= freq_range->end_freq_khz) return true; return false; } int cfg80211_sinfo_alloc_tid_stats(struct station_info *sinfo, gfp_t gfp) { sinfo->pertid = kcalloc(IEEE80211_NUM_TIDS + 1, sizeof(*(sinfo->pertid)), gfp); if (!sinfo->pertid) return -ENOMEM; return 0; } EXPORT_SYMBOL(cfg80211_sinfo_alloc_tid_stats); /* See IEEE 802.1H for LLC/SNAP encapsulation/decapsulation */ /* Ethernet-II snap header (RFC1042 for most EtherTypes) */ const unsigned char rfc1042_header[] __aligned(2) = { 0xaa, 0xaa, 0x03, 0x00, 0x00, 0x00 }; EXPORT_SYMBOL(rfc1042_header); /* Bridge-Tunnel header (for EtherTypes ETH_P_AARP and ETH_P_IPX) */ const unsigned char bridge_tunnel_header[] __aligned(2) = { 0xaa, 0xaa, 0x03, 0x00, 0x00, 0xf8 }; EXPORT_SYMBOL(bridge_tunnel_header); /* Layer 2 Update frame (802.2 Type 1 LLC XID Update response) */ struct iapp_layer2_update { u8 
da[ETH_ALEN]; /* broadcast */ u8 sa[ETH_ALEN]; /* STA addr */ __be16 len; /* 6 */ u8 dsap; /* 0 */ u8 ssap; /* 0 */ u8 control; u8 xid_info[3]; } __packed; void cfg80211_send_layer2_update(struct net_device *dev, const u8 *addr) { struct iapp_layer2_update *msg; struct sk_buff *skb; /* Send Level 2 Update Frame to update forwarding tables in layer 2 * bridge devices */ skb = dev_alloc_skb(sizeof(*msg)); if (!skb) return; msg = skb_put(skb, sizeof(*msg)); /* 802.2 Type 1 Logical Link Control (LLC) Exchange Identifier (XID) * Update response frame; IEEE Std 802.2-1998, 5.4.1.2.1 */ eth_broadcast_addr(msg->da); ether_addr_copy(msg->sa, addr); msg->len = htons(6); msg->dsap = 0; msg->ssap = 0x01; /* NULL LSAP, CR Bit: Response */ msg->control = 0xaf; /* XID response lsb.1111F101. * F=0 (no poll command; unsolicited frame) */ msg->xid_info[0] = 0x81; /* XID format identifier */ msg->xid_info[1] = 1; /* LLC types/classes: Type 1 LLC */ msg->xid_info[2] = 0; /* XID sender's receive window size (RW) */ skb->dev = dev; skb->protocol = eth_type_trans(skb, dev); memset(skb->cb, 0, sizeof(skb->cb)); netif_rx(skb); } EXPORT_SYMBOL(cfg80211_send_layer2_update); int ieee80211_get_vht_max_nss(struct ieee80211_vht_cap *cap, enum ieee80211_vht_chanwidth bw, int mcs, bool ext_nss_bw_capable, unsigned int max_vht_nss) { u16 map = le16_to_cpu(cap->supp_mcs.rx_mcs_map); int ext_nss_bw; int supp_width; int i, mcs_encoding; if (map == 0xffff) return 0; if (WARN_ON(mcs > 9 || max_vht_nss > 8)) return 0; if (mcs <= 7) mcs_encoding = 0; else if (mcs == 8) mcs_encoding = 1; else mcs_encoding = 2; if (!max_vht_nss) { /* find max_vht_nss for the given MCS */ for (i = 7; i >= 0; i--) { int supp = (map >> (2 * i)) & 3; if (supp == 3) continue; if (supp >= mcs_encoding) { max_vht_nss = i + 1; break; } } } if (!(cap->supp_mcs.tx_mcs_map & cpu_to_le16(IEEE80211_VHT_EXT_NSS_BW_CAPABLE))) return max_vht_nss; ext_nss_bw = le32_get_bits(cap->vht_cap_info, IEEE80211_VHT_CAP_EXT_NSS_BW_MASK); supp_width = le32_get_bits(cap->vht_cap_info, IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK); /* if not capable, treat ext_nss_bw as 0 */ if (!ext_nss_bw_capable) ext_nss_bw = 0; /* This is invalid */ if (supp_width == 3) return 0; /* This is an invalid combination so pretend nothing is supported */ if (supp_width == 2 && (ext_nss_bw == 1 || ext_nss_bw == 2)) return 0; /* * Cover all the special cases according to IEEE 802.11-2016 * Table 9-250. All other cases are either factor of 1 or not * valid/supported. 
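 * For example, supp_width == 0 with ext_nss_bw == 3 means 160 MHz is
 * usable at three quarters of the normal maximum NSS (e.g. 3 spatial
 * streams when max_vht_nss is 4), the (3 * max_vht_nss) / 4 case below.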
*/ switch (bw) { case IEEE80211_VHT_CHANWIDTH_USE_HT: case IEEE80211_VHT_CHANWIDTH_80MHZ: if ((supp_width == 1 || supp_width == 2) && ext_nss_bw == 3) return 2 * max_vht_nss; break; case IEEE80211_VHT_CHANWIDTH_160MHZ: if (supp_width == 0 && (ext_nss_bw == 1 || ext_nss_bw == 2)) return max_vht_nss / 2; if (supp_width == 0 && ext_nss_bw == 3) return (3 * max_vht_nss) / 4; if (supp_width == 1 && ext_nss_bw == 3) return 2 * max_vht_nss; break; case IEEE80211_VHT_CHANWIDTH_80P80MHZ: if (supp_width == 0 && ext_nss_bw == 1) return 0; /* not possible */ if (supp_width == 0 && ext_nss_bw == 2) return max_vht_nss / 2; if (supp_width == 0 && ext_nss_bw == 3) return (3 * max_vht_nss) / 4; if (supp_width == 1 && ext_nss_bw == 0) return 0; /* not possible */ if (supp_width == 1 && ext_nss_bw == 1) return max_vht_nss / 2; if (supp_width == 1 && ext_nss_bw == 2) return (3 * max_vht_nss) / 4; break; } /* not covered or invalid combination received */ return max_vht_nss; } EXPORT_SYMBOL(ieee80211_get_vht_max_nss); bool cfg80211_iftype_allowed(struct wiphy *wiphy, enum nl80211_iftype iftype, bool is_4addr, u8 check_swif) { bool is_vlan = iftype == NL80211_IFTYPE_AP_VLAN; switch (check_swif) { case 0: if (is_vlan && is_4addr) return wiphy->flags & WIPHY_FLAG_4ADDR_AP; return wiphy->interface_modes & BIT(iftype); case 1: if (!(wiphy->software_iftypes & BIT(iftype)) && is_vlan) return wiphy->flags & WIPHY_FLAG_4ADDR_AP; return wiphy->software_iftypes & BIT(iftype); default: break; } return false; } EXPORT_SYMBOL(cfg80211_iftype_allowed); void cfg80211_remove_link(struct wireless_dev *wdev, unsigned int link_id) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); lockdep_assert_wiphy(wdev->wiphy); switch (wdev->iftype) { case NL80211_IFTYPE_AP: case NL80211_IFTYPE_P2P_GO: cfg80211_stop_ap(rdev, wdev->netdev, link_id, true); break; default: /* per-link not relevant */ break; } rdev_del_intf_link(rdev, wdev, link_id); wdev->valid_links &= ~BIT(link_id); eth_zero_addr(wdev->links[link_id].addr); } void cfg80211_remove_links(struct wireless_dev *wdev) { unsigned int link_id; /* * links are controlled by upper layers (userspace/cfg) * only for AP mode, so only remove them here for AP */ if (wdev->iftype != NL80211_IFTYPE_AP) return; if (wdev->valid_links) { for_each_valid_link(wdev, link_id) cfg80211_remove_link(wdev, link_id); } } int cfg80211_remove_virtual_intf(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev) { cfg80211_remove_links(wdev); return rdev_del_virtual_intf(rdev, wdev); } const struct wiphy_iftype_ext_capab * cfg80211_get_iftype_ext_capa(struct wiphy *wiphy, enum nl80211_iftype type) { int i; for (i = 0; i < wiphy->num_iftype_ext_capab; i++) { if (wiphy->iftype_ext_capab[i].iftype == type) return &wiphy->iftype_ext_capab[i]; } return NULL; } EXPORT_SYMBOL(cfg80211_get_iftype_ext_capa); static bool ieee80211_radio_freq_range_valid(const struct wiphy_radio *radio, u32 freq, u32 width) { const struct wiphy_radio_freq_range *r; int i; for (i = 0; i < radio->n_freq_range; i++) { r = &radio->freq_range[i]; if (freq - width / 2 >= r->start_freq && freq + width / 2 <= r->end_freq) return true; } return false; } bool cfg80211_radio_chandef_valid(const struct wiphy_radio *radio, const struct cfg80211_chan_def *chandef) { u32 freq, width; freq = ieee80211_chandef_to_khz(chandef); width = nl80211_chan_width_to_mhz(chandef->width); if (!ieee80211_radio_freq_range_valid(radio, freq, width)) return false; freq = MHZ_TO_KHZ(chandef->center_freq2); if (freq && 
!ieee80211_radio_freq_range_valid(radio, freq, width)) return false; return true; } EXPORT_SYMBOL(cfg80211_radio_chandef_valid); bool cfg80211_wdev_channel_allowed(struct wireless_dev *wdev, struct ieee80211_channel *chan) { struct wiphy *wiphy = wdev->wiphy; const struct wiphy_radio *radio; struct cfg80211_chan_def chandef; u32 radio_mask; int i; radio_mask = wdev->radio_mask; if (!wiphy->n_radio || radio_mask == BIT(wiphy->n_radio) - 1) return true; cfg80211_chandef_create(&chandef, chan, NL80211_CHAN_HT20); for (i = 0; i < wiphy->n_radio; i++) { if (!(radio_mask & BIT(i))) continue; radio = &wiphy->radio[i]; if (!cfg80211_radio_chandef_valid(radio, &chandef)) continue; return true; } return false; } EXPORT_SYMBOL(cfg80211_wdev_channel_allowed);
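The helpers above repeatedly reduce to one containment test: a channel of a given width fits a frequency range only if both edges of the channel lie inside that range (see cfg80211_does_bw_fit_range() and ieee80211_radio_freq_range_valid()). The short program below restates that test as a self-contained user-space sketch; the struct and function names (freq_range, bw_fits_range) are local stand-ins for illustration, not kernel symbols.

/* Standalone illustration of the bandwidth-fit test: a channel of width
 * bw_khz centred on center_freq_khz fits a range iff both edges lie inside.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct freq_range {
	uint32_t start_freq_khz;
	uint32_t end_freq_khz;
};

static bool bw_fits_range(const struct freq_range *r,
			  uint32_t center_freq_khz, uint32_t bw_khz)
{
	uint32_t start = center_freq_khz - bw_khz / 2;
	uint32_t end = center_freq_khz + bw_khz / 2;

	return start >= r->start_freq_khz && end <= r->end_freq_khz;
}

int main(void)
{
	/* 2.4 GHz ISM band, 2400-2483.5 MHz, expressed in kHz */
	struct freq_range ism = { 2400000, 2483500 };

	/* Channel 6 (2437 MHz) fits at 20 MHz but not at 160 MHz. */
	printf("20 MHz fits: %d, 160 MHz fits: %d\n",
	       bw_fits_range(&ism, 2437000, 20000),
	       bw_fits_range(&ism, 2437000, 160000));
	return 0;
}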
/* SPDX-License-Identifier: GPL-2.0-only */ #ifndef _DCCP_H #define _DCCP_H /* * net/dccp/dccp.h * * An implementation of the DCCP protocol * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br> * Copyright (c) 2005-6 Ian McDonald <ian.mcdonald@jandi.co.nz> */ #include <linux/dccp.h> #include <linux/ktime.h> #include <net/snmp.h> #include <net/sock.h> #include <net/tcp.h> #include "ackvec.h" /* * DCCP - specific warning and debugging macros. */ #define DCCP_WARN(fmt, ...) \ net_warn_ratelimited("%s: " fmt, __func__, ##__VA_ARGS__) #define DCCP_CRIT(fmt, a...) printk(KERN_CRIT fmt " at %s:%d/%s()\n", ##a, \ __FILE__, __LINE__, __func__) #define DCCP_BUG(a...) do { DCCP_CRIT("BUG: " a); dump_stack(); } while(0) #define DCCP_BUG_ON(cond) do { if (unlikely((cond) != 0)) \ DCCP_BUG("\"%s\" holds (exception!)", \ __stringify(cond)); \ } while (0) #define DCCP_PRINTK(enable, fmt, args...) do { if (enable) \ printk(fmt, ##args); \ } while(0) #define DCCP_PR_DEBUG(enable, fmt, a...) DCCP_PRINTK(enable, KERN_DEBUG \ "%s: " fmt, __func__, ##a) #ifdef CONFIG_IP_DCCP_DEBUG extern bool dccp_debug; #define dccp_pr_debug(format, a...) DCCP_PR_DEBUG(dccp_debug, format, ##a) #define dccp_pr_debug_cat(format, a...) DCCP_PRINTK(dccp_debug, format, ##a) #define dccp_debug(fmt, a...) dccp_pr_debug_cat(KERN_DEBUG fmt, ##a) #else #define dccp_pr_debug(format, a...) do {} while (0) #define dccp_pr_debug_cat(format, a...) do {} while (0) #define dccp_debug(format, a...)
do {} while (0) #endif extern struct inet_hashinfo dccp_hashinfo; DECLARE_PER_CPU(unsigned int, dccp_orphan_count); void dccp_time_wait(struct sock *sk, int state, int timeo); /* * Set safe upper bounds for header and option length. Since Data Offset is 8 * bits (RFC 4340, sec. 5.1), the total header length can never be more than * 4 * 255 = 1020 bytes. The largest possible header length is 28 bytes (X=1): * - DCCP-Response with ACK Subheader and 4 bytes of Service code OR * - DCCP-Reset with ACK Subheader and 4 bytes of Reset Code fields * Hence a safe upper bound for the maximum option length is 1020-28 = 992 */ #define MAX_DCCP_SPECIFIC_HEADER (255 * sizeof(uint32_t)) #define DCCP_MAX_PACKET_HDR 28 #define DCCP_MAX_OPT_LEN (MAX_DCCP_SPECIFIC_HEADER - DCCP_MAX_PACKET_HDR) #define MAX_DCCP_HEADER (MAX_DCCP_SPECIFIC_HEADER + MAX_HEADER) /* Upper bound for initial feature-negotiation overhead (padded to 32 bits) */ #define DCCP_FEATNEG_OVERHEAD (32 * sizeof(uint32_t)) #define DCCP_TIMEWAIT_LEN (60 * HZ) /* how long to wait to destroy TIME-WAIT * state, about 60 seconds */ /* RFC 1122, 4.2.3.1 initial RTO value */ #define DCCP_TIMEOUT_INIT ((unsigned int)(3 * HZ)) /* * The maximum back-off value for retransmissions. This is needed for * - retransmitting client-Requests (sec. 8.1.1), * - retransmitting Close/CloseReq when closing (sec. 8.3), * - feature-negotiation retransmission (sec. 6.6.3), * - Acks in client-PARTOPEN state (sec. 8.1.5). */ #define DCCP_RTO_MAX ((unsigned int)(64 * HZ)) /* * RTT sampling: sanity bounds and fallback RTT value from RFC 4340, section 3.4 */ #define DCCP_SANE_RTT_MIN 100 #define DCCP_FALLBACK_RTT (USEC_PER_SEC / 5) #define DCCP_SANE_RTT_MAX (3 * USEC_PER_SEC) /* sysctl variables for DCCP */ extern int sysctl_dccp_request_retries; extern int sysctl_dccp_retries1; extern int sysctl_dccp_retries2; extern int sysctl_dccp_tx_qlen; extern int sysctl_dccp_sync_ratelimit; /* * 48-bit sequence number arithmetic (signed and unsigned) */ #define INT48_MIN 0x800000000000LL /* 2^47 */ #define UINT48_MAX 0xFFFFFFFFFFFFLL /* 2^48 - 1 */ #define COMPLEMENT48(x) (0x1000000000000LL - (x)) /* 2^48 - x */ #define TO_SIGNED48(x) (((x) < INT48_MIN)? (x) : -COMPLEMENT48( (x))) #define TO_UNSIGNED48(x) (((x) >= 0)? (x) : COMPLEMENT48(-(x))) #define ADD48(a, b) (((a) + (b)) & UINT48_MAX) #define SUB48(a, b) ADD48((a), COMPLEMENT48(b)) static inline void dccp_inc_seqno(u64 *seqno) { *seqno = ADD48(*seqno, 1); } /* signed mod-2^48 distance: pos. if seqno1 < seqno2, neg. if seqno1 > seqno2 */ static inline s64 dccp_delta_seqno(const u64 seqno1, const u64 seqno2) { u64 delta = SUB48(seqno2, seqno1); return TO_SIGNED48(delta); } /* is seq1 < seq2 ? */ static inline int before48(const u64 seq1, const u64 seq2) { return (s64)((seq2 << 16) - (seq1 << 16)) > 0; } /* is seq1 > seq2 ? */ #define after48(seq1, seq2) before48(seq2, seq1) /* is seq2 <= seq1 <= seq3 ? */ static inline int between48(const u64 seq1, const u64 seq2, const u64 seq3) { return (seq3 << 16) - (seq2 << 16) >= (seq1 << 16) - (seq2 << 16); } /** * dccp_loss_count - Approximate the number of lost data packets in a burst loss * @s1: last known sequence number before the loss ('hole') * @s2: first sequence number seen after the 'hole' * @ndp: NDP count on packet with sequence number @s2 */ static inline u64 dccp_loss_count(const u64 s1, const u64 s2, const u64 ndp) { s64 delta = dccp_delta_seqno(s1, s2); WARN_ON(delta < 0); delta -= ndp + 1; return delta > 0 ? 
delta : 0; } /** * dccp_loss_free - Evaluate condition for data loss from RFC 4340, 7.7.1 */ static inline bool dccp_loss_free(const u64 s1, const u64 s2, const u64 ndp) { return dccp_loss_count(s1, s2, ndp) == 0; } enum { DCCP_MIB_NUM = 0, DCCP_MIB_ACTIVEOPENS, /* ActiveOpens */ DCCP_MIB_ESTABRESETS, /* EstabResets */ DCCP_MIB_CURRESTAB, /* CurrEstab */ DCCP_MIB_OUTSEGS, /* OutSegs */ DCCP_MIB_OUTRSTS, DCCP_MIB_ABORTONTIMEOUT, DCCP_MIB_TIMEOUTS, DCCP_MIB_ABORTFAILED, DCCP_MIB_PASSIVEOPENS, DCCP_MIB_ATTEMPTFAILS, DCCP_MIB_OUTDATAGRAMS, DCCP_MIB_INERRS, DCCP_MIB_OPTMANDATORYERROR, DCCP_MIB_INVALIDOPT, __DCCP_MIB_MAX }; #define DCCP_MIB_MAX __DCCP_MIB_MAX struct dccp_mib { unsigned long mibs[DCCP_MIB_MAX]; }; DECLARE_SNMP_STAT(struct dccp_mib, dccp_statistics); #define DCCP_INC_STATS(field) SNMP_INC_STATS(dccp_statistics, field) #define __DCCP_INC_STATS(field) __SNMP_INC_STATS(dccp_statistics, field) #define DCCP_DEC_STATS(field) SNMP_DEC_STATS(dccp_statistics, field) /* * Checksumming routines */ static inline unsigned int dccp_csum_coverage(const struct sk_buff *skb) { const struct dccp_hdr* dh = dccp_hdr(skb); if (dh->dccph_cscov == 0) return skb->len; return (dh->dccph_doff + dh->dccph_cscov - 1) * sizeof(u32); } static inline void dccp_csum_outgoing(struct sk_buff *skb) { unsigned int cov = dccp_csum_coverage(skb); if (cov >= skb->len) dccp_hdr(skb)->dccph_cscov = 0; skb->csum = skb_checksum(skb, 0, (cov > skb->len)? skb->len : cov, 0); } void dccp_v4_send_check(struct sock *sk, struct sk_buff *skb); int dccp_retransmit_skb(struct sock *sk); void dccp_send_ack(struct sock *sk); void dccp_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, struct request_sock *rsk); void dccp_send_sync(struct sock *sk, const u64 seq, const enum dccp_pkt_type pkt_type); /* * TX Packet Dequeueing Interface */ void dccp_qpolicy_push(struct sock *sk, struct sk_buff *skb); bool dccp_qpolicy_full(struct sock *sk); void dccp_qpolicy_drop(struct sock *sk, struct sk_buff *skb); struct sk_buff *dccp_qpolicy_top(struct sock *sk); struct sk_buff *dccp_qpolicy_pop(struct sock *sk); bool dccp_qpolicy_param_ok(struct sock *sk, __be32 param); /* * TX Packet Output and TX Timers */ void dccp_write_xmit(struct sock *sk); void dccp_write_space(struct sock *sk); void dccp_flush_write_queue(struct sock *sk, long *time_budget); void dccp_init_xmit_timers(struct sock *sk); static inline void dccp_clear_xmit_timers(struct sock *sk) { inet_csk_clear_xmit_timers(sk); } unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu); const char *dccp_packet_name(const int type); void dccp_set_state(struct sock *sk, const int state); void dccp_done(struct sock *sk); int dccp_reqsk_init(struct request_sock *rq, struct dccp_sock const *dp, struct sk_buff const *skb); int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb); struct sock *dccp_create_openreq_child(const struct sock *sk, const struct request_sock *req, const struct sk_buff *skb); int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb); struct sock *dccp_v4_request_recv_sock(const struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct dst_entry *dst, struct request_sock *req_unhash, bool *own_req); struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb, struct request_sock *req); int dccp_child_process(struct sock *parent, struct sock *child, struct sk_buff *skb); int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, struct dccp_hdr *dh, unsigned int len); int dccp_rcv_established(struct sock *sk, struct sk_buff *skb, const struct 
dccp_hdr *dh, const unsigned int len); void dccp_destruct_common(struct sock *sk); int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized); void dccp_destroy_sock(struct sock *sk); void dccp_close(struct sock *sk, long timeout); struct sk_buff *dccp_make_response(const struct sock *sk, struct dst_entry *dst, struct request_sock *req); int dccp_connect(struct sock *sk); int dccp_disconnect(struct sock *sk, int flags); int dccp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); int dccp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); int dccp_ioctl(struct sock *sk, int cmd, int *karg); int dccp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); int dccp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len); void dccp_shutdown(struct sock *sk, int how); int inet_dccp_listen(struct socket *sock, int backlog); __poll_t dccp_poll(struct file *file, struct socket *sock, poll_table *wait); int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); void dccp_req_err(struct sock *sk, u64 seq); struct sk_buff *dccp_ctl_make_reset(struct sock *sk, struct sk_buff *skb); int dccp_send_reset(struct sock *sk, enum dccp_reset_codes code); void dccp_send_close(struct sock *sk, const int active); int dccp_invalid_packet(struct sk_buff *skb); u32 dccp_sample_rtt(struct sock *sk, long delta); static inline bool dccp_bad_service_code(const struct sock *sk, const __be32 service) { const struct dccp_sock *dp = dccp_sk(sk); if (dp->dccps_service == service) return false; return !dccp_list_has_service(dp->dccps_service_list, service); } /** * dccp_skb_cb - DCCP per-packet control information * @dccpd_type: one of %dccp_pkt_type (or unknown) * @dccpd_ccval: CCVal field (5.1), see e.g. RFC 4342, 8.1 * @dccpd_reset_code: one of %dccp_reset_codes * @dccpd_reset_data: Data1..3 fields (depend on @dccpd_reset_code) * @dccpd_opt_len: total length of all options (5.8) in the packet * @dccpd_seq: sequence number * @dccpd_ack_seq: acknowledgment number subheader field value * * This is used for transmission as well as for reception. */ struct dccp_skb_cb { union { struct inet_skb_parm h4; #if IS_ENABLED(CONFIG_IPV6) struct inet6_skb_parm h6; #endif } header; __u8 dccpd_type:4; __u8 dccpd_ccval:4; __u8 dccpd_reset_code, dccpd_reset_data[3]; __u16 dccpd_opt_len; __u64 dccpd_seq; __u64 dccpd_ack_seq; }; #define DCCP_SKB_CB(__skb) ((struct dccp_skb_cb *)&((__skb)->cb[0])) /* RFC 4340, sec. 7.7 */ static inline int dccp_non_data_packet(const struct sk_buff *skb) { const __u8 type = DCCP_SKB_CB(skb)->dccpd_type; return type == DCCP_PKT_ACK || type == DCCP_PKT_CLOSE || type == DCCP_PKT_CLOSEREQ || type == DCCP_PKT_RESET || type == DCCP_PKT_SYNC || type == DCCP_PKT_SYNCACK; } /* RFC 4340, sec. 
7.7 */ static inline int dccp_data_packet(const struct sk_buff *skb) { const __u8 type = DCCP_SKB_CB(skb)->dccpd_type; return type == DCCP_PKT_DATA || type == DCCP_PKT_DATAACK || type == DCCP_PKT_REQUEST || type == DCCP_PKT_RESPONSE; } static inline int dccp_packet_without_ack(const struct sk_buff *skb) { const __u8 type = DCCP_SKB_CB(skb)->dccpd_type; return type == DCCP_PKT_DATA || type == DCCP_PKT_REQUEST; } #define DCCP_PKT_WITHOUT_ACK_SEQ (UINT48_MAX << 2) static inline void dccp_hdr_set_seq(struct dccp_hdr *dh, const u64 gss) { struct dccp_hdr_ext *dhx = (struct dccp_hdr_ext *)((void *)dh + sizeof(*dh)); dh->dccph_seq2 = 0; dh->dccph_seq = htons((gss >> 32) & 0xfffff); dhx->dccph_seq_low = htonl(gss & 0xffffffff); } static inline void dccp_hdr_set_ack(struct dccp_hdr_ack_bits *dhack, const u64 gsr) { dhack->dccph_reserved1 = 0; dhack->dccph_ack_nr_high = htons(gsr >> 32); dhack->dccph_ack_nr_low = htonl(gsr & 0xffffffff); } static inline void dccp_update_gsr(struct sock *sk, u64 seq) { struct dccp_sock *dp = dccp_sk(sk); if (after48(seq, dp->dccps_gsr)) dp->dccps_gsr = seq; /* Sequence validity window depends on remote Sequence Window (7.5.1) */ dp->dccps_swl = SUB48(ADD48(dp->dccps_gsr, 1), dp->dccps_r_seq_win / 4); /* * Adjust SWL so that it is not below ISR. In contrast to RFC 4340, * 7.5.1 we perform this check beyond the initial handshake: W/W' are * always > 32, so for the first W/W' packets in the lifetime of a * connection we always have to adjust SWL. * A second reason why we are doing this is that the window depends on * the feature-remote value of Sequence Window: nothing stops the peer * from updating this value while we are busy adjusting SWL for the * first W packets (we would have to count from scratch again then). * Therefore it is safer to always make sure that the Sequence Window * is not artificially extended by a peer who grows SWL downwards by * continually updating the feature-remote Sequence-Window. * If sequence numbers wrap it is bad luck. But that will take a while * (48 bit), and this measure prevents Sequence-number attacks. 
*/ if (before48(dp->dccps_swl, dp->dccps_isr)) dp->dccps_swl = dp->dccps_isr; dp->dccps_swh = ADD48(dp->dccps_gsr, (3 * dp->dccps_r_seq_win) / 4); } static inline void dccp_update_gss(struct sock *sk, u64 seq) { struct dccp_sock *dp = dccp_sk(sk); dp->dccps_gss = seq; /* Ack validity window depends on local Sequence Window value (7.5.1) */ dp->dccps_awl = SUB48(ADD48(dp->dccps_gss, 1), dp->dccps_l_seq_win); /* Adjust AWL so that it is not below ISS - see comment above for SWL */ if (before48(dp->dccps_awl, dp->dccps_iss)) dp->dccps_awl = dp->dccps_iss; dp->dccps_awh = dp->dccps_gss; } static inline int dccp_ackvec_pending(const struct sock *sk) { return dccp_sk(sk)->dccps_hc_rx_ackvec != NULL && !dccp_ackvec_is_empty(dccp_sk(sk)->dccps_hc_rx_ackvec); } static inline int dccp_ack_pending(const struct sock *sk) { return dccp_ackvec_pending(sk) || inet_csk_ack_scheduled(sk); } int dccp_feat_signal_nn_change(struct sock *sk, u8 feat, u64 nn_val); int dccp_feat_finalise_settings(struct dccp_sock *dp); int dccp_feat_server_ccid_dependencies(struct dccp_request_sock *dreq); int dccp_feat_insert_opts(struct dccp_sock*, struct dccp_request_sock*, struct sk_buff *skb); int dccp_feat_activate_values(struct sock *sk, struct list_head *fn); void dccp_feat_list_purge(struct list_head *fn_list); int dccp_insert_options(struct sock *sk, struct sk_buff *skb); int dccp_insert_options_rsk(struct dccp_request_sock *, struct sk_buff *); u32 dccp_timestamp(void); void dccp_timestamping_init(void); int dccp_insert_option(struct sk_buff *skb, unsigned char option, const void *value, unsigned char len); #ifdef CONFIG_SYSCTL int dccp_sysctl_init(void); void dccp_sysctl_exit(void); #else static inline int dccp_sysctl_init(void) { return 0; } static inline void dccp_sysctl_exit(void) { } #endif #endif /* _DCCP_H */
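The sequence-number handling above leans entirely on the mod-2^48 macros (ADD48, SUB48, TO_SIGNED48) and the shift-by-16 comparison trick in before48(). The standalone program below copies those definitions out of the header and demonstrates the wraparound behaviour they are designed for; it is an illustration only, with local typedefs standing in for the kernel's u64/s64.

/* Demonstrate mod-2^48 sequence arithmetic: one step past UINT48_MAX wraps
 * to 0, yet still compares as "after" its predecessor with delta +1.
 */
#include <stdio.h>

typedef unsigned long long u64;
typedef long long s64;

#define INT48_MIN	0x800000000000LL	/* 2^47 */
#define UINT48_MAX	0xFFFFFFFFFFFFLL	/* 2^48 - 1 */
#define COMPLEMENT48(x)	(0x1000000000000LL - (x))	/* 2^48 - x */
#define TO_SIGNED48(x)	(((x) < INT48_MIN) ? (x) : -COMPLEMENT48((x)))
#define ADD48(a, b)	(((a) + (b)) & UINT48_MAX)
#define SUB48(a, b)	ADD48((a), COMPLEMENT48(b))

static s64 delta_seqno(u64 seqno1, u64 seqno2)
{
	u64 delta = SUB48(seqno2, seqno1);

	return TO_SIGNED48(delta);
}

static int before48(u64 seq1, u64 seq2)
{
	return (s64)((seq2 << 16) - (seq1 << 16)) > 0;
}

int main(void)
{
	u64 gss = UINT48_MAX;		/* highest 48-bit sequence number */
	u64 next = ADD48(gss, 1);	/* wraps around to 0 */

	/* prints: next=0 delta=1 before=1 */
	printf("next=%llu delta=%lld before=%d\n",
	       next, delta_seqno(gss, next), before48(gss, next));
	return 0;
}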
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __ASM_X86_XSAVE_H #define __ASM_X86_XSAVE_H #include <linux/uaccess.h> #include <linux/types.h> #include <asm/processor.h> #include <asm/fpu/api.h> #include <asm/user.h> /* Bit 63 of XCR0 is reserved for future expansion */ #define XFEATURE_MASK_EXTEND (~(XFEATURE_MASK_FPSSE | (1ULL << 63))) #define XSTATE_CPUID 0x0000000d #define TILE_CPUID 0x0000001d #define FXSAVE_SIZE 512 #define XSAVE_HDR_SIZE 64 #define XSAVE_HDR_OFFSET FXSAVE_SIZE #define XSAVE_YMM_SIZE 256 #define XSAVE_YMM_OFFSET (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET) #define XSAVE_ALIGNMENT 64 /* All currently supported user features */ #define XFEATURE_MASK_USER_SUPPORTED (XFEATURE_MASK_FP | \ XFEATURE_MASK_SSE | \ XFEATURE_MASK_YMM | \ XFEATURE_MASK_OPMASK | \ XFEATURE_MASK_ZMM_Hi256 | \ XFEATURE_MASK_Hi16_ZMM | \ XFEATURE_MASK_PKRU | \ XFEATURE_MASK_BNDREGS | \ XFEATURE_MASK_BNDCSR | \ XFEATURE_MASK_XTILE) /* * Features which are restored when returning to user space. * PKRU is not restored on return to user space because PKRU * is switched eagerly in switch_to() and flush_thread() */ #define XFEATURE_MASK_USER_RESTORE \ (XFEATURE_MASK_USER_SUPPORTED & ~XFEATURE_MASK_PKRU) /* Features which are dynamically enabled for a process on request */ #define XFEATURE_MASK_USER_DYNAMIC XFEATURE_MASK_XTILE_DATA /* All currently supported supervisor features */ #define XFEATURE_MASK_SUPERVISOR_SUPPORTED (XFEATURE_MASK_PASID | \ XFEATURE_MASK_CET_USER) /* * A supervisor state component may not always contain valuable information, * and its size may be huge. Saving/restoring such supervisor state components * at each context switch can cause high CPU and space overhead, which should * be avoided. Such supervisor state components should only be saved/restored * on demand. The on-demand supervisor features are set in this mask. * * Unlike the existing supported supervisor features, an independent supervisor * feature does not allocate a buffer in task->fpu, and the corresponding * supervisor state component cannot be saved/restored at each context switch. * * To support an independent supervisor feature, a developer should follow the * dos and don'ts as below: * - Do dynamically allocate a buffer for the supervisor state component. * - Do manually invoke the XSAVES/XRSTORS instruction to save/restore the * state component to/from the buffer. * - Don't set the bit corresponding to the independent supervisor feature in * IA32_XSS at run time, since it has been set at boot time. */ #define XFEATURE_MASK_INDEPENDENT (XFEATURE_MASK_LBR) /* * Unsupported supervisor features. When a supervisor feature in this mask is * supported in the future, move it to the supported supervisor feature mask. */ #define XFEATURE_MASK_SUPERVISOR_UNSUPPORTED (XFEATURE_MASK_PT | \ XFEATURE_MASK_CET_KERNEL) /* All supervisor states including supported and unsupported states.
*/ #define XFEATURE_MASK_SUPERVISOR_ALL (XFEATURE_MASK_SUPERVISOR_SUPPORTED | \ XFEATURE_MASK_INDEPENDENT | \ XFEATURE_MASK_SUPERVISOR_UNSUPPORTED) /* * The feature mask required to restore FPU state: * - All user states which are not eagerly switched in switch_to()/exec() * - The supervisor states */ #define XFEATURE_MASK_FPSTATE (XFEATURE_MASK_USER_RESTORE | \ XFEATURE_MASK_SUPERVISOR_SUPPORTED) /* * Features in this mask have space allocated in the signal frame, but may not * have that space initialized when the feature is in its init state. */ #define XFEATURE_MASK_SIGFRAME_INITOPT (XFEATURE_MASK_XTILE | \ XFEATURE_MASK_USER_DYNAMIC) extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; extern void __init update_regset_xstate_info(unsigned int size, u64 xstate_mask); int xfeature_size(int xfeature_nr); void xsaves(struct xregs_state *xsave, u64 mask); void xrstors(struct xregs_state *xsave, u64 mask); int xfd_enable_feature(u64 xfd_err); #ifdef CONFIG_X86_64 DECLARE_STATIC_KEY_FALSE(__fpu_state_size_dynamic); #endif #ifdef CONFIG_X86_64 DECLARE_STATIC_KEY_FALSE(__fpu_state_size_dynamic); static __always_inline __pure bool fpu_state_size_dynamic(void) { return static_branch_unlikely(&__fpu_state_size_dynamic); } #else static __always_inline __pure bool fpu_state_size_dynamic(void) { return false; } #endif #endif
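The XFEATURE_MASK_* definitions above are built from the architectural XSAVE component numbers (bit 0 = x87 FP, 1 = SSE, 2 = AVX/YMM state, 5-7 = AVX-512 state, 9 = PKRU, and so on). As a rough user-space illustration of that bit layout, the sketch below reads XCR0 with the XGETBV intrinsic and decodes a few of those bits. It is not derived from the kernel header; it assumes a toolchain providing <immintrin.h> (build with -mxsave) and a CPU/OS with OSXSAVE enabled, otherwise XGETBV faults.

/* Decode a few user xstate component bits from XCR0 (see bit numbers above).
 * A robust program would first check the CPUID OSXSAVE flag.
 */
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t xcr0 = _xgetbv(0);	/* XCR0: OS-enabled user components */

	printf("XCR0 = %#llx\n", (unsigned long long)xcr0);
	printf("  x87 FP : %d\n", (int)(xcr0 >> 0 & 1));
	printf("  SSE    : %d\n", (int)(xcr0 >> 1 & 1));
	printf("  AVX    : %d\n", (int)(xcr0 >> 2 & 1));
	printf("  AVX-512: %d\n", (int)((xcr0 >> 5 & 7) == 7));
	printf("  PKRU   : %d\n", (int)(xcr0 >> 9 & 1));
	return 0;
}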
// SPDX-License-Identifier: GPL-2.0-or-later /* * Helpers for initial module or kernel cmdline parsing * Copyright (C) 2001 Rusty Russell. */ #include <linux/ctype.h> #include <linux/device.h> #include <linux/err.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/kstrtox.h> #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/overflow.h> #include <linux/security.h> #include <linux/slab.h> #include <linux/string.h> #ifdef CONFIG_SYSFS /* Protects all built-in parameters, modules use their own param_lock */ static DEFINE_MUTEX(param_lock); /* Use the module's mutex, or if built-in use the built-in mutex */ #ifdef CONFIG_MODULES #define KPARAM_MUTEX(mod) ((mod) ? &(mod)->param_lock : &param_lock) #else #define KPARAM_MUTEX(mod) (&param_lock) #endif static inline void check_kparam_locked(struct module *mod) { BUG_ON(!mutex_is_locked(KPARAM_MUTEX(mod))); } #else static inline void check_kparam_locked(struct module *mod) { } #endif /* !CONFIG_SYSFS */ /* This just allows us to keep track of which parameters are kmalloced. */ struct kmalloced_param { struct list_head list; char val[]; }; static LIST_HEAD(kmalloced_params); static DEFINE_SPINLOCK(kmalloced_params_lock); static void *kmalloc_parameter(unsigned int size) { struct kmalloced_param *p; p = kmalloc(size_add(sizeof(*p), size), GFP_KERNEL); if (!p) return NULL; spin_lock(&kmalloced_params_lock); list_add(&p->list, &kmalloced_params); spin_unlock(&kmalloced_params_lock); return p->val; } /* Does nothing if parameter wasn't kmalloced above. */ static void maybe_kfree_parameter(void *param) { struct kmalloced_param *p; spin_lock(&kmalloced_params_lock); list_for_each_entry(p, &kmalloced_params, list) { if (p->val == param) { list_del(&p->list); kfree(p); break; } } spin_unlock(&kmalloced_params_lock); } static char dash2underscore(char c) { if (c == '-') return '_'; return c; } bool parameqn(const char *a, const char *b, size_t n) { size_t i; for (i = 0; i < n; i++) { if (dash2underscore(a[i]) != dash2underscore(b[i])) return false; } return true; } bool parameq(const char *a, const char *b) { return parameqn(a, b, strlen(a)+1); } static bool param_check_unsafe(const struct kernel_param *kp) { if (kp->flags & KERNEL_PARAM_FL_HWPARAM && security_locked_down(LOCKDOWN_MODULE_PARAMETERS)) return false; if (kp->flags & KERNEL_PARAM_FL_UNSAFE) { pr_notice("Setting dangerous option %s - tainting kernel\n", kp->name); add_taint(TAINT_USER, LOCKDEP_STILL_OK); } return true; } static int parse_one(char *param, char *val, const char *doing, const struct kernel_param *params, unsigned num_params, s16 min_level, s16 max_level, void *arg, parse_unknown_fn handle_unknown) { unsigned int i; int err; /* Find parameter */ for (i = 0; i < num_params; i++) { if (parameq(param, params[i].name)) { if (params[i].level < min_level || params[i].level > max_level) return 0; /* No one handled NULL, so do it here.
*/ if (!val && !(params[i].ops->flags & KERNEL_PARAM_OPS_FL_NOARG)) return -EINVAL; pr_debug("handling %s with %p\n", param, params[i].ops->set); kernel_param_lock(params[i].mod); if (param_check_unsafe(&params[i])) err = params[i].ops->set(val, &params[i]); else err = -EPERM; kernel_param_unlock(params[i].mod); return err; } } if (handle_unknown) { pr_debug("doing %s: %s='%s'\n", doing, param, val); return handle_unknown(param, val, doing, arg); } pr_debug("Unknown argument '%s'\n", param); return -ENOENT; } /* Args looks like "foo=bar,bar2 baz=fuz wiz". */ char *parse_args(const char *doing, char *args, const struct kernel_param *params, unsigned num, s16 min_level, s16 max_level, void *arg, parse_unknown_fn unknown) { char *param, *val, *err = NULL; /* Chew leading spaces */ args = skip_spaces(args); if (*args) pr_debug("doing %s, parsing ARGS: '%s'\n", doing, args); while (*args) { int ret; int irq_was_disabled; args = next_arg(args, &param, &val); /* Stop at -- */ if (!val && strcmp(param, "--") == 0) return err ?: args; irq_was_disabled = irqs_disabled(); ret = parse_one(param, val, doing, params, num, min_level, max_level, arg, unknown); if (irq_was_disabled && !irqs_disabled()) pr_warn("%s: option '%s' enabled irq's!\n", doing, param); switch (ret) { case 0: continue; case -ENOENT: pr_err("%s: Unknown parameter `%s'\n", doing, param); break; case -ENOSPC: pr_err("%s: `%s' too large for parameter `%s'\n", doing, val ?: "", param); break; default: pr_err("%s: `%s' invalid for parameter `%s'\n", doing, val ?: "", param); break; } err = ERR_PTR(ret); } return err; } /* Lazy bastard, eh? */ #define STANDARD_PARAM_DEF(name, type, format, strtolfn) \ int param_set_##name(const char *val, const struct kernel_param *kp) \ { \ return strtolfn(val, 0, (type *)kp->arg); \ } \ int param_get_##name(char *buffer, const struct kernel_param *kp) \ { \ return scnprintf(buffer, PAGE_SIZE, format "\n", \ *((type *)kp->arg)); \ } \ const struct kernel_param_ops param_ops_##name = { \ .set = param_set_##name, \ .get = param_get_##name, \ }; \ EXPORT_SYMBOL(param_set_##name); \ EXPORT_SYMBOL(param_get_##name); \ EXPORT_SYMBOL(param_ops_##name) STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", kstrtou8); STANDARD_PARAM_DEF(short, short, "%hi", kstrtos16); STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", kstrtou16); STANDARD_PARAM_DEF(int, int, "%i", kstrtoint); STANDARD_PARAM_DEF(uint, unsigned int, "%u", kstrtouint); STANDARD_PARAM_DEF(long, long, "%li", kstrtol); STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul); STANDARD_PARAM_DEF(ullong, unsigned long long, "%llu", kstrtoull); STANDARD_PARAM_DEF(hexint, unsigned int, "%#08x", kstrtouint); int param_set_uint_minmax(const char *val, const struct kernel_param *kp, unsigned int min, unsigned int max) { unsigned int num; int ret; if (!val) return -EINVAL; ret = kstrtouint(val, 0, &num); if (ret) return ret; if (num < min || num > max) return -EINVAL; *((unsigned int *)kp->arg) = num; return 0; } EXPORT_SYMBOL_GPL(param_set_uint_minmax); int param_set_charp(const char *val, const struct kernel_param *kp) { size_t len, maxlen = 1024; len = strnlen(val, maxlen + 1); if (len == maxlen + 1) { pr_err("%s: string parameter too long\n", kp->name); return -ENOSPC; } maybe_kfree_parameter(*(char **)kp->arg); /* * This is a hack. We can't kmalloc() in early boot, and we * don't need to; this mangled commandline is preserved. 
*/ if (slab_is_available()) { *(char **)kp->arg = kmalloc_parameter(len + 1); if (!*(char **)kp->arg) return -ENOMEM; strcpy(*(char **)kp->arg, val); } else *(const char **)kp->arg = val; return 0; } EXPORT_SYMBOL(param_set_charp); int param_get_charp(char *buffer, const struct kernel_param *kp) { return scnprintf(buffer, PAGE_SIZE, "%s\n", *((char **)kp->arg)); } EXPORT_SYMBOL(param_get_charp); void param_free_charp(void *arg) { maybe_kfree_parameter(*((char **)arg)); } EXPORT_SYMBOL(param_free_charp); const struct kernel_param_ops param_ops_charp = { .set = param_set_charp, .get = param_get_charp, .free = param_free_charp, }; EXPORT_SYMBOL(param_ops_charp); /* Actually could be a bool or an int, for historical reasons. */ int param_set_bool(const char *val, const struct kernel_param *kp) { /* No equals means "set"... */ if (!val) val = "1"; /* One of =[yYnN01] */ return kstrtobool(val, kp->arg); } EXPORT_SYMBOL(param_set_bool); int param_get_bool(char *buffer, const struct kernel_param *kp) { /* Y and N chosen as being relatively non-coder friendly */ return sprintf(buffer, "%c\n", *(bool *)kp->arg ? 'Y' : 'N'); } EXPORT_SYMBOL(param_get_bool); const struct kernel_param_ops param_ops_bool = { .flags = KERNEL_PARAM_OPS_FL_NOARG, .set = param_set_bool, .get = param_get_bool, }; EXPORT_SYMBOL(param_ops_bool); int param_set_bool_enable_only(const char *val, const struct kernel_param *kp) { int err; bool new_value; bool orig_value = *(bool *)kp->arg; struct kernel_param dummy_kp = *kp; dummy_kp.arg = &new_value; err = param_set_bool(val, &dummy_kp); if (err) return err; /* Don't let them unset it once it's set! */ if (!new_value && orig_value) return -EROFS; if (new_value) err = param_set_bool(val, kp); return err; } EXPORT_SYMBOL_GPL(param_set_bool_enable_only); const struct kernel_param_ops param_ops_bool_enable_only = { .flags = KERNEL_PARAM_OPS_FL_NOARG, .set = param_set_bool_enable_only, .get = param_get_bool, }; EXPORT_SYMBOL_GPL(param_ops_bool_enable_only); /* This one must be bool. */ int param_set_invbool(const char *val, const struct kernel_param *kp) { int ret; bool boolval; struct kernel_param dummy; dummy.arg = &boolval; ret = param_set_bool(val, &dummy); if (ret == 0) *(bool *)kp->arg = !boolval; return ret; } EXPORT_SYMBOL(param_set_invbool); int param_get_invbool(char *buffer, const struct kernel_param *kp) { return sprintf(buffer, "%c\n", (*(bool *)kp->arg) ? 'N' : 'Y'); } EXPORT_SYMBOL(param_get_invbool); const struct kernel_param_ops param_ops_invbool = { .set = param_set_invbool, .get = param_get_invbool, }; EXPORT_SYMBOL(param_ops_invbool); int param_set_bint(const char *val, const struct kernel_param *kp) { /* Match bool exactly, by re-using it. */ struct kernel_param boolkp = *kp; bool v; int ret; boolkp.arg = &v; ret = param_set_bool(val, &boolkp); if (ret == 0) *(int *)kp->arg = v; return ret; } EXPORT_SYMBOL(param_set_bint); const struct kernel_param_ops param_ops_bint = { .flags = KERNEL_PARAM_OPS_FL_NOARG, .set = param_set_bint, .get = param_get_int, }; EXPORT_SYMBOL(param_ops_bint); /* We break the rule and mangle the string. */ static int param_array(struct module *mod, const char *name, const char *val, unsigned int min, unsigned int max, void *elem, int elemsize, int (*set)(const char *, const struct kernel_param *kp), s16 level, unsigned int *num) { int ret; struct kernel_param kp; char save; /* Get the name right for errors. */ kp.name = name; kp.arg = elem; kp.level = level; *num = 0; /* We expect a comma-separated list of values. 
*/ do { int len; if (*num == max) { pr_err("%s: can only take %i arguments\n", name, max); return -EINVAL; } len = strcspn(val, ","); /* nul-terminate and parse */ save = val[len]; ((char *)val)[len] = '\0'; check_kparam_locked(mod); ret = set(val, &kp); if (ret != 0) return ret; kp.arg += elemsize; val += len+1; (*num)++; } while (save == ','); if (*num < min) { pr_err("%s: needs at least %i arguments\n", name, min); return -EINVAL; } return 0; } static int param_array_set(const char *val, const struct kernel_param *kp) { const struct kparam_array *arr = kp->arr; unsigned int temp_num; return param_array(kp->mod, kp->name, val, 1, arr->max, arr->elem, arr->elemsize, arr->ops->set, kp->level, arr->num ?: &temp_num); } static int param_array_get(char *buffer, const struct kernel_param *kp) { int i, off, ret; const struct kparam_array *arr = kp->arr; struct kernel_param p = *kp; for (i = off = 0; i < (arr->num ? *arr->num : arr->max); i++) { /* Replace \n with comma */ if (i) buffer[off - 1] = ','; p.arg = arr->elem + arr->elemsize * i; check_kparam_locked(p.mod); ret = arr->ops->get(buffer + off, &p); if (ret < 0) return ret; off += ret; } buffer[off] = '\0'; return off; } static void param_array_free(void *arg) { unsigned int i; const struct kparam_array *arr = arg; if (arr->ops->free) for (i = 0; i < (arr->num ? *arr->num : arr->max); i++) arr->ops->free(arr->elem + arr->elemsize * i); } const struct kernel_param_ops param_array_ops = { .set = param_array_set, .get = param_array_get, .free = param_array_free, }; EXPORT_SYMBOL(param_array_ops); int param_set_copystring(const char *val, const struct kernel_param *kp) { const struct kparam_string *kps = kp->str; if (strnlen(val, kps->maxlen) == kps->maxlen) { pr_err("%s: string doesn't fit in %u chars.\n", kp->name, kps->maxlen-1); return -ENOSPC; } strcpy(kps->string, val); return 0; } EXPORT_SYMBOL(param_set_copystring); int param_get_string(char *buffer, const struct kernel_param *kp) { const struct kparam_string *kps = kp->str; return scnprintf(buffer, PAGE_SIZE, "%s\n", kps->string); } EXPORT_SYMBOL(param_get_string); const struct kernel_param_ops param_ops_string = { .set = param_set_copystring, .get = param_get_string, }; EXPORT_SYMBOL(param_ops_string); /* sysfs output in /sys/modules/XYZ/parameters/ */ #define to_module_attr(n) container_of(n, struct module_attribute, attr) #define to_module_kobject(n) container_of(n, struct module_kobject, kobj) struct param_attribute { struct module_attribute mattr; const struct kernel_param *param; }; struct module_param_attrs { unsigned int num; struct attribute_group grp; struct param_attribute attrs[]; }; #ifdef CONFIG_SYSFS #define to_param_attr(n) container_of(n, struct param_attribute, mattr) static ssize_t param_attr_show(struct module_attribute *mattr, struct module_kobject *mk, char *buf) { int count; struct param_attribute *attribute = to_param_attr(mattr); if (!attribute->param->ops->get) return -EPERM; kernel_param_lock(mk->mod); count = attribute->param->ops->get(buf, attribute->param); kernel_param_unlock(mk->mod); return count; } /* sysfs always hands a nul-terminated string in buf. We rely on that. 
*/ static ssize_t param_attr_store(struct module_attribute *mattr, struct module_kobject *mk, const char *buf, size_t len) { int err; struct param_attribute *attribute = to_param_attr(mattr); if (!attribute->param->ops->set) return -EPERM; kernel_param_lock(mk->mod); if (param_check_unsafe(attribute->param)) err = attribute->param->ops->set(buf, attribute->param); else err = -EPERM; kernel_param_unlock(mk->mod); if (!err) return len; return err; } #endif #ifdef CONFIG_MODULES #define __modinit #else #define __modinit __init #endif #ifdef CONFIG_SYSFS void kernel_param_lock(struct module *mod) { mutex_lock(KPARAM_MUTEX(mod)); } void kernel_param_unlock(struct module *mod) { mutex_unlock(KPARAM_MUTEX(mod)); } EXPORT_SYMBOL(kernel_param_lock); EXPORT_SYMBOL(kernel_param_unlock); /* * add_sysfs_param - add a parameter to sysfs * @mk: struct module_kobject * @kp: the actual parameter definition to add to sysfs * @name: name of parameter * * Create a kobject if for a (per-module) parameter if mp NULL, and * create file in sysfs. Returns an error on out of memory. Always cleans up * if there's an error. */ static __modinit int add_sysfs_param(struct module_kobject *mk, const struct kernel_param *kp, const char *name) { struct module_param_attrs *new_mp; struct attribute **new_attrs; unsigned int i; /* We don't bother calling this with invisible parameters. */ BUG_ON(!kp->perm); if (!mk->mp) { /* First allocation. */ mk->mp = kzalloc(sizeof(*mk->mp), GFP_KERNEL); if (!mk->mp) return -ENOMEM; mk->mp->grp.name = "parameters"; /* NULL-terminated attribute array. */ mk->mp->grp.attrs = kzalloc(sizeof(mk->mp->grp.attrs[0]), GFP_KERNEL); /* Caller will cleanup via free_module_param_attrs */ if (!mk->mp->grp.attrs) return -ENOMEM; } /* Enlarge allocations. */ new_mp = krealloc(mk->mp, sizeof(*mk->mp) + sizeof(mk->mp->attrs[0]) * (mk->mp->num + 1), GFP_KERNEL); if (!new_mp) return -ENOMEM; mk->mp = new_mp; /* Extra pointer for NULL terminator */ new_attrs = krealloc(mk->mp->grp.attrs, sizeof(mk->mp->grp.attrs[0]) * (mk->mp->num + 2), GFP_KERNEL); if (!new_attrs) return -ENOMEM; mk->mp->grp.attrs = new_attrs; /* Tack new one on the end. */ memset(&mk->mp->attrs[mk->mp->num], 0, sizeof(mk->mp->attrs[0])); sysfs_attr_init(&mk->mp->attrs[mk->mp->num].mattr.attr); mk->mp->attrs[mk->mp->num].param = kp; mk->mp->attrs[mk->mp->num].mattr.show = param_attr_show; /* Do not allow runtime DAC changes to make param writable. 
*/ if ((kp->perm & (S_IWUSR | S_IWGRP | S_IWOTH)) != 0) mk->mp->attrs[mk->mp->num].mattr.store = param_attr_store; else mk->mp->attrs[mk->mp->num].mattr.store = NULL; mk->mp->attrs[mk->mp->num].mattr.attr.name = (char *)name; mk->mp->attrs[mk->mp->num].mattr.attr.mode = kp->perm; mk->mp->num++; /* Fix up all the pointers, since krealloc can move us */ for (i = 0; i < mk->mp->num; i++) mk->mp->grp.attrs[i] = &mk->mp->attrs[i].mattr.attr; mk->mp->grp.attrs[mk->mp->num] = NULL; return 0; } #ifdef CONFIG_MODULES static void free_module_param_attrs(struct module_kobject *mk) { if (mk->mp) kfree(mk->mp->grp.attrs); kfree(mk->mp); mk->mp = NULL; } /* * module_param_sysfs_setup - setup sysfs support for one module * @mod: module * @kparam: module parameters (array) * @num_params: number of module parameters * * Adds sysfs entries for module parameters under * /sys/module/[mod->name]/parameters/ */ int module_param_sysfs_setup(struct module *mod, const struct kernel_param *kparam, unsigned int num_params) { int i, err; bool params = false; for (i = 0; i < num_params; i++) { if (kparam[i].perm == 0) continue; err = add_sysfs_param(&mod->mkobj, &kparam[i], kparam[i].name); if (err) { free_module_param_attrs(&mod->mkobj); return err; } params = true; } if (!params) return 0; /* Create the param group. */ err = sysfs_create_group(&mod->mkobj.kobj, &mod->mkobj.mp->grp); if (err) free_module_param_attrs(&mod->mkobj); return err; } /* * module_param_sysfs_remove - remove sysfs support for one module * @mod: module * * Remove sysfs entries for module parameters and the corresponding * kobject. */ void module_param_sysfs_remove(struct module *mod) { if (mod->mkobj.mp) { sysfs_remove_group(&mod->mkobj.kobj, &mod->mkobj.mp->grp); /* * We are positive that no one is using any param * attrs at this point. Deallocate immediately. */ free_module_param_attrs(&mod->mkobj); } } #endif void destroy_params(const struct kernel_param *params, unsigned num) { unsigned int i; for (i = 0; i < num; i++) if (params[i].ops->free) params[i].ops->free(params[i].arg); } static struct module_kobject * __init locate_module_kobject(const char *name) { struct module_kobject *mk; struct kobject *kobj; int err; kobj = kset_find_obj(module_kset, name); if (kobj) { mk = to_module_kobject(kobj); } else { mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL); BUG_ON(!mk); mk->mod = THIS_MODULE; mk->kobj.kset = module_kset; err = kobject_init_and_add(&mk->kobj, &module_ktype, NULL, "%s", name); #ifdef CONFIG_MODULES if (!err) err = sysfs_create_file(&mk->kobj, &module_uevent.attr); #endif if (err) { kobject_put(&mk->kobj); pr_crit("Adding module '%s' to sysfs failed (%d), the system may be unstable.\n", name, err); return NULL; } /* So that we hold reference in both cases. */ kobject_get(&mk->kobj); } return mk; } static void __init kernel_add_sysfs_param(const char *name, const struct kernel_param *kparam, unsigned int name_skip) { struct module_kobject *mk; int err; mk = locate_module_kobject(name); if (!mk) return; /* We need to remove old parameters before adding more. */ if (mk->mp) sysfs_remove_group(&mk->kobj, &mk->mp->grp); /* These should not fail at boot. */ err = add_sysfs_param(mk, kparam, kparam->name + name_skip); BUG_ON(err); err = sysfs_create_group(&mk->kobj, &mk->mp->grp); BUG_ON(err); kobject_uevent(&mk->kobj, KOBJ_ADD); kobject_put(&mk->kobj); } /* * param_sysfs_builtin - add sysfs parameters for built-in modules * * Add module_parameters to sysfs for "modules" built into the kernel. 
* * The "module" name (KBUILD_MODNAME) is stored before a dot, the * "parameter" name is stored behind a dot in kernel_param->name. So, * extract the "module" name for all built-in kernel_param-eters, * and for all who have the same, call kernel_add_sysfs_param. */ static void __init param_sysfs_builtin(void) { const struct kernel_param *kp; unsigned int name_len; char modname[MODULE_NAME_LEN]; for (kp = __start___param; kp < __stop___param; kp++) { char *dot; if (kp->perm == 0) continue; dot = strchr(kp->name, '.'); if (!dot) { /* This happens for core_param() */ strcpy(modname, "kernel"); name_len = 0; } else { name_len = dot - kp->name + 1; strscpy(modname, kp->name, name_len); } kernel_add_sysfs_param(modname, kp, name_len); } } ssize_t __modver_version_show(struct module_attribute *mattr, struct module_kobject *mk, char *buf) { struct module_version_attribute *vattr = container_of(mattr, struct module_version_attribute, mattr); return scnprintf(buf, PAGE_SIZE, "%s\n", vattr->version); } extern const struct module_version_attribute __start___modver[]; extern const struct module_version_attribute __stop___modver[]; static void __init version_sysfs_builtin(void) { const struct module_version_attribute *vattr; struct module_kobject *mk; int err; for (vattr = __start___modver; vattr < __stop___modver; vattr++) { mk = locate_module_kobject(vattr->module_name); if (mk) { err = sysfs_create_file(&mk->kobj, &vattr->mattr.attr); WARN_ON_ONCE(err); kobject_uevent(&mk->kobj, KOBJ_ADD); kobject_put(&mk->kobj); } } } /* module-related sysfs stuff */ static ssize_t module_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { struct module_attribute *attribute; struct module_kobject *mk; int ret; attribute = to_module_attr(attr); mk = to_module_kobject(kobj); if (!attribute->show) return -EIO; ret = attribute->show(attribute, mk, buf); return ret; } static ssize_t module_attr_store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t len) { struct module_attribute *attribute; struct module_kobject *mk; int ret; attribute = to_module_attr(attr); mk = to_module_kobject(kobj); if (!attribute->store) return -EIO; ret = attribute->store(attribute, mk, buf, len); return ret; } static const struct sysfs_ops module_sysfs_ops = { .show = module_attr_show, .store = module_attr_store, }; static int uevent_filter(const struct kobject *kobj) { const struct kobj_type *ktype = get_ktype(kobj); if (ktype == &module_ktype) return 1; return 0; } static const struct kset_uevent_ops module_uevent_ops = { .filter = uevent_filter, }; struct kset *module_kset; static void module_kobj_release(struct kobject *kobj) { struct module_kobject *mk = to_module_kobject(kobj); complete(mk->kobj_completion); } const struct kobj_type module_ktype = { .release = module_kobj_release, .sysfs_ops = &module_sysfs_ops, }; /* * param_sysfs_init - create "module" kset * * This must be done before the initramfs is unpacked and * request_module() thus becomes possible, because otherwise the * module load would fail in mod_sysfs_init. 
*/ static int __init param_sysfs_init(void) { module_kset = kset_create_and_add("module", &module_uevent_ops, NULL); if (!module_kset) { printk(KERN_WARNING "%s (%d): error creating kset\n", __FILE__, __LINE__); return -ENOMEM; } return 0; } subsys_initcall(param_sysfs_init); /* * param_sysfs_builtin_init - add sysfs version and parameter * attributes for built-in modules */ static int __init param_sysfs_builtin_init(void) { if (!module_kset) return -ENOMEM; version_sysfs_builtin(); param_sysfs_builtin(); return 0; } late_initcall(param_sysfs_builtin_init); #endif /* CONFIG_SYSFS */
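For context, the code above is the provider side of module_param(): the ops tables parse values from the boot command line or module arguments, and add_sysfs_param()/param_sysfs_builtin() expose them under /sys/module/<name>/parameters/. A minimal, hypothetical consumer (the "demo" module name and its parameters are invented for illustration, not taken from the tree) might look like this:

/* Hypothetical "demo" module declaring two parameters handled by params.c. */
#include <linux/init.h>
#include <linux/module.h>
#include <linux/moduleparam.h>

static int threshold = 10;
module_param(threshold, int, 0644);	/* writable: gets param_attr_store() */
MODULE_PARM_DESC(threshold, "Demo threshold (load time or sysfs)");

static char *label = "default";
module_param(label, charp, 0444);	/* read-only in sysfs */
MODULE_PARM_DESC(label, "Demo string parameter");

static int __init demo_init(void)
{
	pr_info("demo: threshold=%d label=%s\n", threshold, label);
	return 0;
}

static void __exit demo_exit(void)
{
	pr_info("demo: unloaded\n");
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

Loading such a module with an argument like threshold=42, or booting with demo.threshold=42 if it were built in, would go through parse_args()/parse_one() above; a later write to /sys/module/demo/parameters/threshold would go through param_attr_store(), which is only wired up because the permission bits include a write bit.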
/* FCrypt encryption algorithm * * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * * Based on code: * * Copyright (c) 1995 - 2000 Kungliga Tekniska Högskolan * (Royal Institute of Technology, Stockholm, Sweden). * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * 3. Neither the name of the Institute nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE INSTITUTE AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED.
IN NO EVENT SHALL THE INSTITUTE OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include <asm/byteorder.h> #include <crypto/algapi.h> #include <linux/bitops.h> #include <linux/init.h> #include <linux/module.h> #define ROUNDS 16 struct fcrypt_ctx { __be32 sched[ROUNDS]; }; /* Rotate right two 32 bit numbers as a 56 bit number */ #define ror56(hi, lo, n) \ do { \ u32 t = lo & ((1 << n) - 1); \ lo = (lo >> n) | ((hi & ((1 << n) - 1)) << (32 - n)); \ hi = (hi >> n) | (t << (24-n)); \ } while (0) /* Rotate right one 64 bit number as a 56 bit number */ #define ror56_64(k, n) (k = (k >> n) | ((k & ((1 << n) - 1)) << (56 - n))) /* * Sboxes for Feistel network derived from * /afs/transarc.com/public/afsps/afs.rel31b.export-src/rxkad/sboxes.h */ #undef Z #define Z(x) cpu_to_be32(x << 3) static const __be32 sbox0[256] = { Z(0xea), Z(0x7f), Z(0xb2), Z(0x64), Z(0x9d), Z(0xb0), Z(0xd9), Z(0x11), Z(0xcd), Z(0x86), Z(0x86), Z(0x91), Z(0x0a), Z(0xb2), Z(0x93), Z(0x06), Z(0x0e), Z(0x06), Z(0xd2), Z(0x65), Z(0x73), Z(0xc5), Z(0x28), Z(0x60), Z(0xf2), Z(0x20), Z(0xb5), Z(0x38), Z(0x7e), Z(0xda), Z(0x9f), Z(0xe3), Z(0xd2), Z(0xcf), Z(0xc4), Z(0x3c), Z(0x61), Z(0xff), Z(0x4a), Z(0x4a), Z(0x35), Z(0xac), Z(0xaa), Z(0x5f), Z(0x2b), Z(0xbb), Z(0xbc), Z(0x53), Z(0x4e), Z(0x9d), Z(0x78), Z(0xa3), Z(0xdc), Z(0x09), Z(0x32), Z(0x10), Z(0xc6), Z(0x6f), Z(0x66), Z(0xd6), Z(0xab), Z(0xa9), Z(0xaf), Z(0xfd), Z(0x3b), Z(0x95), Z(0xe8), Z(0x34), Z(0x9a), Z(0x81), Z(0x72), Z(0x80), Z(0x9c), Z(0xf3), Z(0xec), Z(0xda), Z(0x9f), Z(0x26), Z(0x76), Z(0x15), Z(0x3e), Z(0x55), Z(0x4d), Z(0xde), Z(0x84), Z(0xee), Z(0xad), Z(0xc7), Z(0xf1), Z(0x6b), Z(0x3d), Z(0xd3), Z(0x04), Z(0x49), Z(0xaa), Z(0x24), Z(0x0b), Z(0x8a), Z(0x83), Z(0xba), Z(0xfa), Z(0x85), Z(0xa0), Z(0xa8), Z(0xb1), Z(0xd4), Z(0x01), Z(0xd8), Z(0x70), Z(0x64), Z(0xf0), Z(0x51), Z(0xd2), Z(0xc3), Z(0xa7), Z(0x75), Z(0x8c), Z(0xa5), Z(0x64), Z(0xef), Z(0x10), Z(0x4e), Z(0xb7), Z(0xc6), Z(0x61), Z(0x03), Z(0xeb), Z(0x44), Z(0x3d), Z(0xe5), Z(0xb3), Z(0x5b), Z(0xae), Z(0xd5), Z(0xad), Z(0x1d), Z(0xfa), Z(0x5a), Z(0x1e), Z(0x33), Z(0xab), Z(0x93), Z(0xa2), Z(0xb7), Z(0xe7), Z(0xa8), Z(0x45), Z(0xa4), Z(0xcd), Z(0x29), Z(0x63), Z(0x44), Z(0xb6), Z(0x69), Z(0x7e), Z(0x2e), Z(0x62), Z(0x03), Z(0xc8), Z(0xe0), Z(0x17), Z(0xbb), Z(0xc7), Z(0xf3), Z(0x3f), Z(0x36), Z(0xba), Z(0x71), Z(0x8e), Z(0x97), Z(0x65), Z(0x60), Z(0x69), Z(0xb6), Z(0xf6), Z(0xe6), Z(0x6e), Z(0xe0), Z(0x81), Z(0x59), Z(0xe8), Z(0xaf), Z(0xdd), Z(0x95), Z(0x22), Z(0x99), Z(0xfd), Z(0x63), Z(0x19), Z(0x74), Z(0x61), Z(0xb1), Z(0xb6), Z(0x5b), Z(0xae), Z(0x54), Z(0xb3), Z(0x70), Z(0xff), Z(0xc6), Z(0x3b), Z(0x3e), Z(0xc1), Z(0xd7), Z(0xe1), Z(0x0e), Z(0x76), Z(0xe5), Z(0x36), Z(0x4f), Z(0x59), Z(0xc7), Z(0x08), Z(0x6e), Z(0x82), Z(0xa6), Z(0x93), Z(0xc4), Z(0xaa), Z(0x26), Z(0x49), Z(0xe0), Z(0x21), Z(0x64), Z(0x07), Z(0x9f), Z(0x64), Z(0x81), Z(0x9c), Z(0xbf), Z(0xf9), Z(0xd1), Z(0x43), Z(0xf8), Z(0xb6), Z(0xb9), Z(0xf1), Z(0x24), Z(0x75), Z(0x03), Z(0xe4), Z(0xb0), Z(0x99), Z(0x46), Z(0x3d), Z(0xf5), Z(0xd1), Z(0x39), Z(0x72), Z(0x12), Z(0xf6), Z(0xba), Z(0x0c), Z(0x0d), 
Z(0x42), Z(0x2e) }; #undef Z #define Z(x) cpu_to_be32(((x & 0x1f) << 27) | (x >> 5)) static const __be32 sbox1[256] = { Z(0x77), Z(0x14), Z(0xa6), Z(0xfe), Z(0xb2), Z(0x5e), Z(0x8c), Z(0x3e), Z(0x67), Z(0x6c), Z(0xa1), Z(0x0d), Z(0xc2), Z(0xa2), Z(0xc1), Z(0x85), Z(0x6c), Z(0x7b), Z(0x67), Z(0xc6), Z(0x23), Z(0xe3), Z(0xf2), Z(0x89), Z(0x50), Z(0x9c), Z(0x03), Z(0xb7), Z(0x73), Z(0xe6), Z(0xe1), Z(0x39), Z(0x31), Z(0x2c), Z(0x27), Z(0x9f), Z(0xa5), Z(0x69), Z(0x44), Z(0xd6), Z(0x23), Z(0x83), Z(0x98), Z(0x7d), Z(0x3c), Z(0xb4), Z(0x2d), Z(0x99), Z(0x1c), Z(0x1f), Z(0x8c), Z(0x20), Z(0x03), Z(0x7c), Z(0x5f), Z(0xad), Z(0xf4), Z(0xfa), Z(0x95), Z(0xca), Z(0x76), Z(0x44), Z(0xcd), Z(0xb6), Z(0xb8), Z(0xa1), Z(0xa1), Z(0xbe), Z(0x9e), Z(0x54), Z(0x8f), Z(0x0b), Z(0x16), Z(0x74), Z(0x31), Z(0x8a), Z(0x23), Z(0x17), Z(0x04), Z(0xfa), Z(0x79), Z(0x84), Z(0xb1), Z(0xf5), Z(0x13), Z(0xab), Z(0xb5), Z(0x2e), Z(0xaa), Z(0x0c), Z(0x60), Z(0x6b), Z(0x5b), Z(0xc4), Z(0x4b), Z(0xbc), Z(0xe2), Z(0xaf), Z(0x45), Z(0x73), Z(0xfa), Z(0xc9), Z(0x49), Z(0xcd), Z(0x00), Z(0x92), Z(0x7d), Z(0x97), Z(0x7a), Z(0x18), Z(0x60), Z(0x3d), Z(0xcf), Z(0x5b), Z(0xde), Z(0xc6), Z(0xe2), Z(0xe6), Z(0xbb), Z(0x8b), Z(0x06), Z(0xda), Z(0x08), Z(0x15), Z(0x1b), Z(0x88), Z(0x6a), Z(0x17), Z(0x89), Z(0xd0), Z(0xa9), Z(0xc1), Z(0xc9), Z(0x70), Z(0x6b), Z(0xe5), Z(0x43), Z(0xf4), Z(0x68), Z(0xc8), Z(0xd3), Z(0x84), Z(0x28), Z(0x0a), Z(0x52), Z(0x66), Z(0xa3), Z(0xca), Z(0xf2), Z(0xe3), Z(0x7f), Z(0x7a), Z(0x31), Z(0xf7), Z(0x88), Z(0x94), Z(0x5e), Z(0x9c), Z(0x63), Z(0xd5), Z(0x24), Z(0x66), Z(0xfc), Z(0xb3), Z(0x57), Z(0x25), Z(0xbe), Z(0x89), Z(0x44), Z(0xc4), Z(0xe0), Z(0x8f), Z(0x23), Z(0x3c), Z(0x12), Z(0x52), Z(0xf5), Z(0x1e), Z(0xf4), Z(0xcb), Z(0x18), Z(0x33), Z(0x1f), Z(0xf8), Z(0x69), Z(0x10), Z(0x9d), Z(0xd3), Z(0xf7), Z(0x28), Z(0xf8), Z(0x30), Z(0x05), Z(0x5e), Z(0x32), Z(0xc0), Z(0xd5), Z(0x19), Z(0xbd), Z(0x45), Z(0x8b), Z(0x5b), Z(0xfd), Z(0xbc), Z(0xe2), Z(0x5c), Z(0xa9), Z(0x96), Z(0xef), Z(0x70), Z(0xcf), Z(0xc2), Z(0x2a), Z(0xb3), Z(0x61), Z(0xad), Z(0x80), Z(0x48), Z(0x81), Z(0xb7), Z(0x1d), Z(0x43), Z(0xd9), Z(0xd7), Z(0x45), Z(0xf0), Z(0xd8), Z(0x8a), Z(0x59), Z(0x7c), Z(0x57), Z(0xc1), Z(0x79), Z(0xc7), Z(0x34), Z(0xd6), Z(0x43), Z(0xdf), Z(0xe4), Z(0x78), Z(0x16), Z(0x06), Z(0xda), Z(0x92), Z(0x76), Z(0x51), Z(0xe1), Z(0xd4), Z(0x70), Z(0x03), Z(0xe0), Z(0x2f), Z(0x96), Z(0x91), Z(0x82), Z(0x80) }; #undef Z #define Z(x) cpu_to_be32(x << 11) static const __be32 sbox2[256] = { Z(0xf0), Z(0x37), Z(0x24), Z(0x53), Z(0x2a), Z(0x03), Z(0x83), Z(0x86), Z(0xd1), Z(0xec), Z(0x50), Z(0xf0), Z(0x42), Z(0x78), Z(0x2f), Z(0x6d), Z(0xbf), Z(0x80), Z(0x87), Z(0x27), Z(0x95), Z(0xe2), Z(0xc5), Z(0x5d), Z(0xf9), Z(0x6f), Z(0xdb), Z(0xb4), Z(0x65), Z(0x6e), Z(0xe7), Z(0x24), Z(0xc8), Z(0x1a), Z(0xbb), Z(0x49), Z(0xb5), Z(0x0a), Z(0x7d), Z(0xb9), Z(0xe8), Z(0xdc), Z(0xb7), Z(0xd9), Z(0x45), Z(0x20), Z(0x1b), Z(0xce), Z(0x59), Z(0x9d), Z(0x6b), Z(0xbd), Z(0x0e), Z(0x8f), Z(0xa3), Z(0xa9), Z(0xbc), Z(0x74), Z(0xa6), Z(0xf6), Z(0x7f), Z(0x5f), Z(0xb1), Z(0x68), Z(0x84), Z(0xbc), Z(0xa9), Z(0xfd), Z(0x55), Z(0x50), Z(0xe9), Z(0xb6), Z(0x13), Z(0x5e), Z(0x07), Z(0xb8), Z(0x95), Z(0x02), Z(0xc0), Z(0xd0), Z(0x6a), Z(0x1a), Z(0x85), Z(0xbd), Z(0xb6), Z(0xfd), Z(0xfe), Z(0x17), Z(0x3f), Z(0x09), Z(0xa3), Z(0x8d), Z(0xfb), Z(0xed), Z(0xda), Z(0x1d), Z(0x6d), Z(0x1c), Z(0x6c), Z(0x01), Z(0x5a), Z(0xe5), Z(0x71), Z(0x3e), Z(0x8b), Z(0x6b), Z(0xbe), Z(0x29), Z(0xeb), Z(0x12), Z(0x19), Z(0x34), Z(0xcd), Z(0xb3), Z(0xbd), Z(0x35), 
Z(0xea), Z(0x4b), Z(0xd5), Z(0xae), Z(0x2a), Z(0x79), Z(0x5a), Z(0xa5), Z(0x32), Z(0x12), Z(0x7b), Z(0xdc), Z(0x2c), Z(0xd0), Z(0x22), Z(0x4b), Z(0xb1), Z(0x85), Z(0x59), Z(0x80), Z(0xc0), Z(0x30), Z(0x9f), Z(0x73), Z(0xd3), Z(0x14), Z(0x48), Z(0x40), Z(0x07), Z(0x2d), Z(0x8f), Z(0x80), Z(0x0f), Z(0xce), Z(0x0b), Z(0x5e), Z(0xb7), Z(0x5e), Z(0xac), Z(0x24), Z(0x94), Z(0x4a), Z(0x18), Z(0x15), Z(0x05), Z(0xe8), Z(0x02), Z(0x77), Z(0xa9), Z(0xc7), Z(0x40), Z(0x45), Z(0x89), Z(0xd1), Z(0xea), Z(0xde), Z(0x0c), Z(0x79), Z(0x2a), Z(0x99), Z(0x6c), Z(0x3e), Z(0x95), Z(0xdd), Z(0x8c), Z(0x7d), Z(0xad), Z(0x6f), Z(0xdc), Z(0xff), Z(0xfd), Z(0x62), Z(0x47), Z(0xb3), Z(0x21), Z(0x8a), Z(0xec), Z(0x8e), Z(0x19), Z(0x18), Z(0xb4), Z(0x6e), Z(0x3d), Z(0xfd), Z(0x74), Z(0x54), Z(0x1e), Z(0x04), Z(0x85), Z(0xd8), Z(0xbc), Z(0x1f), Z(0x56), Z(0xe7), Z(0x3a), Z(0x56), Z(0x67), Z(0xd6), Z(0xc8), Z(0xa5), Z(0xf3), Z(0x8e), Z(0xde), Z(0xae), Z(0x37), Z(0x49), Z(0xb7), Z(0xfa), Z(0xc8), Z(0xf4), Z(0x1f), Z(0xe0), Z(0x2a), Z(0x9b), Z(0x15), Z(0xd1), Z(0x34), Z(0x0e), Z(0xb5), Z(0xe0), Z(0x44), Z(0x78), Z(0x84), Z(0x59), Z(0x56), Z(0x68), Z(0x77), Z(0xa5), Z(0x14), Z(0x06), Z(0xf5), Z(0x2f), Z(0x8c), Z(0x8a), Z(0x73), Z(0x80), Z(0x76), Z(0xb4), Z(0x10), Z(0x86) }; #undef Z #define Z(x) cpu_to_be32(x << 19) static const __be32 sbox3[256] = { Z(0xa9), Z(0x2a), Z(0x48), Z(0x51), Z(0x84), Z(0x7e), Z(0x49), Z(0xe2), Z(0xb5), Z(0xb7), Z(0x42), Z(0x33), Z(0x7d), Z(0x5d), Z(0xa6), Z(0x12), Z(0x44), Z(0x48), Z(0x6d), Z(0x28), Z(0xaa), Z(0x20), Z(0x6d), Z(0x57), Z(0xd6), Z(0x6b), Z(0x5d), Z(0x72), Z(0xf0), Z(0x92), Z(0x5a), Z(0x1b), Z(0x53), Z(0x80), Z(0x24), Z(0x70), Z(0x9a), Z(0xcc), Z(0xa7), Z(0x66), Z(0xa1), Z(0x01), Z(0xa5), Z(0x41), Z(0x97), Z(0x41), Z(0x31), Z(0x82), Z(0xf1), Z(0x14), Z(0xcf), Z(0x53), Z(0x0d), Z(0xa0), Z(0x10), Z(0xcc), Z(0x2a), Z(0x7d), Z(0xd2), Z(0xbf), Z(0x4b), Z(0x1a), Z(0xdb), Z(0x16), Z(0x47), Z(0xf6), Z(0x51), Z(0x36), Z(0xed), Z(0xf3), Z(0xb9), Z(0x1a), Z(0xa7), Z(0xdf), Z(0x29), Z(0x43), Z(0x01), Z(0x54), Z(0x70), Z(0xa4), Z(0xbf), Z(0xd4), Z(0x0b), Z(0x53), Z(0x44), Z(0x60), Z(0x9e), Z(0x23), Z(0xa1), Z(0x18), Z(0x68), Z(0x4f), Z(0xf0), Z(0x2f), Z(0x82), Z(0xc2), Z(0x2a), Z(0x41), Z(0xb2), Z(0x42), Z(0x0c), Z(0xed), Z(0x0c), Z(0x1d), Z(0x13), Z(0x3a), Z(0x3c), Z(0x6e), Z(0x35), Z(0xdc), Z(0x60), Z(0x65), Z(0x85), Z(0xe9), Z(0x64), Z(0x02), Z(0x9a), Z(0x3f), Z(0x9f), Z(0x87), Z(0x96), Z(0xdf), Z(0xbe), Z(0xf2), Z(0xcb), Z(0xe5), Z(0x6c), Z(0xd4), Z(0x5a), Z(0x83), Z(0xbf), Z(0x92), Z(0x1b), Z(0x94), Z(0x00), Z(0x42), Z(0xcf), Z(0x4b), Z(0x00), Z(0x75), Z(0xba), Z(0x8f), Z(0x76), Z(0x5f), Z(0x5d), Z(0x3a), Z(0x4d), Z(0x09), Z(0x12), Z(0x08), Z(0x38), Z(0x95), Z(0x17), Z(0xe4), Z(0x01), Z(0x1d), Z(0x4c), Z(0xa9), Z(0xcc), Z(0x85), Z(0x82), Z(0x4c), Z(0x9d), Z(0x2f), Z(0x3b), Z(0x66), Z(0xa1), Z(0x34), Z(0x10), Z(0xcd), Z(0x59), Z(0x89), Z(0xa5), Z(0x31), Z(0xcf), Z(0x05), Z(0xc8), Z(0x84), Z(0xfa), Z(0xc7), Z(0xba), Z(0x4e), Z(0x8b), Z(0x1a), Z(0x19), Z(0xf1), Z(0xa1), Z(0x3b), Z(0x18), Z(0x12), Z(0x17), Z(0xb0), Z(0x98), Z(0x8d), Z(0x0b), Z(0x23), Z(0xc3), Z(0x3a), Z(0x2d), Z(0x20), Z(0xdf), Z(0x13), Z(0xa0), Z(0xa8), Z(0x4c), Z(0x0d), Z(0x6c), Z(0x2f), Z(0x47), Z(0x13), Z(0x13), Z(0x52), Z(0x1f), Z(0x2d), Z(0xf5), Z(0x79), Z(0x3d), Z(0xa2), Z(0x54), Z(0xbd), Z(0x69), Z(0xc8), Z(0x6b), Z(0xf3), Z(0x05), Z(0x28), Z(0xf1), Z(0x16), Z(0x46), Z(0x40), Z(0xb0), Z(0x11), Z(0xd3), Z(0xb7), Z(0x95), Z(0x49), Z(0xcf), Z(0xc3), Z(0x1d), Z(0x8f), Z(0xd8), Z(0xe1), Z(0x73), Z(0xdb), Z(0xad), Z(0xc8), 
Z(0xc9), Z(0xa9), Z(0xa1), Z(0xc2), Z(0xc5), Z(0xe3), Z(0xba), Z(0xfc), Z(0x0e), Z(0x25) }; /* * This is a 16 round Feistel network with permutation F_ENCRYPT */ #define F_ENCRYPT(R, L, sched) \ do { \ union lc4 { __be32 l; u8 c[4]; } u; \ u.l = sched ^ R; \ L ^= sbox0[u.c[0]] ^ sbox1[u.c[1]] ^ sbox2[u.c[2]] ^ sbox3[u.c[3]]; \ } while (0) /* * encryptor */ static void fcrypt_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) { const struct fcrypt_ctx *ctx = crypto_tfm_ctx(tfm); struct { __be32 l, r; } X; memcpy(&X, src, sizeof(X)); F_ENCRYPT(X.r, X.l, ctx->sched[0x0]); F_ENCRYPT(X.l, X.r, ctx->sched[0x1]); F_ENCRYPT(X.r, X.l, ctx->sched[0x2]); F_ENCRYPT(X.l, X.r, ctx->sched[0x3]); F_ENCRYPT(X.r, X.l, ctx->sched[0x4]); F_ENCRYPT(X.l, X.r, ctx->sched[0x5]); F_ENCRYPT(X.r, X.l, ctx->sched[0x6]); F_ENCRYPT(X.l, X.r, ctx->sched[0x7]); F_ENCRYPT(X.r, X.l, ctx->sched[0x8]); F_ENCRYPT(X.l, X.r, ctx->sched[0x9]); F_ENCRYPT(X.r, X.l, ctx->sched[0xa]); F_ENCRYPT(X.l, X.r, ctx->sched[0xb]); F_ENCRYPT(X.r, X.l, ctx->sched[0xc]); F_ENCRYPT(X.l, X.r, ctx->sched[0xd]); F_ENCRYPT(X.r, X.l, ctx->sched[0xe]); F_ENCRYPT(X.l, X.r, ctx->sched[0xf]); memcpy(dst, &X, sizeof(X)); } /* * decryptor */ static void fcrypt_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) { const struct fcrypt_ctx *ctx = crypto_tfm_ctx(tfm); struct { __be32 l, r; } X; memcpy(&X, src, sizeof(X)); F_ENCRYPT(X.l, X.r, ctx->sched[0xf]); F_ENCRYPT(X.r, X.l, ctx->sched[0xe]); F_ENCRYPT(X.l, X.r, ctx->sched[0xd]); F_ENCRYPT(X.r, X.l, ctx->sched[0xc]); F_ENCRYPT(X.l, X.r, ctx->sched[0xb]); F_ENCRYPT(X.r, X.l, ctx->sched[0xa]); F_ENCRYPT(X.l, X.r, ctx->sched[0x9]); F_ENCRYPT(X.r, X.l, ctx->sched[0x8]); F_ENCRYPT(X.l, X.r, ctx->sched[0x7]); F_ENCRYPT(X.r, X.l, ctx->sched[0x6]); F_ENCRYPT(X.l, X.r, ctx->sched[0x5]); F_ENCRYPT(X.r, X.l, ctx->sched[0x4]); F_ENCRYPT(X.l, X.r, ctx->sched[0x3]); F_ENCRYPT(X.r, X.l, ctx->sched[0x2]); F_ENCRYPT(X.l, X.r, ctx->sched[0x1]); F_ENCRYPT(X.r, X.l, ctx->sched[0x0]); memcpy(dst, &X, sizeof(X)); } /* * Generate a key schedule from key, the least significant bit in each key byte * is parity and shall be ignored. This leaves 56 significant bits in the key * to scatter over the 16 key schedules. For each schedule extract the low * order 32 bits and use as schedule, then rotate right by 11 bits. 
*/ static int fcrypt_setkey(struct crypto_tfm *tfm, const u8 *key, unsigned int keylen) { struct fcrypt_ctx *ctx = crypto_tfm_ctx(tfm); #if BITS_PER_LONG == 64 /* the 64-bit version can also be used for 32-bit * kernels - it seems to be faster but the code is * larger */ u64 k; /* k holds all 56 non-parity bits */ /* discard the parity bits */ k = (*key++) >> 1; k <<= 7; k |= (*key++) >> 1; k <<= 7; k |= (*key++) >> 1; k <<= 7; k |= (*key++) >> 1; k <<= 7; k |= (*key++) >> 1; k <<= 7; k |= (*key++) >> 1; k <<= 7; k |= (*key++) >> 1; k <<= 7; k |= (*key) >> 1; /* Use lower 32 bits for schedule, rotate by 11 each round (16 times) */ ctx->sched[0x0] = cpu_to_be32(k); ror56_64(k, 11); ctx->sched[0x1] = cpu_to_be32(k); ror56_64(k, 11); ctx->sched[0x2] = cpu_to_be32(k); ror56_64(k, 11); ctx->sched[0x3] = cpu_to_be32(k); ror56_64(k, 11); ctx->sched[0x4] = cpu_to_be32(k); ror56_64(k, 11); ctx->sched[0x5] = cpu_to_be32(k); ror56_64(k, 11); ctx->sched[0x6] = cpu_to_be32(k); ror56_64(k, 11); ctx->sched[0x7] = cpu_to_be32(k); ror56_64(k, 11); ctx->sched[0x8] = cpu_to_be32(k); ror56_64(k, 11); ctx->sched[0x9] = cpu_to_be32(k); ror56_64(k, 11); ctx->sched[0xa] = cpu_to_be32(k); ror56_64(k, 11); ctx->sched[0xb] = cpu_to_be32(k); ror56_64(k, 11); ctx->sched[0xc] = cpu_to_be32(k); ror56_64(k, 11); ctx->sched[0xd] = cpu_to_be32(k); ror56_64(k, 11); ctx->sched[0xe] = cpu_to_be32(k); ror56_64(k, 11); ctx->sched[0xf] = cpu_to_be32(k); return 0; #else u32 hi, lo; /* hi is upper 24 bits and lo lower 32, total 56 */ /* discard the parity bits */ lo = (*key++) >> 1; lo <<= 7; lo |= (*key++) >> 1; lo <<= 7; lo |= (*key++) >> 1; lo <<= 7; lo |= (*key++) >> 1; hi = lo >> 4; lo &= 0xf; lo <<= 7; lo |= (*key++) >> 1; lo <<= 7; lo |= (*key++) >> 1; lo <<= 7; lo |= (*key++) >> 1; lo <<= 7; lo |= (*key) >> 1; /* Use lower 32 bits for schedule, rotate by 11 each round (16 times) */ ctx->sched[0x0] = cpu_to_be32(lo); ror56(hi, lo, 11); ctx->sched[0x1] = cpu_to_be32(lo); ror56(hi, lo, 11); ctx->sched[0x2] = cpu_to_be32(lo); ror56(hi, lo, 11); ctx->sched[0x3] = cpu_to_be32(lo); ror56(hi, lo, 11); ctx->sched[0x4] = cpu_to_be32(lo); ror56(hi, lo, 11); ctx->sched[0x5] = cpu_to_be32(lo); ror56(hi, lo, 11); ctx->sched[0x6] = cpu_to_be32(lo); ror56(hi, lo, 11); ctx->sched[0x7] = cpu_to_be32(lo); ror56(hi, lo, 11); ctx->sched[0x8] = cpu_to_be32(lo); ror56(hi, lo, 11); ctx->sched[0x9] = cpu_to_be32(lo); ror56(hi, lo, 11); ctx->sched[0xa] = cpu_to_be32(lo); ror56(hi, lo, 11); ctx->sched[0xb] = cpu_to_be32(lo); ror56(hi, lo, 11); ctx->sched[0xc] = cpu_to_be32(lo); ror56(hi, lo, 11); ctx->sched[0xd] = cpu_to_be32(lo); ror56(hi, lo, 11); ctx->sched[0xe] = cpu_to_be32(lo); ror56(hi, lo, 11); ctx->sched[0xf] = cpu_to_be32(lo); return 0; #endif } static struct crypto_alg fcrypt_alg = { .cra_name = "fcrypt", .cra_driver_name = "fcrypt-generic", .cra_flags = CRYPTO_ALG_TYPE_CIPHER, .cra_blocksize = 8, .cra_ctxsize = sizeof(struct fcrypt_ctx), .cra_module = THIS_MODULE, .cra_u = { .cipher = { .cia_min_keysize = 8, .cia_max_keysize = 8, .cia_setkey = fcrypt_setkey, .cia_encrypt = fcrypt_encrypt, .cia_decrypt = fcrypt_decrypt } } }; static int __init fcrypt_mod_init(void) { return crypto_register_alg(&fcrypt_alg); } static void __exit fcrypt_mod_fini(void) { crypto_unregister_alg(&fcrypt_alg); } subsys_initcall(fcrypt_mod_init); module_exit(fcrypt_mod_fini); MODULE_LICENSE("Dual BSD/GPL"); MODULE_DESCRIPTION("FCrypt Cipher Algorithm"); MODULE_AUTHOR("David Howells <dhowells@redhat.com>"); MODULE_ALIAS_CRYPTO("fcrypt");
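/*
 * Illustrative sketch (not part of the kernel module): the key schedule in
 * fcrypt_setkey() above packs the 56 non-parity key bits into one word and,
 * for each of the 16 rounds, takes the low 32 bits as the round subkey and
 * then rotates the 56-bit value right by 11.  The userspace helpers below
 * mirror that logic under the assumption of a 64-bit unsigned type; the
 * names ror56_demo() and fcrypt_demo_schedule() are hypothetical, and the
 * cpu_to_be32() byte-swap done by the kernel is deliberately omitted.
 */
#include <stdint.h>

static uint64_t ror56_demo(uint64_t k, unsigned int n)
{
	/* rotate the low 56 bits of k right by n positions */
	return ((k >> n) | ((k & ((1ULL << n) - 1)) << (56 - n))) &
	       ((1ULL << 56) - 1);
}

static void fcrypt_demo_schedule(const uint8_t key[8], uint32_t sched[16])
{
	uint64_t k = 0;
	int i;

	/* drop the parity (least significant) bit of each of the 8 key bytes */
	for (i = 0; i < 8; i++)
		k = (k << 7) | (key[i] >> 1);

	for (i = 0; i < 16; i++) {
		sched[i] = (uint32_t)k;	/* low 32 bits become the round subkey */
		k = ror56_demo(k, 11);	/* rotate the 56-bit key between rounds */
	}
}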

// SPDX-License-Identifier: GPL-2.0-or-later /* client.c: NFS client sharing and management code * * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com) */ #include <linux/module.h> #include <linux/init.h> #include <linux/sched.h> #include <linux/time.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/string.h> #include <linux/stat.h> #include <linux/errno.h> #include <linux/unistd.h> #include <linux/sunrpc/addr.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/stats.h> #include <linux/sunrpc/metrics.h> #include <linux/sunrpc/xprtsock.h> #include <linux/sunrpc/xprtrdma.h> #include <linux/nfs_fs.h> #include <linux/nfs_mount.h> #include <linux/nfs4_mount.h> #include <linux/lockd/bind.h> #include <linux/seq_file.h> #include <linux/mount.h> #include <linux/vfs.h> #include <linux/inet.h> #include <linux/in6.h> #include <linux/slab.h> #include <linux/idr.h> #include <net/ipv6.h> #include <linux/nfs_xdr.h> #include <linux/sunrpc/bc_xprt.h> #include <linux/nsproxy.h> #include <linux/pid_namespace.h> #include "nfs4_fs.h" #include "callback.h" #include "delegation.h" #include "iostat.h" #include "internal.h" #include "fscache.h" #include "pnfs.h" #include "nfs.h" #include "netns.h" #include "sysfs.h" #include "nfs42.h" #define NFSDBG_FACILITY NFSDBG_CLIENT static DECLARE_WAIT_QUEUE_HEAD(nfs_client_active_wq); static DEFINE_RWLOCK(nfs_version_lock); static struct nfs_subversion *nfs_version_mods[5] = { [2] = NULL, [3] = NULL, [4] = NULL, }; /* * RPC cruft for NFS */ static const struct rpc_version *nfs_version[5] = { [2] = NULL, [3] = NULL, [4] = NULL, }; const struct rpc_program nfs_program = { .name = "nfs", .number = NFS_PROGRAM, .nrvers = ARRAY_SIZE(nfs_version), .version = nfs_version, .pipe_dir_name = NFS_PIPE_DIRNAME, }; static struct nfs_subversion *__find_nfs_version(unsigned int version) { struct nfs_subversion *nfs; read_lock(&nfs_version_lock); nfs = nfs_version_mods[version]; read_unlock(&nfs_version_lock); return nfs; } struct nfs_subversion *find_nfs_version(unsigned int version) { struct nfs_subversion *nfs = __find_nfs_version(version); if (!nfs && request_module("nfsv%d", version) == 0) nfs = __find_nfs_version(version); if (!nfs) return ERR_PTR(-EPROTONOSUPPORT); if (!get_nfs_version(nfs)) return ERR_PTR(-EAGAIN); return nfs; } int get_nfs_version(struct nfs_subversion *nfs) { return try_module_get(nfs->owner); } EXPORT_SYMBOL_GPL(get_nfs_version); void put_nfs_version(struct nfs_subversion *nfs) { module_put(nfs->owner); } void register_nfs_version(struct nfs_subversion *nfs) { write_lock(&nfs_version_lock); nfs_version_mods[nfs->rpc_ops->version] = nfs; nfs_version[nfs->rpc_ops->version] = nfs->rpc_vers; write_unlock(&nfs_version_lock); } EXPORT_SYMBOL_GPL(register_nfs_version); void unregister_nfs_version(struct nfs_subversion *nfs) { write_lock(&nfs_version_lock); nfs_version[nfs->rpc_ops->version] = NULL; nfs_version_mods[nfs->rpc_ops->version] = NULL; write_unlock(&nfs_version_lock); } EXPORT_SYMBOL_GPL(unregister_nfs_version); /* * Allocate a shared client record * * Since these are allocated/deallocated very rarely, we don't * bother putting them in a slab cache... 
*/ struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init) { struct nfs_client *clp; int err = -ENOMEM; if ((clp = kzalloc(sizeof(*clp), GFP_KERNEL)) == NULL) goto error_0; clp->cl_minorversion = cl_init->minorversion; clp->cl_nfs_mod = cl_init->nfs_mod; if (!get_nfs_version(clp->cl_nfs_mod)) goto error_dealloc; clp->rpc_ops = clp->cl_nfs_mod->rpc_ops; refcount_set(&clp->cl_count, 1); clp->cl_cons_state = NFS_CS_INITING; memcpy(&clp->cl_addr, cl_init->addr, cl_init->addrlen); clp->cl_addrlen = cl_init->addrlen; if (cl_init->hostname) { err = -ENOMEM; clp->cl_hostname = kstrdup(cl_init->hostname, GFP_KERNEL); if (!clp->cl_hostname) goto error_cleanup; } INIT_LIST_HEAD(&clp->cl_superblocks); clp->cl_rpcclient = ERR_PTR(-EINVAL); clp->cl_flags = cl_init->init_flags; clp->cl_proto = cl_init->proto; clp->cl_nconnect = cl_init->nconnect; clp->cl_max_connect = cl_init->max_connect ? cl_init->max_connect : 1; clp->cl_net = get_net(cl_init->net); #if IS_ENABLED(CONFIG_NFS_LOCALIO) seqlock_init(&clp->cl_boot_lock); ktime_get_real_ts64(&clp->cl_nfssvc_boot); nfs_uuid_init(&clp->cl_uuid); spin_lock_init(&clp->cl_localio_lock); #endif /* CONFIG_NFS_LOCALIO */ clp->cl_principal = "*"; clp->cl_xprtsec = cl_init->xprtsec; return clp; error_cleanup: put_nfs_version(clp->cl_nfs_mod); error_dealloc: kfree(clp); error_0: return ERR_PTR(err); } EXPORT_SYMBOL_GPL(nfs_alloc_client); #if IS_ENABLED(CONFIG_NFS_V4) static void nfs_cleanup_cb_ident_idr(struct net *net) { struct nfs_net *nn = net_generic(net, nfs_net_id); idr_destroy(&nn->cb_ident_idr); } /* nfs_client_lock held */ static void nfs_cb_idr_remove_locked(struct nfs_client *clp) { struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id); if (clp->cl_cb_ident) idr_remove(&nn->cb_ident_idr, clp->cl_cb_ident); } static void pnfs_init_server(struct nfs_server *server) { rpc_init_wait_queue(&server->roc_rpcwaitq, "pNFS ROC"); } #else static void nfs_cleanup_cb_ident_idr(struct net *net) { } static void nfs_cb_idr_remove_locked(struct nfs_client *clp) { } static void pnfs_init_server(struct nfs_server *server) { } #endif /* CONFIG_NFS_V4 */ /* * Destroy a shared client record */ void nfs_free_client(struct nfs_client *clp) { nfs_local_disable(clp); /* -EIO all pending I/O */ if (!IS_ERR(clp->cl_rpcclient)) rpc_shutdown_client(clp->cl_rpcclient); put_net(clp->cl_net); put_nfs_version(clp->cl_nfs_mod); kfree(clp->cl_hostname); kfree(clp->cl_acceptor); kfree_rcu(clp, rcu); } EXPORT_SYMBOL_GPL(nfs_free_client); /* * Release a reference to a shared client record */ void nfs_put_client(struct nfs_client *clp) { struct nfs_net *nn; if (!clp) return; nn = net_generic(clp->cl_net, nfs_net_id); if (refcount_dec_and_lock(&clp->cl_count, &nn->nfs_client_lock)) { list_del(&clp->cl_share_link); nfs_cb_idr_remove_locked(clp); spin_unlock(&nn->nfs_client_lock); WARN_ON_ONCE(!list_empty(&clp->cl_superblocks)); clp->rpc_ops->free_client(clp); } } EXPORT_SYMBOL_GPL(nfs_put_client); /* * Find an nfs_client on the list that matches the initialisation data * that is supplied. 
*/ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *data) { struct nfs_client *clp; const struct sockaddr *sap = (struct sockaddr *)data->addr; struct nfs_net *nn = net_generic(data->net, nfs_net_id); int error; again: list_for_each_entry(clp, &nn->nfs_client_list, cl_share_link) { const struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr; /* Don't match clients that failed to initialise properly */ if (clp->cl_cons_state < 0) continue; /* If a client is still initializing then we need to wait */ if (clp->cl_cons_state > NFS_CS_READY) { refcount_inc(&clp->cl_count); spin_unlock(&nn->nfs_client_lock); error = nfs_wait_client_init_complete(clp); nfs_put_client(clp); spin_lock(&nn->nfs_client_lock); if (error < 0) return ERR_PTR(error); goto again; } /* Different NFS versions cannot share the same nfs_client */ if (clp->rpc_ops != data->nfs_mod->rpc_ops) continue; if (clp->cl_proto != data->proto) continue; /* Match nfsv4 minorversion */ if (clp->cl_minorversion != data->minorversion) continue; /* Match request for a dedicated DS */ if (test_bit(NFS_CS_DS, &data->init_flags) != test_bit(NFS_CS_DS, &clp->cl_flags)) continue; /* Match the full socket address */ if (!rpc_cmp_addr_port(sap, clap)) /* Match all xprt_switch full socket addresses */ if (IS_ERR(clp->cl_rpcclient) || !rpc_clnt_xprt_switch_has_addr(clp->cl_rpcclient, sap)) continue; /* Match the xprt security policy */ if (clp->cl_xprtsec.policy != data->xprtsec.policy) continue; refcount_inc(&clp->cl_count); return clp; } return NULL; } /* * Return true if @clp is done initializing, false if still working on it. * * Use nfs_client_init_status to check if it was successful. */ bool nfs_client_init_is_complete(const struct nfs_client *clp) { return clp->cl_cons_state <= NFS_CS_READY; } EXPORT_SYMBOL_GPL(nfs_client_init_is_complete); /* * Return 0 if @clp was successfully initialized, -errno otherwise. * * This must be called *after* nfs_client_init_is_complete() returns true, * otherwise it will pop WARN_ON_ONCE and return -EINVAL */ int nfs_client_init_status(const struct nfs_client *clp) { /* called without checking nfs_client_init_is_complete */ if (clp->cl_cons_state > NFS_CS_READY) { WARN_ON_ONCE(1); return -EINVAL; } return clp->cl_cons_state; } EXPORT_SYMBOL_GPL(nfs_client_init_status); int nfs_wait_client_init_complete(const struct nfs_client *clp) { return wait_event_killable(nfs_client_active_wq, nfs_client_init_is_complete(clp)); } EXPORT_SYMBOL_GPL(nfs_wait_client_init_complete); /* * Found an existing client. Make sure it's ready before returning. 
*/ static struct nfs_client * nfs_found_client(const struct nfs_client_initdata *cl_init, struct nfs_client *clp) { int error; error = nfs_wait_client_init_complete(clp); if (error < 0) { nfs_put_client(clp); return ERR_PTR(-ERESTARTSYS); } if (clp->cl_cons_state < NFS_CS_READY) { error = clp->cl_cons_state; nfs_put_client(clp); return ERR_PTR(error); } smp_rmb(); return clp; } /* * Look up a client by IP address and protocol version * - creates a new record if one doesn't yet exist */ struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_init) { struct nfs_client *clp, *new = NULL; struct nfs_net *nn = net_generic(cl_init->net, nfs_net_id); const struct nfs_rpc_ops *rpc_ops = cl_init->nfs_mod->rpc_ops; if (cl_init->hostname == NULL) { WARN_ON(1); return ERR_PTR(-EINVAL); } /* see if the client already exists */ do { spin_lock(&nn->nfs_client_lock); clp = nfs_match_client(cl_init); if (clp) { spin_unlock(&nn->nfs_client_lock); if (new) new->rpc_ops->free_client(new); if (IS_ERR(clp)) return clp; return nfs_found_client(cl_init, clp); } if (new) { list_add_tail(&new->cl_share_link, &nn->nfs_client_list); spin_unlock(&nn->nfs_client_lock); new = rpc_ops->init_client(new, cl_init); if (!IS_ERR(new)) nfs_local_probe(new); return new; } spin_unlock(&nn->nfs_client_lock); new = rpc_ops->alloc_client(cl_init); } while (!IS_ERR(new)); return new; } EXPORT_SYMBOL_GPL(nfs_get_client); /* * Mark a server as ready or failed */ void nfs_mark_client_ready(struct nfs_client *clp, int state) { smp_wmb(); clp->cl_cons_state = state; wake_up_all(&nfs_client_active_wq); } EXPORT_SYMBOL_GPL(nfs_mark_client_ready); /* * Initialise the timeout values for a connection */ void nfs_init_timeout_values(struct rpc_timeout *to, int proto, int timeo, int retrans) { to->to_initval = timeo * HZ / 10; to->to_retries = retrans; switch (proto) { case XPRT_TRANSPORT_TCP: case XPRT_TRANSPORT_TCP_TLS: case XPRT_TRANSPORT_RDMA: if (retrans == NFS_UNSPEC_RETRANS) to->to_retries = NFS_DEF_TCP_RETRANS; if (timeo == NFS_UNSPEC_TIMEO || to->to_initval == 0) to->to_initval = NFS_DEF_TCP_TIMEO * HZ / 10; if (to->to_initval > NFS_MAX_TCP_TIMEOUT) to->to_initval = NFS_MAX_TCP_TIMEOUT; to->to_increment = to->to_initval; to->to_maxval = to->to_initval + (to->to_increment * to->to_retries); if (to->to_maxval > NFS_MAX_TCP_TIMEOUT) to->to_maxval = NFS_MAX_TCP_TIMEOUT; if (to->to_maxval < to->to_initval) to->to_maxval = to->to_initval; to->to_exponential = 0; break; case XPRT_TRANSPORT_UDP: if (retrans == NFS_UNSPEC_RETRANS) to->to_retries = NFS_DEF_UDP_RETRANS; if (timeo == NFS_UNSPEC_TIMEO || to->to_initval == 0) to->to_initval = NFS_DEF_UDP_TIMEO * HZ / 10; if (to->to_initval > NFS_MAX_UDP_TIMEOUT) to->to_initval = NFS_MAX_UDP_TIMEOUT; to->to_maxval = NFS_MAX_UDP_TIMEOUT; to->to_exponential = 1; break; default: BUG(); } } EXPORT_SYMBOL_GPL(nfs_init_timeout_values); /* * Create an RPC client handle */ int nfs_create_rpc_client(struct nfs_client *clp, const struct nfs_client_initdata *cl_init, rpc_authflavor_t flavor) { struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id); struct rpc_clnt *clnt = NULL; struct rpc_create_args args = { .net = clp->cl_net, .protocol = clp->cl_proto, .nconnect = clp->cl_nconnect, .address = (struct sockaddr *)&clp->cl_addr, .addrsize = clp->cl_addrlen, .timeout = cl_init->timeparms, .servername = clp->cl_hostname, .nodename = cl_init->nodename, .program = &nfs_program, .stats = &nn->rpcstats, .version = clp->rpc_ops->version, .authflavor = flavor, .cred = cl_init->cred, .xprtsec = 
cl_init->xprtsec, .connect_timeout = cl_init->connect_timeout, .reconnect_timeout = cl_init->reconnect_timeout, }; if (test_bit(NFS_CS_DISCRTRY, &clp->cl_flags)) args.flags |= RPC_CLNT_CREATE_DISCRTRY; if (test_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags)) args.flags |= RPC_CLNT_CREATE_NO_RETRANS_TIMEOUT; if (test_bit(NFS_CS_NORESVPORT, &clp->cl_flags)) args.flags |= RPC_CLNT_CREATE_NONPRIVPORT; if (test_bit(NFS_CS_INFINITE_SLOTS, &clp->cl_flags)) args.flags |= RPC_CLNT_CREATE_INFINITE_SLOTS; if (test_bit(NFS_CS_NOPING, &clp->cl_flags)) args.flags |= RPC_CLNT_CREATE_NOPING; if (test_bit(NFS_CS_REUSEPORT, &clp->cl_flags)) args.flags |= RPC_CLNT_CREATE_REUSEPORT; if (!IS_ERR(clp->cl_rpcclient)) return 0; clnt = rpc_create(&args); if (IS_ERR(clnt)) { dprintk("%s: cannot create RPC client. Error = %ld\n", __func__, PTR_ERR(clnt)); return PTR_ERR(clnt); } clnt->cl_principal = clp->cl_principal; clp->cl_rpcclient = clnt; clnt->cl_max_connect = clp->cl_max_connect; return 0; } EXPORT_SYMBOL_GPL(nfs_create_rpc_client); /* * Version 2 or 3 client destruction */ static void nfs_destroy_server(struct nfs_server *server) { if (server->nlm_host) nlmclnt_done(server->nlm_host); } /* * Version 2 or 3 lockd setup */ static int nfs_start_lockd(struct nfs_server *server) { struct nlm_host *host; struct nfs_client *clp = server->nfs_client; struct nlmclnt_initdata nlm_init = { .hostname = clp->cl_hostname, .address = (struct sockaddr *)&clp->cl_addr, .addrlen = clp->cl_addrlen, .nfs_version = clp->rpc_ops->version, .noresvport = server->flags & NFS_MOUNT_NORESVPORT ? 1 : 0, .net = clp->cl_net, .nlmclnt_ops = clp->cl_nfs_mod->rpc_ops->nlmclnt_ops, .cred = server->cred, }; if (nlm_init.nfs_version > 3) return 0; if ((server->flags & NFS_MOUNT_LOCAL_FLOCK) && (server->flags & NFS_MOUNT_LOCAL_FCNTL)) return 0; switch (clp->cl_proto) { default: nlm_init.protocol = IPPROTO_TCP; break; #ifndef CONFIG_NFS_DISABLE_UDP_SUPPORT case XPRT_TRANSPORT_UDP: nlm_init.protocol = IPPROTO_UDP; #endif } host = nlmclnt_init(&nlm_init); if (IS_ERR(host)) return PTR_ERR(host); server->nlm_host = host; server->destroy = nfs_destroy_server; nfs_sysfs_link_rpc_client(server, nlmclnt_rpc_clnt(host), NULL); return 0; } /* * Create a general RPC client */ int nfs_init_server_rpcclient(struct nfs_server *server, const struct rpc_timeout *timeo, rpc_authflavor_t pseudoflavour) { struct nfs_client *clp = server->nfs_client; server->client = rpc_clone_client_set_auth(clp->cl_rpcclient, pseudoflavour); if (IS_ERR(server->client)) { dprintk("%s: couldn't create rpc_client!\n", __func__); return PTR_ERR(server->client); } memcpy(&server->client->cl_timeout_default, timeo, sizeof(server->client->cl_timeout_default)); server->client->cl_timeout = &server->client->cl_timeout_default; server->client->cl_softrtry = 0; if (server->flags & NFS_MOUNT_SOFTERR) server->client->cl_softerr = 1; if (server->flags & NFS_MOUNT_SOFT) server->client->cl_softrtry = 1; nfs_sysfs_link_rpc_client(server, server->client, NULL); return 0; } EXPORT_SYMBOL_GPL(nfs_init_server_rpcclient); /** * nfs_init_client - Initialise an NFS2 or NFS3 client * * @clp: nfs_client to initialise * @cl_init: Initialisation parameters * * Returns pointer to an NFS client, or an ERR_PTR value. 
*/ struct nfs_client *nfs_init_client(struct nfs_client *clp, const struct nfs_client_initdata *cl_init) { int error; /* the client is already initialised */ if (clp->cl_cons_state == NFS_CS_READY) return clp; /* * Create a client RPC handle for doing FSSTAT with UNIX auth only * - RFC 2623, sec 2.3.2 */ error = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_UNIX); nfs_mark_client_ready(clp, error == 0 ? NFS_CS_READY : error); if (error < 0) { nfs_put_client(clp); clp = ERR_PTR(error); } return clp; } EXPORT_SYMBOL_GPL(nfs_init_client); /* * Create a version 2 or 3 client */ static int nfs_init_server(struct nfs_server *server, const struct fs_context *fc) { const struct nfs_fs_context *ctx = nfs_fc2context(fc); struct rpc_timeout timeparms; struct nfs_client_initdata cl_init = { .hostname = ctx->nfs_server.hostname, .addr = &ctx->nfs_server._address, .addrlen = ctx->nfs_server.addrlen, .nfs_mod = ctx->nfs_mod, .proto = ctx->nfs_server.protocol, .net = fc->net_ns, .timeparms = &timeparms, .cred = server->cred, .nconnect = ctx->nfs_server.nconnect, .init_flags = (1UL << NFS_CS_REUSEPORT), .xprtsec = ctx->xprtsec, }; struct nfs_client *clp; int error; nfs_init_timeout_values(&timeparms, ctx->nfs_server.protocol, ctx->timeo, ctx->retrans); if (ctx->flags & NFS_MOUNT_NORESVPORT) set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); /* Allocate or find a client reference we can use */ clp = nfs_get_client(&cl_init); if (IS_ERR(clp)) return PTR_ERR(clp); server->nfs_client = clp; nfs_sysfs_add_server(server); nfs_sysfs_link_rpc_client(server, clp->cl_rpcclient, "_state"); /* Initialise the client representation from the mount data */ server->flags = ctx->flags; server->options = ctx->options; server->caps |= NFS_CAP_HARDLINKS | NFS_CAP_SYMLINKS; switch (clp->rpc_ops->version) { case 2: server->fattr_valid = NFS_ATTR_FATTR_V2; break; case 3: server->fattr_valid = NFS_ATTR_FATTR_V3; break; default: server->fattr_valid = NFS_ATTR_FATTR_V4; } if (ctx->rsize) server->rsize = nfs_io_size(ctx->rsize, clp->cl_proto); if (ctx->wsize) server->wsize = nfs_io_size(ctx->wsize, clp->cl_proto); server->acregmin = ctx->acregmin * HZ; server->acregmax = ctx->acregmax * HZ; server->acdirmin = ctx->acdirmin * HZ; server->acdirmax = ctx->acdirmax * HZ; /* Start lockd here, before we might error out */ error = nfs_start_lockd(server); if (error < 0) goto error; server->port = ctx->nfs_server.port; server->auth_info = ctx->auth_info; error = nfs_init_server_rpcclient(server, &timeparms, ctx->selected_flavor); if (error < 0) goto error; /* Preserve the values of mount_server-related mount options */ if (ctx->mount_server.addrlen) { memcpy(&server->mountd_address, &ctx->mount_server.address, ctx->mount_server.addrlen); server->mountd_addrlen = ctx->mount_server.addrlen; } server->mountd_version = ctx->mount_server.version; server->mountd_port = ctx->mount_server.port; server->mountd_protocol = ctx->mount_server.protocol; server->namelen = ctx->namlen; return 0; error: server->nfs_client = NULL; nfs_put_client(clp); return error; } /* * Load up the server record from information gained in an fsinfo record */ static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *fsinfo) { struct nfs_client *clp = server->nfs_client; unsigned long max_rpc_payload, raw_max_rpc_payload; /* Work out a lot of parameters */ if (server->rsize == 0) server->rsize = nfs_io_size(fsinfo->rtpref, clp->cl_proto); if (server->wsize == 0) server->wsize = nfs_io_size(fsinfo->wtpref, clp->cl_proto); if (fsinfo->rtmax >= 512 && 
server->rsize > fsinfo->rtmax) server->rsize = nfs_io_size(fsinfo->rtmax, clp->cl_proto); if (fsinfo->wtmax >= 512 && server->wsize > fsinfo->wtmax) server->wsize = nfs_io_size(fsinfo->wtmax, clp->cl_proto); raw_max_rpc_payload = rpc_max_payload(server->client); max_rpc_payload = nfs_block_size(raw_max_rpc_payload, NULL); if (server->rsize > max_rpc_payload) server->rsize = max_rpc_payload; if (server->rsize > NFS_MAX_FILE_IO_SIZE) server->rsize = NFS_MAX_FILE_IO_SIZE; server->rpages = (server->rsize + PAGE_SIZE - 1) >> PAGE_SHIFT; if (server->wsize > max_rpc_payload) server->wsize = max_rpc_payload; if (server->wsize > NFS_MAX_FILE_IO_SIZE) server->wsize = NFS_MAX_FILE_IO_SIZE; server->wpages = (server->wsize + PAGE_SIZE - 1) >> PAGE_SHIFT; server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL); server->dtsize = nfs_block_size(fsinfo->dtpref, NULL); if (server->dtsize > NFS_MAX_FILE_IO_SIZE) server->dtsize = NFS_MAX_FILE_IO_SIZE; if (server->dtsize > server->rsize) server->dtsize = server->rsize; if (server->flags & NFS_MOUNT_NOAC) { server->acregmin = server->acregmax = 0; server->acdirmin = server->acdirmax = 0; } server->maxfilesize = fsinfo->maxfilesize; server->time_delta = fsinfo->time_delta; server->change_attr_type = fsinfo->change_attr_type; server->clone_blksize = fsinfo->clone_blksize; /* We're airborne Set socket buffersize */ rpc_setbufsize(server->client, server->wsize + 100, server->rsize + 100); #ifdef CONFIG_NFS_V4_2 /* * Defaults until limited by the session parameters. */ server->gxasize = min_t(unsigned int, raw_max_rpc_payload, XATTR_SIZE_MAX); server->sxasize = min_t(unsigned int, raw_max_rpc_payload, XATTR_SIZE_MAX); server->lxasize = min_t(unsigned int, raw_max_rpc_payload, nfs42_listxattr_xdrsize(XATTR_LIST_MAX)); if (fsinfo->xattr_support) server->caps |= NFS_CAP_XATTR; #endif } /* * Probe filesystem information, including the FSID on v2/v3 */ static int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs_fattr *fattr) { struct nfs_fsinfo fsinfo; struct nfs_client *clp = server->nfs_client; int error; if (clp->rpc_ops->set_capabilities != NULL) { error = clp->rpc_ops->set_capabilities(server, mntfh); if (error < 0) return error; } fsinfo.fattr = fattr; fsinfo.nlayouttypes = 0; memset(fsinfo.layouttype, 0, sizeof(fsinfo.layouttype)); error = clp->rpc_ops->fsinfo(server, mntfh, &fsinfo); if (error < 0) return error; nfs_server_set_fsinfo(server, &fsinfo); /* Get some general file system info */ if (server->namelen == 0) { struct nfs_pathconf pathinfo; pathinfo.fattr = fattr; nfs_fattr_init(fattr); if (clp->rpc_ops->pathconf(server, mntfh, &pathinfo) >= 0) server->namelen = pathinfo.max_namelen; } if (clp->rpc_ops->discover_trunking != NULL && (server->caps & NFS_CAP_FS_LOCATIONS && (server->flags & NFS_MOUNT_TRUNK_DISCOVERY))) { error = clp->rpc_ops->discover_trunking(server, mntfh); if (error < 0) return error; } return 0; } /* * Grab the destination's particulars, including lease expiry time. * * Returns zero if probe succeeded and retrieved FSID matches the FSID * we have cached. */ int nfs_probe_server(struct nfs_server *server, struct nfs_fh *mntfh) { struct nfs_fattr *fattr; int error; fattr = nfs_alloc_fattr(); if (fattr == NULL) return -ENOMEM; /* Sanity: the probe won't work if the destination server * does not recognize the migrated FH. 
*/ error = nfs_probe_fsinfo(server, mntfh, fattr); nfs_free_fattr(fattr); return error; } EXPORT_SYMBOL_GPL(nfs_probe_server); /* * Copy useful information when duplicating a server record */ void nfs_server_copy_userdata(struct nfs_server *target, struct nfs_server *source) { target->flags = source->flags; target->rsize = source->rsize; target->wsize = source->wsize; target->acregmin = source->acregmin; target->acregmax = source->acregmax; target->acdirmin = source->acdirmin; target->acdirmax = source->acdirmax; target->caps = source->caps; target->options = source->options; target->auth_info = source->auth_info; target->port = source->port; } EXPORT_SYMBOL_GPL(nfs_server_copy_userdata); void nfs_server_insert_lists(struct nfs_server *server) { struct nfs_client *clp = server->nfs_client; struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id); spin_lock(&nn->nfs_client_lock); list_add_tail_rcu(&server->client_link, &clp->cl_superblocks); list_add_tail(&server->master_link, &nn->nfs_volume_list); clear_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state); spin_unlock(&nn->nfs_client_lock); } EXPORT_SYMBOL_GPL(nfs_server_insert_lists); void nfs_server_remove_lists(struct nfs_server *server) { struct nfs_client *clp = server->nfs_client; struct nfs_net *nn; if (clp == NULL) return; nn = net_generic(clp->cl_net, nfs_net_id); spin_lock(&nn->nfs_client_lock); list_del_rcu(&server->client_link); if (list_empty(&clp->cl_superblocks)) set_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state); list_del(&server->master_link); spin_unlock(&nn->nfs_client_lock); synchronize_rcu(); } EXPORT_SYMBOL_GPL(nfs_server_remove_lists); static DEFINE_IDA(s_sysfs_ids); /* * Allocate and initialise a server record */ struct nfs_server *nfs_alloc_server(void) { struct nfs_server *server; server = kzalloc(sizeof(struct nfs_server), GFP_KERNEL); if (!server) return NULL; server->s_sysfs_id = ida_alloc(&s_sysfs_ids, GFP_KERNEL); if (server->s_sysfs_id < 0) { kfree(server); return NULL; } server->client = server->client_acl = ERR_PTR(-EINVAL); /* Zero out the NFS state stuff */ INIT_LIST_HEAD(&server->client_link); INIT_LIST_HEAD(&server->master_link); INIT_LIST_HEAD(&server->delegations); INIT_LIST_HEAD(&server->layouts); INIT_LIST_HEAD(&server->state_owners_lru); INIT_LIST_HEAD(&server->ss_copies); INIT_LIST_HEAD(&server->ss_src_copies); atomic_set(&server->active, 0); server->io_stats = nfs_alloc_iostats(); if (!server->io_stats) { kfree(server); return NULL; } server->change_attr_type = NFS4_CHANGE_TYPE_IS_UNDEFINED; init_waitqueue_head(&server->write_congestion_wait); atomic_long_set(&server->writeback, 0); atomic64_set(&server->owner_ctr, 0); pnfs_init_server(server); rpc_init_wait_queue(&server->uoc_rpcwaitq, "NFS UOC"); return server; } EXPORT_SYMBOL_GPL(nfs_alloc_server); static void delayed_free(struct rcu_head *p) { struct nfs_server *server = container_of(p, struct nfs_server, rcu); nfs_free_iostats(server->io_stats); kfree(server); } /* * Free up a server record */ void nfs_free_server(struct nfs_server *server) { nfs_server_remove_lists(server); if (server->destroy != NULL) server->destroy(server); if (!IS_ERR(server->client_acl)) rpc_shutdown_client(server->client_acl); if (!IS_ERR(server->client)) rpc_shutdown_client(server->client); nfs_put_client(server->nfs_client); if (server->kobj.state_initialized) { nfs_sysfs_remove_server(server); kobject_put(&server->kobj); } ida_free(&s_sysfs_ids, server->s_sysfs_id); put_cred(server->cred); nfs_release_automount_timer(); call_rcu(&server->rcu, delayed_free); } 
EXPORT_SYMBOL_GPL(nfs_free_server); /* * Create a version 2 or 3 volume record * - keyed on server and FSID */ struct nfs_server *nfs_create_server(struct fs_context *fc) { struct nfs_fs_context *ctx = nfs_fc2context(fc); struct nfs_server *server; struct nfs_fattr *fattr; int error; server = nfs_alloc_server(); if (!server) return ERR_PTR(-ENOMEM); server->cred = get_cred(fc->cred); error = -ENOMEM; fattr = nfs_alloc_fattr(); if (fattr == NULL) goto error; /* Get a client representation */ error = nfs_init_server(server, fc); if (error < 0) goto error; /* Probe the root fh to retrieve its FSID */ error = nfs_probe_fsinfo(server, ctx->mntfh, fattr); if (error < 0) goto error; if (server->nfs_client->rpc_ops->version == 3) { if (server->namelen == 0 || server->namelen > NFS3_MAXNAMLEN) server->namelen = NFS3_MAXNAMLEN; if (!(ctx->flags & NFS_MOUNT_NORDIRPLUS)) server->caps |= NFS_CAP_READDIRPLUS; } else { if (server->namelen == 0 || server->namelen > NFS2_MAXNAMLEN) server->namelen = NFS2_MAXNAMLEN; } if (!(fattr->valid & NFS_ATTR_FATTR)) { error = ctx->nfs_mod->rpc_ops->getattr(server, ctx->mntfh, fattr, NULL); if (error < 0) { dprintk("nfs_create_server: getattr error = %d\n", -error); goto error; } } memcpy(&server->fsid, &fattr->fsid, sizeof(server->fsid)); dprintk("Server FSID: %llx:%llx\n", (unsigned long long) server->fsid.major, (unsigned long long) server->fsid.minor); nfs_server_insert_lists(server); server->mount_time = jiffies; nfs_free_fattr(fattr); return server; error: nfs_free_fattr(fattr); nfs_free_server(server); return ERR_PTR(error); } EXPORT_SYMBOL_GPL(nfs_create_server); /* * Clone an NFS2, NFS3 or NFS4 server record */ struct nfs_server *nfs_clone_server(struct nfs_server *source, struct nfs_fh *fh, struct nfs_fattr *fattr, rpc_authflavor_t flavor) { struct nfs_server *server; int error; server = nfs_alloc_server(); if (!server) return ERR_PTR(-ENOMEM); server->cred = get_cred(source->cred); /* Copy data from the source */ server->nfs_client = source->nfs_client; server->destroy = source->destroy; refcount_inc(&server->nfs_client->cl_count); nfs_server_copy_userdata(server, source); server->fsid = fattr->fsid; nfs_sysfs_add_server(server); nfs_sysfs_link_rpc_client(server, server->nfs_client->cl_rpcclient, "_state"); error = nfs_init_server_rpcclient(server, source->client->cl_timeout, flavor); if (error < 0) goto out_free_server; /* probe the filesystem info for this server filesystem */ error = nfs_probe_server(server, fh); if (error < 0) goto out_free_server; if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN) server->namelen = NFS4_MAXNAMLEN; error = nfs_start_lockd(server); if (error < 0) goto out_free_server; nfs_server_insert_lists(server); server->mount_time = jiffies; return server; out_free_server: nfs_free_server(server); return ERR_PTR(error); } EXPORT_SYMBOL_GPL(nfs_clone_server); void nfs_clients_init(struct net *net) { struct nfs_net *nn = net_generic(net, nfs_net_id); INIT_LIST_HEAD(&nn->nfs_client_list); INIT_LIST_HEAD(&nn->nfs_volume_list); #if IS_ENABLED(CONFIG_NFS_V4) idr_init(&nn->cb_ident_idr); #endif spin_lock_init(&nn->nfs_client_lock); nn->boot_time = ktime_get_real(); memset(&nn->rpcstats, 0, sizeof(nn->rpcstats)); nn->rpcstats.program = &nfs_program; nfs_netns_sysfs_setup(nn, net); } void nfs_clients_exit(struct net *net) { struct nfs_net *nn = net_generic(net, nfs_net_id); nfs_netns_sysfs_destroy(nn); nfs_cleanup_cb_ident_idr(net); WARN_ON_ONCE(!list_empty(&nn->nfs_client_list)); WARN_ON_ONCE(!list_empty(&nn->nfs_volume_list)); } 
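/*
 * Illustrative sketch (userspace, hypothetical names): the TCP branch of
 * nfs_init_timeout_values() above treats the mount's "timeo" option as
 * tenths of a second, converts it to jiffies, and derives a linear backoff
 * where each retransmission adds one more initval-sized step, clamped to
 * the transport ceiling.  HZ_DEMO and MAX_TCP_TIMEOUT_DEMO stand in for the
 * kernel's HZ and NFS_MAX_TCP_TIMEOUT and are assumed values only.
 */
#include <stdio.h>

#define HZ_DEMO			100UL			/* assumed tick rate */
#define MAX_TCP_TIMEOUT_DEMO	(600UL * HZ_DEMO)	/* assumed ceiling */

static void demo_tcp_timeouts(unsigned long timeo, unsigned long retrans,
			      unsigned long *initval, unsigned long *maxval)
{
	*initval = timeo * HZ_DEMO / 10;	/* timeo is in 0.1 s units */
	if (*initval > MAX_TCP_TIMEOUT_DEMO)
		*initval = MAX_TCP_TIMEOUT_DEMO;
	/* each retransmission adds another initval-sized increment */
	*maxval = *initval + *initval * retrans;
	if (*maxval > MAX_TCP_TIMEOUT_DEMO)
		*maxval = MAX_TCP_TIMEOUT_DEMO;
	if (*maxval < *initval)
		*maxval = *initval;
}

int main(void)
{
	unsigned long initval, maxval;

	/* e.g. "timeo=600,retrans=2": 6000 jiffies (60 s) initial, 18000 (180 s) max */
	demo_tcp_timeouts(600, 2, &initval, &maxval);
	printf("initval=%lu jiffies, maxval=%lu jiffies\n", initval, maxval);
	return 0;
}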
#ifdef CONFIG_PROC_FS static void *nfs_server_list_start(struct seq_file *p, loff_t *pos); static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos); static void nfs_server_list_stop(struct seq_file *p, void *v); static int nfs_server_list_show(struct seq_file *m, void *v); static const struct seq_operations nfs_server_list_ops = { .start = nfs_server_list_start, .next = nfs_server_list_next, .stop = nfs_server_list_stop, .show = nfs_server_list_show, }; static void *nfs_volume_list_start(struct seq_file *p, loff_t *pos); static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos); static void nfs_volume_list_stop(struct seq_file *p, void *v); static int nfs_volume_list_show(struct seq_file *m, void *v); static const struct seq_operations nfs_volume_list_ops = { .start = nfs_volume_list_start, .next = nfs_volume_list_next, .stop = nfs_volume_list_stop, .show = nfs_volume_list_show, }; /* * set up the iterator to start reading from the server list and return the first item */ static void *nfs_server_list_start(struct seq_file *m, loff_t *_pos) __acquires(&nn->nfs_client_lock) { struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id); /* lock the list against modification */ spin_lock(&nn->nfs_client_lock); return seq_list_start_head(&nn->nfs_client_list, *_pos); } /* * move to next server */ static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos) { struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id); return seq_list_next(v, &nn->nfs_client_list, pos); } /* * clean up after reading from the transports list */ static void nfs_server_list_stop(struct seq_file *p, void *v) __releases(&nn->nfs_client_lock) { struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id); spin_unlock(&nn->nfs_client_lock); } /* * display a header line followed by a load of call lines */ static int nfs_server_list_show(struct seq_file *m, void *v) { struct nfs_client *clp; struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id); /* display header on line 1 */ if (v == &nn->nfs_client_list) { seq_puts(m, "NV SERVER PORT USE HOSTNAME\n"); return 0; } /* display one transport per line on subsequent lines */ clp = list_entry(v, struct nfs_client, cl_share_link); /* Check if the client is initialized */ if (clp->cl_cons_state != NFS_CS_READY) return 0; rcu_read_lock(); seq_printf(m, "v%u %s %s %3d %s\n", clp->rpc_ops->version, rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR), rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_PORT), refcount_read(&clp->cl_count), clp->cl_hostname); rcu_read_unlock(); return 0; } /* * set up the iterator to start reading from the volume list and return the first item */ static void *nfs_volume_list_start(struct seq_file *m, loff_t *_pos) __acquires(&nn->nfs_client_lock) { struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id); /* lock the list against modification */ spin_lock(&nn->nfs_client_lock); return seq_list_start_head(&nn->nfs_volume_list, *_pos); } /* * move to next volume */ static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos) { struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id); return seq_list_next(v, &nn->nfs_volume_list, pos); } /* * clean up after reading from the transports list */ static void nfs_volume_list_stop(struct seq_file *p, void *v) __releases(&nn->nfs_client_lock) { struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id); spin_unlock(&nn->nfs_client_lock); } /* * display a header line followed by a load of call lines */ static int 
nfs_volume_list_show(struct seq_file *m, void *v) { struct nfs_server *server; struct nfs_client *clp; char dev[13]; // 8 for 2^24, 1 for ':', 3 for 2^8, 1 for '\0' char fsid[34]; // 2 * 16 for %llx, 1 for ':', 1 for '\0' struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id); /* display header on line 1 */ if (v == &nn->nfs_volume_list) { seq_puts(m, "NV SERVER PORT DEV FSID" " FSC\n"); return 0; } /* display one transport per line on subsequent lines */ server = list_entry(v, struct nfs_server, master_link); clp = server->nfs_client; snprintf(dev, sizeof(dev), "%u:%u", MAJOR(server->s_dev), MINOR(server->s_dev)); snprintf(fsid, sizeof(fsid), "%llx:%llx", (unsigned long long) server->fsid.major, (unsigned long long) server->fsid.minor); rcu_read_lock(); seq_printf(m, "v%u %s %s %-12s %-33s %s\n", clp->rpc_ops->version, rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR), rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_PORT), dev, fsid, nfs_server_fscache_state(server)); rcu_read_unlock(); return 0; } int nfs_fs_proc_net_init(struct net *net) { struct nfs_net *nn = net_generic(net, nfs_net_id); struct proc_dir_entry *p; nn->proc_nfsfs = proc_net_mkdir(net, "nfsfs", net->proc_net); if (!nn->proc_nfsfs) goto error_0; /* a file of servers with which we're dealing */ p = proc_create_net("servers", S_IFREG|S_IRUGO, nn->proc_nfsfs, &nfs_server_list_ops, sizeof(struct seq_net_private)); if (!p) goto error_1; /* a file of volumes that we have mounted */ p = proc_create_net("volumes", S_IFREG|S_IRUGO, nn->proc_nfsfs, &nfs_volume_list_ops, sizeof(struct seq_net_private)); if (!p) goto error_1; return 0; error_1: remove_proc_subtree("nfsfs", net->proc_net); error_0: return -ENOMEM; } void nfs_fs_proc_net_exit(struct net *net) { remove_proc_subtree("nfsfs", net->proc_net); } /* * initialise the /proc/fs/nfsfs/ directory */ int __init nfs_fs_proc_init(void) { if (!proc_mkdir("fs/nfsfs", NULL)) goto error_0; /* a file of servers with which we're dealing */ if (!proc_symlink("fs/nfsfs/servers", NULL, "../../net/nfsfs/servers")) goto error_1; /* a file of volumes that we have mounted */ if (!proc_symlink("fs/nfsfs/volumes", NULL, "../../net/nfsfs/volumes")) goto error_1; return 0; error_1: remove_proc_subtree("fs/nfsfs", NULL); error_0: return -ENOMEM; } /* * clean up the /proc/fs/nfsfs/ directory */ void nfs_fs_proc_exit(void) { remove_proc_subtree("fs/nfsfs", NULL); ida_destroy(&s_sysfs_ids); } #endif /* CONFIG_PROC_FS */
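/*
 * Illustrative sketch (userspace): the seq_file code above exports
 * /proc/fs/nfsfs/servers with a header line "NV SERVER PORT USE HOSTNAME"
 * followed by one line per nfs_client, with the address and port rendered
 * in hex.  A minimal reader that skips the header might look like the
 * following; it is an assumption-level example, not an existing tool.
 */
#include <stdio.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/fs/nfsfs/servers", "r");

	if (!f) {
		perror("open /proc/fs/nfsfs/servers");
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		char vers[8], addr[64], port[16], host[128];
		int use;

		/* header starts with "NV"; data lines start with "v2"/"v3"/"v4" */
		if (sscanf(line, "%7s %63s %15s %d %127s",
			   vers, addr, port, &use, host) == 5 &&
		    vers[0] == 'v')
			printf("NFS%s client of %s (%s), refcount %d\n",
			       vers + 1, host, addr, use);
	}
	fclose(f);
	return 0;
}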
588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 
1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 
2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 
2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 
3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 
4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/namei.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

/*
 * Some corrections by tytso.
 */

/* [Feb 1997 T. Schoebel-Theuer] Complete rewrite of the pathname
 * lookup logic.
 */
/* [Feb-Apr 2000, AV] Rewrite to the new namespace architecture.
 */

#include <linux/init.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/wordpart.h>
#include <linux/fs.h>
#include <linux/filelock.h>
#include <linux/namei.h>
#include <linux/pagemap.h>
#include <linux/sched/mm.h>
#include <linux/fsnotify.h>
#include <linux/personality.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/mount.h>
#include <linux/audit.h>
#include <linux/capability.h>
#include <linux/file.h>
#include <linux/fcntl.h>
#include <linux/device_cgroup.h>
#include <linux/fs_struct.h>
#include <linux/posix_acl.h>
#include <linux/hash.h>
#include <linux/bitops.h>
#include <linux/init_task.h>
#include <linux/uaccess.h>

#include "internal.h"
#include "mount.h"

/* [Feb-1997 T.
Schoebel-Theuer]
 * Fundamental changes in the pathname lookup mechanisms (namei)
 * were necessary because of omirr.  The reason is that omirr needs
 * to know the _real_ pathname, not the user-supplied one, in case
 * of symlinks (and also when transname replacements occur).
 *
 * The new code replaces the old recursive symlink resolution with
 * an iterative one (in case of non-nested symlink chains).  It does
 * this with calls to <fs>_follow_link().
 * As a side effect, dir_namei(), _namei() and follow_link() are now
 * replaced with a single function lookup_dentry() that can handle all
 * the special cases of the former code.
 *
 * With the new dcache, the pathname is stored at each inode, at least as
 * long as the refcount of the inode is positive.  As a side effect, the
 * size of the dcache depends on the inode cache and thus is dynamic.
 *
 * [29-Apr-1998 C. Scott Ananian] Updated above description of symlink
 * resolution to correspond with current state of the code.
 *
 * Note that the symlink resolution is not *completely* iterative.
 * There is still a significant amount of tail- and mid- recursion in
 * the algorithm.  Also, note that <fs>_readlink() is not used in
 * lookup_dentry(): lookup_dentry() on the result of <fs>_readlink()
 * may return different results than <fs>_follow_link().  Many virtual
 * filesystems (including /proc) exhibit this behavior.
 */

/* [24-Feb-97 T. Schoebel-Theuer] Side effects caused by new implementation:
 * New symlink semantics: when open() is called with flags O_CREAT | O_EXCL
 * and the name already exists in the form of a symlink, try to create the new
 * name indicated by the symlink.  The old code always complained that the
 * name already exists, due to not following the symlink even if its target
 * is nonexistent.  The new semantics also affects mknod() and link() when
 * the name is a symlink pointing to a non-existent name.
 *
 * I don't know which semantics is the right one, since I have no access
 * to standards.  But I found by trial that HP-UX 9.0 has the full "new"
 * semantics implemented, while SunOS 4.1.1 and Solaris (SunOS 5.4) have the
 * "old" one.  Personally, I think the new semantics is much more logical.
 * Note that "ln old new" where "new" is a symlink pointing to a non-existing
 * file does succeed in both HP-UX and SunOS, but not in Solaris
 * and in the old Linux semantics.
 */

/* [16-Dec-97 Kevin Buhr] For security reasons, we change some symlink
 * semantics.  See the comments in "open_namei" and "do_link" below.
 *
 * [10-Sep-98 Alan Modra] Another symlink change.
 */

/* [Feb-Apr 2000 AV] Complete rewrite.  Rules for symlinks:
 *	inside the path - always follow.
 *	in the last component in creation/removal/renaming - never follow.
 *	if LOOKUP_FOLLOW passed - follow.
 *	if the pathname has trailing slashes - follow.
 *	otherwise - don't follow.
 * (applied in that order).
 *
 * [Jun 2000 AV] Inconsistent behaviour of open() when flags == O_CREAT,
 * restored for 2.4.  This is the last surviving part of the old 4.2BSD bug.
 * During 2.4 we need to fix the userland stuff depending on it -
 * hopefully we will be able to get rid of that wart in 2.5.  So far only
 * XEmacs seems to be relying on it...
 */

/*
 * [Sep 2001 AV] Single-semaphore locking scheme (kudos to David Holland)
 * implemented.  Let's see if raised priority of ->s_vfs_rename_mutex gives
 * any extra contention...
 */

/* In order to reduce some races, while at the same time doing additional
 * checking and hopefully speeding things up, we copy filenames to the
 * kernel data space before using them..
* * POSIX.1 2.4: an empty pathname is invalid (ENOENT). * PATH_MAX includes the nul terminator --RR. */ #define EMBEDDED_NAME_MAX (PATH_MAX - offsetof(struct filename, iname)) struct filename * getname_flags(const char __user *filename, int flags) { struct filename *result; char *kname; int len; result = audit_reusename(filename); if (result) return result; result = __getname(); if (unlikely(!result)) return ERR_PTR(-ENOMEM); /* * First, try to embed the struct filename inside the names_cache * allocation */ kname = (char *)result->iname; result->name = kname; len = strncpy_from_user(kname, filename, EMBEDDED_NAME_MAX); /* * Handle both empty path and copy failure in one go. */ if (unlikely(len <= 0)) { if (unlikely(len < 0)) { __putname(result); return ERR_PTR(len); } /* The empty path is special. */ if (!(flags & LOOKUP_EMPTY)) { __putname(result); return ERR_PTR(-ENOENT); } } /* * Uh-oh. We have a name that's approaching PATH_MAX. Allocate a * separate struct filename so we can dedicate the entire * names_cache allocation for the pathname, and re-do the copy from * userland. */ if (unlikely(len == EMBEDDED_NAME_MAX)) { const size_t size = offsetof(struct filename, iname[1]); kname = (char *)result; /* * size is chosen that way we to guarantee that * result->iname[0] is within the same object and that * kname can't be equal to result->iname, no matter what. */ result = kzalloc(size, GFP_KERNEL); if (unlikely(!result)) { __putname(kname); return ERR_PTR(-ENOMEM); } result->name = kname; len = strncpy_from_user(kname, filename, PATH_MAX); if (unlikely(len < 0)) { __putname(kname); kfree(result); return ERR_PTR(len); } /* The empty path is special. */ if (unlikely(!len) && !(flags & LOOKUP_EMPTY)) { __putname(kname); kfree(result); return ERR_PTR(-ENOENT); } if (unlikely(len == PATH_MAX)) { __putname(kname); kfree(result); return ERR_PTR(-ENAMETOOLONG); } } atomic_set(&result->refcnt, 1); result->uptr = filename; result->aname = NULL; audit_getname(result); return result; } struct filename *getname_uflags(const char __user *filename, int uflags) { int flags = (uflags & AT_EMPTY_PATH) ? 
LOOKUP_EMPTY : 0; return getname_flags(filename, flags); } struct filename *getname(const char __user * filename) { return getname_flags(filename, 0); } struct filename *__getname_maybe_null(const char __user *pathname) { struct filename *name; char c; /* try to save on allocations; loss on um, though */ if (get_user(c, pathname)) return ERR_PTR(-EFAULT); if (!c) return NULL; name = getname_flags(pathname, LOOKUP_EMPTY); if (!IS_ERR(name) && !(name->name[0])) { putname(name); name = NULL; } return name; } struct filename *getname_kernel(const char * filename) { struct filename *result; int len = strlen(filename) + 1; result = __getname(); if (unlikely(!result)) return ERR_PTR(-ENOMEM); if (len <= EMBEDDED_NAME_MAX) { result->name = (char *)result->iname; } else if (len <= PATH_MAX) { const size_t size = offsetof(struct filename, iname[1]); struct filename *tmp; tmp = kmalloc(size, GFP_KERNEL); if (unlikely(!tmp)) { __putname(result); return ERR_PTR(-ENOMEM); } tmp->name = (char *)result; result = tmp; } else { __putname(result); return ERR_PTR(-ENAMETOOLONG); } memcpy((char *)result->name, filename, len); result->uptr = NULL; result->aname = NULL; atomic_set(&result->refcnt, 1); audit_getname(result); return result; } EXPORT_SYMBOL(getname_kernel); void putname(struct filename *name) { if (IS_ERR_OR_NULL(name)) return; if (WARN_ON_ONCE(!atomic_read(&name->refcnt))) return; if (!atomic_dec_and_test(&name->refcnt)) return; if (name->name != name->iname) { __putname(name->name); kfree(name); } else __putname(name); } EXPORT_SYMBOL(putname); /** * check_acl - perform ACL permission checking * @idmap: idmap of the mount the inode was found from * @inode: inode to check permissions on * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...) * * This function performs the ACL permission checking. Since this function * retrieve POSIX acls it needs to know whether it is called from a blocking or * non-blocking context and thus cares about the MAY_NOT_BLOCK bit. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. */ static int check_acl(struct mnt_idmap *idmap, struct inode *inode, int mask) { #ifdef CONFIG_FS_POSIX_ACL struct posix_acl *acl; if (mask & MAY_NOT_BLOCK) { acl = get_cached_acl_rcu(inode, ACL_TYPE_ACCESS); if (!acl) return -EAGAIN; /* no ->get_inode_acl() calls in RCU mode... */ if (is_uncached_acl(acl)) return -ECHILD; return posix_acl_permission(idmap, inode, acl, mask); } acl = get_inode_acl(inode, ACL_TYPE_ACCESS); if (IS_ERR(acl)) return PTR_ERR(acl); if (acl) { int error = posix_acl_permission(idmap, inode, acl, mask); posix_acl_release(acl); return error; } #endif return -EAGAIN; } /* * Very quick optimistic "we know we have no ACL's" check. * * Note that this is purely for ACL_TYPE_ACCESS, and purely * for the "we have cached that there are no ACLs" case. * * If this returns true, we know there are no ACLs. But if * it returns false, we might still not have ACLs (it could * be the is_uncached_acl() case). 
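 *
 * [Editorial note: an illustrative, stand-alone userspace sketch is
 *  inserted below; it is not part of fs/namei.c.]
 */

/*
 * Sketch: the LOOKUP_EMPTY / AT_EMPTY_PATH special case handled by
 * getname_flags() and getname_uflags() above, seen from userspace.
 * An empty pathname is normally rejected with ENOENT; with
 * AT_EMPTY_PATH the lookup degenerates to the dirfd itself.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>

int main(void)
{
	struct stat st;
	int fd = open(".", O_RDONLY | O_DIRECTORY);

	if (fd < 0)
		return 1;

	/* Plain empty path: getname_flags() returns -ENOENT. */
	if (fstatat(fd, "", &st, 0) < 0)
		perror("fstatat(fd, \"\", 0)");

	/* AT_EMPTY_PATH maps to LOOKUP_EMPTY and succeeds. */
	if (fstatat(fd, "", &st, AT_EMPTY_PATH) == 0)
		printf("AT_EMPTY_PATH: inode %llu\n",
		       (unsigned long long)st.st_ino);
	return 0;
}

/*
 * [End of editorial sketch.  The original comment describing the
 *  no-ACL fast path resumes here:]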
*/ static inline bool no_acl_inode(struct inode *inode) { #ifdef CONFIG_FS_POSIX_ACL return likely(!READ_ONCE(inode->i_acl)); #else return true; #endif } /** * acl_permission_check - perform basic UNIX permission checking * @idmap: idmap of the mount the inode was found from * @inode: inode to check permissions on * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...) * * This function performs the basic UNIX permission checking. Since this * function may retrieve POSIX acls it needs to know whether it is called from a * blocking or non-blocking context and thus cares about the MAY_NOT_BLOCK bit. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. */ static int acl_permission_check(struct mnt_idmap *idmap, struct inode *inode, int mask) { unsigned int mode = inode->i_mode; vfsuid_t vfsuid; /* * Common cheap case: everybody has the requested * rights, and there are no ACLs to check. No need * to do any owner/group checks in that case. * * - 'mask&7' is the requested permission bit set * - multiplying by 0111 spreads them out to all of ugo * - '& ~mode' looks for missing inode permission bits * - the '!' is for "no missing permissions" * * After that, we just need to check that there are no * ACL's on the inode - do the 'IS_POSIXACL()' check last * because it will dereference the ->i_sb pointer and we * want to avoid that if at all possible. */ if (!((mask & 7) * 0111 & ~mode)) { if (no_acl_inode(inode)) return 0; if (!IS_POSIXACL(inode)) return 0; } /* Are we the owner? If so, ACL's don't matter */ vfsuid = i_uid_into_vfsuid(idmap, inode); if (likely(vfsuid_eq_kuid(vfsuid, current_fsuid()))) { mask &= 7; mode >>= 6; return (mask & ~mode) ? -EACCES : 0; } /* Do we have ACL's? */ if (IS_POSIXACL(inode) && (mode & S_IRWXG)) { int error = check_acl(idmap, inode, mask); if (error != -EAGAIN) return error; } /* Only RWX matters for group/other mode bits */ mask &= 7; /* * Are the group permissions different from * the other permissions in the bits we care * about? Need to check group ownership if so. */ if (mask & (mode ^ (mode >> 3))) { vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode); if (vfsgid_in_group_p(vfsgid)) mode >>= 3; } /* Bits in 'mode' clear that we require? */ return (mask & ~mode) ? -EACCES : 0; } /** * generic_permission - check for access rights on a Posix-like filesystem * @idmap: idmap of the mount the inode was found from * @inode: inode to check access rights for * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC, * %MAY_NOT_BLOCK ...) * * Used to check for read/write/execute permissions on a file. * We use "fsuid" for this, letting us set arbitrary permissions * for filesystem access without changing the "normal" uids which * are used for other things. * * generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk * request cannot be satisfied (eg. requires blocking or too much complexity). * It would then be called again in ref-walk mode. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. 
* On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. */ int generic_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { int ret; /* * Do the basic permission checks. */ ret = acl_permission_check(idmap, inode, mask); if (ret != -EACCES) return ret; if (S_ISDIR(inode->i_mode)) { /* DACs are overridable for directories */ if (!(mask & MAY_WRITE)) if (capable_wrt_inode_uidgid(idmap, inode, CAP_DAC_READ_SEARCH)) return 0; if (capable_wrt_inode_uidgid(idmap, inode, CAP_DAC_OVERRIDE)) return 0; return -EACCES; } /* * Searching includes executable on directories, else just read. */ mask &= MAY_READ | MAY_WRITE | MAY_EXEC; if (mask == MAY_READ) if (capable_wrt_inode_uidgid(idmap, inode, CAP_DAC_READ_SEARCH)) return 0; /* * Read/write DACs are always overridable. * Executable DACs are overridable when there is * at least one exec bit set. */ if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO)) if (capable_wrt_inode_uidgid(idmap, inode, CAP_DAC_OVERRIDE)) return 0; return -EACCES; } EXPORT_SYMBOL(generic_permission); /** * do_inode_permission - UNIX permission checking * @idmap: idmap of the mount the inode was found from * @inode: inode to check permissions on * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...) * * We _really_ want to just do "generic_permission()" without * even looking at the inode->i_op values. So we keep a cache * flag in inode->i_opflags, that says "this has not special * permission function, use the fast case". */ static inline int do_inode_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { if (unlikely(!(inode->i_opflags & IOP_FASTPERM))) { if (likely(inode->i_op->permission)) return inode->i_op->permission(idmap, inode, mask); /* This gets set once for the inode lifetime */ spin_lock(&inode->i_lock); inode->i_opflags |= IOP_FASTPERM; spin_unlock(&inode->i_lock); } return generic_permission(idmap, inode, mask); } /** * sb_permission - Check superblock-level permissions * @sb: Superblock of inode to check permission on * @inode: Inode to check permission on * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) * * Separate out file-system wide checks from inode-specific permission checks. */ static int sb_permission(struct super_block *sb, struct inode *inode, int mask) { if (unlikely(mask & MAY_WRITE)) { umode_t mode = inode->i_mode; /* Nobody gets write access to a read-only fs. */ if (sb_rdonly(sb) && (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) return -EROFS; } return 0; } /** * inode_permission - Check for access rights to a given inode * @idmap: idmap of the mount the inode was found from * @inode: Inode to check permission on * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) * * Check for read/write/execute permissions on an inode. We use fs[ug]id for * this, letting us set arbitrary permissions for filesystem access without * changing the "normal" UIDs which are used for other things. * * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask. */ int inode_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { int retval; retval = sb_permission(inode->i_sb, inode, mask); if (retval) return retval; if (unlikely(mask & MAY_WRITE)) { /* * Nobody gets write access to an immutable file. */ if (IS_IMMUTABLE(inode)) return -EPERM; /* * Updating mtime will likely cause i_uid and i_gid to be * written back improperly if their true value is unknown * to the vfs. 
*/ if (HAS_UNMAPPED_ID(idmap, inode)) return -EACCES; } retval = do_inode_permission(idmap, inode, mask); if (retval) return retval; retval = devcgroup_inode_permission(inode, mask); if (retval) return retval; return security_inode_permission(inode, mask); } EXPORT_SYMBOL(inode_permission); /** * path_get - get a reference to a path * @path: path to get the reference to * * Given a path increment the reference count to the dentry and the vfsmount. */ void path_get(const struct path *path) { mntget(path->mnt); dget(path->dentry); } EXPORT_SYMBOL(path_get); /** * path_put - put a reference to a path * @path: path to put the reference to * * Given a path decrement the reference count to the dentry and the vfsmount. */ void path_put(const struct path *path) { dput(path->dentry); mntput(path->mnt); } EXPORT_SYMBOL(path_put); #define EMBEDDED_LEVELS 2 struct nameidata { struct path path; struct qstr last; struct path root; struct inode *inode; /* path.dentry.d_inode */ unsigned int flags, state; unsigned seq, next_seq, m_seq, r_seq; int last_type; unsigned depth; int total_link_count; struct saved { struct path link; struct delayed_call done; const char *name; unsigned seq; } *stack, internal[EMBEDDED_LEVELS]; struct filename *name; const char *pathname; struct nameidata *saved; unsigned root_seq; int dfd; vfsuid_t dir_vfsuid; umode_t dir_mode; } __randomize_layout; #define ND_ROOT_PRESET 1 #define ND_ROOT_GRABBED 2 #define ND_JUMPED 4 static void __set_nameidata(struct nameidata *p, int dfd, struct filename *name) { struct nameidata *old = current->nameidata; p->stack = p->internal; p->depth = 0; p->dfd = dfd; p->name = name; p->pathname = likely(name) ? name->name : ""; p->path.mnt = NULL; p->path.dentry = NULL; p->total_link_count = old ? old->total_link_count : 0; p->saved = old; current->nameidata = p; } static inline void set_nameidata(struct nameidata *p, int dfd, struct filename *name, const struct path *root) { __set_nameidata(p, dfd, name); p->state = 0; if (unlikely(root)) { p->state = ND_ROOT_PRESET; p->root = *root; } } static void restore_nameidata(void) { struct nameidata *now = current->nameidata, *old = now->saved; current->nameidata = old; if (old) old->total_link_count = now->total_link_count; if (now->stack != now->internal) kfree(now->stack); } static bool nd_alloc_stack(struct nameidata *nd) { struct saved *p; p= kmalloc_array(MAXSYMLINKS, sizeof(struct saved), nd->flags & LOOKUP_RCU ? GFP_ATOMIC : GFP_KERNEL); if (unlikely(!p)) return false; memcpy(p, nd->internal, sizeof(nd->internal)); nd->stack = p; return true; } /** * path_connected - Verify that a dentry is below mnt.mnt_root * @mnt: The mountpoint to check. * @dentry: The dentry to check. * * Rename can sometimes move a file or directory outside of a bind * mount, path_connected allows those cases to be detected. 
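 *
 * [Editorial note: an illustrative, stand-alone userspace sketch is
 *  inserted below; it is not part of fs/namei.c.]
 */

/*
 * Sketch: why struct nameidata above carries a bounded stack of saved
 * links and a total_link_count.  Two symlinks pointing at each other can
 * never resolve, and the walk gives up with ELOOP once the limit is hit.
 * The scratch names "loop-a"/"loop-b" are made up for this demo.
 */
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd;

	/* Build a two-entry symlink cycle in the current directory. */
	(void)unlink("loop-a");
	(void)unlink("loop-b");
	if (symlink("loop-b", "loop-a") || symlink("loop-a", "loop-b"))
		return 1;

	fd = open("loop-a", O_RDONLY);
	if (fd < 0 && errno == ELOOP)
		printf("open(\"loop-a\") failed with ELOOP, as expected\n");
	else if (fd >= 0)
		close(fd);

	(void)unlink("loop-a");
	(void)unlink("loop-b");
	return 0;
}

/*
 * [End of editorial sketch.  The original comment for path_connected()
 *  resumes here:]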
*/ static bool path_connected(struct vfsmount *mnt, struct dentry *dentry) { struct super_block *sb = mnt->mnt_sb; /* Bind mounts can have disconnected paths */ if (mnt->mnt_root == sb->s_root) return true; return is_subdir(dentry, mnt->mnt_root); } static void drop_links(struct nameidata *nd) { int i = nd->depth; while (i--) { struct saved *last = nd->stack + i; do_delayed_call(&last->done); clear_delayed_call(&last->done); } } static void leave_rcu(struct nameidata *nd) { nd->flags &= ~LOOKUP_RCU; nd->seq = nd->next_seq = 0; rcu_read_unlock(); } static void terminate_walk(struct nameidata *nd) { drop_links(nd); if (!(nd->flags & LOOKUP_RCU)) { int i; path_put(&nd->path); for (i = 0; i < nd->depth; i++) path_put(&nd->stack[i].link); if (nd->state & ND_ROOT_GRABBED) { path_put(&nd->root); nd->state &= ~ND_ROOT_GRABBED; } } else { leave_rcu(nd); } nd->depth = 0; nd->path.mnt = NULL; nd->path.dentry = NULL; } /* path_put is needed afterwards regardless of success or failure */ static bool __legitimize_path(struct path *path, unsigned seq, unsigned mseq) { int res = __legitimize_mnt(path->mnt, mseq); if (unlikely(res)) { if (res > 0) path->mnt = NULL; path->dentry = NULL; return false; } if (unlikely(!lockref_get_not_dead(&path->dentry->d_lockref))) { path->dentry = NULL; return false; } return !read_seqcount_retry(&path->dentry->d_seq, seq); } static inline bool legitimize_path(struct nameidata *nd, struct path *path, unsigned seq) { return __legitimize_path(path, seq, nd->m_seq); } static bool legitimize_links(struct nameidata *nd) { int i; if (unlikely(nd->flags & LOOKUP_CACHED)) { drop_links(nd); nd->depth = 0; return false; } for (i = 0; i < nd->depth; i++) { struct saved *last = nd->stack + i; if (unlikely(!legitimize_path(nd, &last->link, last->seq))) { drop_links(nd); nd->depth = i + 1; return false; } } return true; } static bool legitimize_root(struct nameidata *nd) { /* Nothing to do if nd->root is zero or is managed by the VFS user. */ if (!nd->root.mnt || (nd->state & ND_ROOT_PRESET)) return true; nd->state |= ND_ROOT_GRABBED; return legitimize_path(nd, &nd->root, nd->root_seq); } /* * Path walking has 2 modes, rcu-walk and ref-walk (see * Documentation/filesystems/path-lookup.txt). In situations when we can't * continue in RCU mode, we attempt to drop out of rcu-walk mode and grab * normal reference counts on dentries and vfsmounts to transition to ref-walk * mode. Refcounts are grabbed at the last known good point before rcu-walk * got stuck, so ref-walk may continue from there. If this is not successful * (eg. a seqcount has changed), then failure is returned and it's up to caller * to restart the path walk from the beginning in ref-walk mode. */ /** * try_to_unlazy - try to switch to ref-walk mode. * @nd: nameidata pathwalk data * Returns: true on success, false on failure * * try_to_unlazy attempts to legitimize the current nd->path and nd->root * for ref-walk mode. * Must be called from rcu-walk context. * Nothing should touch nameidata between try_to_unlazy() failure and * terminate_walk(). 
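 *
 * [Editorial note: an illustrative, stand-alone userspace sketch is
 *  inserted below; it is not part of fs/namei.c.]
 */

/*
 * Sketch: a toy userspace model of the sequence-count retry pattern that
 * __legitimize_path() and the rcu-walk code above rely on via
 * read_seqcount_begin()/read_seqcount_retry().  All names here are made
 * up for the model, and this is a simplified analogue, not the kernel's
 * seqcount implementation.  Build with -pthread.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static _Atomic unsigned int seq;		/* odd while a write is in progress */
static _Atomic unsigned long a = 0, b = ~0UL;	/* invariant: b == ~a */

static void write_pair(unsigned long v)
{
	atomic_fetch_add(&seq, 1);	/* odd: write in progress */
	a = v;
	b = ~v;
	atomic_fetch_add(&seq, 1);	/* even: stable again */
}

static void read_pair(unsigned long *x, unsigned long *y)
{
	unsigned int start;

	for (;;) {
		start = atomic_load(&seq);
		if (start & 1)
			continue;		/* a write is in progress: retry */
		*x = a;
		*y = b;
		if (atomic_load(&seq) == start)
			return;			/* nothing changed while reading */
	}
}

static void *writer(void *arg)
{
	(void)arg;
	for (unsigned long i = 0; i < 100000; i++)
		write_pair(i);
	return NULL;
}

int main(void)
{
	pthread_t t;
	unsigned long x, y;

	if (pthread_create(&t, NULL, writer, NULL))
		return 1;
	for (int i = 0; i < 100000; i++) {
		read_pair(&x, &y);
		if (y != ~x) {
			puts("torn read (should not happen)");
			return 1;
		}
	}
	pthread_join(t, NULL);
	puts("all reads observed a consistent pair");
	return 0;
}

/*
 * [End of editorial sketch.  The original comment for try_to_unlazy()
 *  resumes here:]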
*/ static bool try_to_unlazy(struct nameidata *nd) { struct dentry *parent = nd->path.dentry; BUG_ON(!(nd->flags & LOOKUP_RCU)); if (unlikely(!legitimize_links(nd))) goto out1; if (unlikely(!legitimize_path(nd, &nd->path, nd->seq))) goto out; if (unlikely(!legitimize_root(nd))) goto out; leave_rcu(nd); BUG_ON(nd->inode != parent->d_inode); return true; out1: nd->path.mnt = NULL; nd->path.dentry = NULL; out: leave_rcu(nd); return false; } /** * try_to_unlazy_next - try to switch to ref-walk mode. * @nd: nameidata pathwalk data * @dentry: next dentry to step into * Returns: true on success, false on failure * * Similar to try_to_unlazy(), but here we have the next dentry already * picked by rcu-walk and want to legitimize that in addition to the current * nd->path and nd->root for ref-walk mode. Must be called from rcu-walk context. * Nothing should touch nameidata between try_to_unlazy_next() failure and * terminate_walk(). */ static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry) { int res; BUG_ON(!(nd->flags & LOOKUP_RCU)); if (unlikely(!legitimize_links(nd))) goto out2; res = __legitimize_mnt(nd->path.mnt, nd->m_seq); if (unlikely(res)) { if (res > 0) goto out2; goto out1; } if (unlikely(!lockref_get_not_dead(&nd->path.dentry->d_lockref))) goto out1; /* * We need to move both the parent and the dentry from the RCU domain * to be properly refcounted. And the sequence number in the dentry * validates *both* dentry counters, since we checked the sequence * number of the parent after we got the child sequence number. So we * know the parent must still be valid if the child sequence number is */ if (unlikely(!lockref_get_not_dead(&dentry->d_lockref))) goto out; if (read_seqcount_retry(&dentry->d_seq, nd->next_seq)) goto out_dput; /* * Sequence counts matched. Now make sure that the root is * still valid and get it if required. */ if (unlikely(!legitimize_root(nd))) goto out_dput; leave_rcu(nd); return true; out2: nd->path.mnt = NULL; out1: nd->path.dentry = NULL; out: leave_rcu(nd); return false; out_dput: leave_rcu(nd); dput(dentry); return false; } static inline int d_revalidate(struct dentry *dentry, unsigned int flags) { if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) return dentry->d_op->d_revalidate(dentry, flags); else return 1; } /** * complete_walk - successful completion of path walk * @nd: pointer nameidata * * If we had been in RCU mode, drop out of it and legitimize nd->path. * Revalidate the final result, unless we'd already done that during * the path walk or the filesystem doesn't ask for it. Return 0 on * success, -error on failure. In case of failure caller does not * need to drop nd->path. */ static int complete_walk(struct nameidata *nd) { struct dentry *dentry = nd->path.dentry; int status; if (nd->flags & LOOKUP_RCU) { /* * We don't want to zero nd->root for scoped-lookups or * externally-managed nd->root. */ if (!(nd->state & ND_ROOT_PRESET)) if (!(nd->flags & LOOKUP_IS_SCOPED)) nd->root.mnt = NULL; nd->flags &= ~LOOKUP_CACHED; if (!try_to_unlazy(nd)) return -ECHILD; } if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) { /* * While the guarantee of LOOKUP_IS_SCOPED is (roughly) "don't * ever step outside the root during lookup" and should already * be guaranteed by the rest of namei, we want to avoid a namei * BUG resulting in userspace being given a path that was not * scoped within the root at some point during the lookup. 
* * So, do a final sanity-check to make sure that in the * worst-case scenario (a complete bypass of LOOKUP_IS_SCOPED) * we won't silently return an fd completely outside of the * requested root to userspace. * * Userspace could move the path outside the root after this * check, but as discussed elsewhere this is not a concern (the * resolved file was inside the root at some point). */ if (!path_is_under(&nd->path, &nd->root)) return -EXDEV; } if (likely(!(nd->state & ND_JUMPED))) return 0; if (likely(!(dentry->d_flags & DCACHE_OP_WEAK_REVALIDATE))) return 0; status = dentry->d_op->d_weak_revalidate(dentry, nd->flags); if (status > 0) return 0; if (!status) status = -ESTALE; return status; } static int set_root(struct nameidata *nd) { struct fs_struct *fs = current->fs; /* * Jumping to the real root in a scoped-lookup is a BUG in namei, but we * still have to ensure it doesn't happen because it will cause a breakout * from the dirfd. */ if (WARN_ON(nd->flags & LOOKUP_IS_SCOPED)) return -ENOTRECOVERABLE; if (nd->flags & LOOKUP_RCU) { unsigned seq; do { seq = read_seqcount_begin(&fs->seq); nd->root = fs->root; nd->root_seq = __read_seqcount_begin(&nd->root.dentry->d_seq); } while (read_seqcount_retry(&fs->seq, seq)); } else { get_fs_root(fs, &nd->root); nd->state |= ND_ROOT_GRABBED; } return 0; } static int nd_jump_root(struct nameidata *nd) { if (unlikely(nd->flags & LOOKUP_BENEATH)) return -EXDEV; if (unlikely(nd->flags & LOOKUP_NO_XDEV)) { /* Absolute path arguments to path_init() are allowed. */ if (nd->path.mnt != NULL && nd->path.mnt != nd->root.mnt) return -EXDEV; } if (!nd->root.mnt) { int error = set_root(nd); if (error) return error; } if (nd->flags & LOOKUP_RCU) { struct dentry *d; nd->path = nd->root; d = nd->path.dentry; nd->inode = d->d_inode; nd->seq = nd->root_seq; if (read_seqcount_retry(&d->d_seq, nd->seq)) return -ECHILD; } else { path_put(&nd->path); nd->path = nd->root; path_get(&nd->path); nd->inode = nd->path.dentry->d_inode; } nd->state |= ND_JUMPED; return 0; } /* * Helper to directly jump to a known parsed path from ->get_link, * caller must have taken a reference to path beforehand. */ int nd_jump_link(const struct path *path) { int error = -ELOOP; struct nameidata *nd = current->nameidata; if (unlikely(nd->flags & LOOKUP_NO_MAGICLINKS)) goto err; error = -EXDEV; if (unlikely(nd->flags & LOOKUP_NO_XDEV)) { if (nd->path.mnt != path->mnt) goto err; } /* Not currently safe for scoped-lookups. 
*/ if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) goto err; path_put(&nd->path); nd->path = *path; nd->inode = nd->path.dentry->d_inode; nd->state |= ND_JUMPED; return 0; err: path_put(path); return error; } static inline void put_link(struct nameidata *nd) { struct saved *last = nd->stack + --nd->depth; do_delayed_call(&last->done); if (!(nd->flags & LOOKUP_RCU)) path_put(&last->link); } static int sysctl_protected_symlinks __read_mostly; static int sysctl_protected_hardlinks __read_mostly; static int sysctl_protected_fifos __read_mostly; static int sysctl_protected_regular __read_mostly; #ifdef CONFIG_SYSCTL static struct ctl_table namei_sysctls[] = { { .procname = "protected_symlinks", .data = &sysctl_protected_symlinks, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "protected_hardlinks", .data = &sysctl_protected_hardlinks, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "protected_fifos", .data = &sysctl_protected_fifos, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, { .procname = "protected_regular", .data = &sysctl_protected_regular, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, }; static int __init init_fs_namei_sysctls(void) { register_sysctl_init("fs", namei_sysctls); return 0; } fs_initcall(init_fs_namei_sysctls); #endif /* CONFIG_SYSCTL */ /** * may_follow_link - Check symlink following for unsafe situations * @nd: nameidata pathwalk data * @inode: Used for idmapping. * * In the case of the sysctl_protected_symlinks sysctl being enabled, * CAP_DAC_OVERRIDE needs to be specifically ignored if the symlink is * in a sticky world-writable directory. This is to protect privileged * processes from failing races against path names that may change out * from under them by way of other users creating malicious symlinks. * It will permit symlinks to be followed only when outside a sticky * world-writable directory, or when the uid of the symlink and follower * match, or when the directory owner matches the symlink's owner. * * Returns 0 if following the symlink is allowed, -ve on error. */ static inline int may_follow_link(struct nameidata *nd, const struct inode *inode) { struct mnt_idmap *idmap; vfsuid_t vfsuid; if (!sysctl_protected_symlinks) return 0; idmap = mnt_idmap(nd->path.mnt); vfsuid = i_uid_into_vfsuid(idmap, inode); /* Allowed if owner and follower match. */ if (vfsuid_eq_kuid(vfsuid, current_fsuid())) return 0; /* Allowed if parent directory not sticky and world-writable. */ if ((nd->dir_mode & (S_ISVTX|S_IWOTH)) != (S_ISVTX|S_IWOTH)) return 0; /* Allowed if parent directory and link owner match. */ if (vfsuid_valid(nd->dir_vfsuid) && vfsuid_eq(nd->dir_vfsuid, vfsuid)) return 0; if (nd->flags & LOOKUP_RCU) return -ECHILD; audit_inode(nd->name, nd->stack[0].link.dentry, 0); audit_log_path_denied(AUDIT_ANOM_LINK, "follow_link"); return -EACCES; } /** * safe_hardlink_source - Check for safe hardlink conditions * @idmap: idmap of the mount the inode was found from * @inode: the source inode to hardlink from * * Return false if at least one of the following conditions: * - inode is not a regular file * - inode is setuid * - inode is setgid and group-exec * - access failure for read and write * * Otherwise returns true. 
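 *
 * [Editorial note: an illustrative, stand-alone userspace sketch is
 *  inserted below; it is not part of fs/namei.c.]
 */

/*
 * Sketch: the four fs.protected_* policy knobs registered by the sysctl
 * table above are visible to userspace under /proc/sys/fs/.  This simply
 * prints their current values.
 */
#include <stdio.h>

static int read_knob(const char *path)
{
	FILE *f = fopen(path, "r");
	int val = -1;

	if (!f)
		return -1;
	if (fscanf(f, "%d", &val) != 1)
		val = -1;
	fclose(f);
	return val;
}

int main(void)
{
	printf("fs.protected_symlinks  = %d\n",
	       read_knob("/proc/sys/fs/protected_symlinks"));
	printf("fs.protected_hardlinks = %d\n",
	       read_knob("/proc/sys/fs/protected_hardlinks"));
	printf("fs.protected_fifos     = %d\n",
	       read_knob("/proc/sys/fs/protected_fifos"));
	printf("fs.protected_regular   = %d\n",
	       read_knob("/proc/sys/fs/protected_regular"));
	return 0;
}

/*
 * [End of editorial sketch.  The original comment for
 *  safe_hardlink_source() resumes here:]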
*/ static bool safe_hardlink_source(struct mnt_idmap *idmap, struct inode *inode) { umode_t mode = inode->i_mode; /* Special files should not get pinned to the filesystem. */ if (!S_ISREG(mode)) return false; /* Setuid files should not get pinned to the filesystem. */ if (mode & S_ISUID) return false; /* Executable setgid files should not get pinned to the filesystem. */ if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) return false; /* Hardlinking to unreadable or unwritable sources is dangerous. */ if (inode_permission(idmap, inode, MAY_READ | MAY_WRITE)) return false; return true; } /** * may_linkat - Check permissions for creating a hardlink * @idmap: idmap of the mount the inode was found from * @link: the source to hardlink from * * Block hardlink when all of: * - sysctl_protected_hardlinks enabled * - fsuid does not match inode * - hardlink source is unsafe (see safe_hardlink_source() above) * - not CAP_FOWNER in a namespace with the inode owner uid mapped * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. * * Returns 0 if successful, -ve on error. */ int may_linkat(struct mnt_idmap *idmap, const struct path *link) { struct inode *inode = link->dentry->d_inode; /* Inode writeback is not safe when the uid or gid are invalid. */ if (!vfsuid_valid(i_uid_into_vfsuid(idmap, inode)) || !vfsgid_valid(i_gid_into_vfsgid(idmap, inode))) return -EOVERFLOW; if (!sysctl_protected_hardlinks) return 0; /* Source inode owner (or CAP_FOWNER) can hardlink all they like, * otherwise, it must be a safe source. */ if (safe_hardlink_source(idmap, inode) || inode_owner_or_capable(idmap, inode)) return 0; audit_log_path_denied(AUDIT_ANOM_LINK, "linkat"); return -EPERM; } /** * may_create_in_sticky - Check whether an O_CREAT open in a sticky directory * should be allowed, or not, on files that already * exist. * @idmap: idmap of the mount the inode was found from * @nd: nameidata pathwalk data * @inode: the inode of the file to open * * Block an O_CREAT open of a FIFO (or a regular file) when: * - sysctl_protected_fifos (or sysctl_protected_regular) is enabled * - the file already exists * - we are in a sticky directory * - we don't own the file * - the owner of the directory doesn't own the file * - the directory is world writable * If the sysctl_protected_fifos (or sysctl_protected_regular) is set to 2 * the directory doesn't have to be world writable: being group writable will * be enough. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. * * Returns 0 if the open is allowed, -ve on error. 
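 *
 * [Editorial note: an illustrative, stand-alone userspace sketch is
 *  inserted below; it is not part of fs/namei.c.]
 */

/*
 * Sketch: a stand-alone restatement of the mode-bit checks made by
 * safe_hardlink_source() above, applied to a few sample modes.  The
 * inode_permission(MAY_READ | MAY_WRITE) part of the real check is
 * deliberately left out, and the helper name is made up for this demo.
 */
#include <stdbool.h>
#include <stdio.h>
#include <sys/stat.h>

static bool mode_is_safe_hardlink_source(mode_t mode)
{
	if (!S_ISREG(mode))
		return false;	/* special files should not get pinned */
	if (mode & S_ISUID)
		return false;	/* setuid */
	if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
		return false;	/* executable setgid */
	return true;
}

int main(void)
{
	const mode_t samples[] = {
		S_IFREG | 0644,			/* plain file: safe */
		S_IFREG | S_ISUID | 0755,	/* setuid: unsafe */
		S_IFREG | S_ISGID | 0710,	/* setgid + group-exec: unsafe */
		S_IFREG | S_ISGID | 0640,	/* setgid, no group-exec: safe */
		S_IFIFO | 0644,			/* FIFO: unsafe */
	};

	for (size_t i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("mode %07o -> %s\n", (unsigned int)samples[i],
		       mode_is_safe_hardlink_source(samples[i]) ?
		       "safe" : "unsafe");
	return 0;
}

/*
 * [End of editorial sketch.  The original comment for
 *  may_create_in_sticky() resumes here:]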
*/ static int may_create_in_sticky(struct mnt_idmap *idmap, struct nameidata *nd, struct inode *const inode) { umode_t dir_mode = nd->dir_mode; vfsuid_t dir_vfsuid = nd->dir_vfsuid, i_vfsuid; if (likely(!(dir_mode & S_ISVTX))) return 0; if (S_ISREG(inode->i_mode) && !sysctl_protected_regular) return 0; if (S_ISFIFO(inode->i_mode) && !sysctl_protected_fifos) return 0; i_vfsuid = i_uid_into_vfsuid(idmap, inode); if (vfsuid_eq(i_vfsuid, dir_vfsuid)) return 0; if (vfsuid_eq_kuid(i_vfsuid, current_fsuid())) return 0; if (likely(dir_mode & 0002)) { audit_log_path_denied(AUDIT_ANOM_CREAT, "sticky_create"); return -EACCES; } if (dir_mode & 0020) { if (sysctl_protected_fifos >= 2 && S_ISFIFO(inode->i_mode)) { audit_log_path_denied(AUDIT_ANOM_CREAT, "sticky_create_fifo"); return -EACCES; } if (sysctl_protected_regular >= 2 && S_ISREG(inode->i_mode)) { audit_log_path_denied(AUDIT_ANOM_CREAT, "sticky_create_regular"); return -EACCES; } } return 0; } /* * follow_up - Find the mountpoint of path's vfsmount * * Given a path, find the mountpoint of its source file system. * Replace @path with the path of the mountpoint in the parent mount. * Up is towards /. * * Return 1 if we went up a level and 0 if we were already at the * root. */ int follow_up(struct path *path) { struct mount *mnt = real_mount(path->mnt); struct mount *parent; struct dentry *mountpoint; read_seqlock_excl(&mount_lock); parent = mnt->mnt_parent; if (parent == mnt) { read_sequnlock_excl(&mount_lock); return 0; } mntget(&parent->mnt); mountpoint = dget(mnt->mnt_mountpoint); read_sequnlock_excl(&mount_lock); dput(path->dentry); path->dentry = mountpoint; mntput(path->mnt); path->mnt = &parent->mnt; return 1; } EXPORT_SYMBOL(follow_up); static bool choose_mountpoint_rcu(struct mount *m, const struct path *root, struct path *path, unsigned *seqp) { while (mnt_has_parent(m)) { struct dentry *mountpoint = m->mnt_mountpoint; m = m->mnt_parent; if (unlikely(root->dentry == mountpoint && root->mnt == &m->mnt)) break; if (mountpoint != m->mnt.mnt_root) { path->mnt = &m->mnt; path->dentry = mountpoint; *seqp = read_seqcount_begin(&mountpoint->d_seq); return true; } } return false; } static bool choose_mountpoint(struct mount *m, const struct path *root, struct path *path) { bool found; rcu_read_lock(); while (1) { unsigned seq, mseq = read_seqbegin(&mount_lock); found = choose_mountpoint_rcu(m, root, path, &seq); if (unlikely(!found)) { if (!read_seqretry(&mount_lock, mseq)) break; } else { if (likely(__legitimize_path(path, seq, mseq))) break; rcu_read_unlock(); path_put(path); rcu_read_lock(); } } rcu_read_unlock(); return found; } /* * Perform an automount * - return -EISDIR to tell follow_managed() to stop and return the path we * were called with. */ static int follow_automount(struct path *path, int *count, unsigned lookup_flags) { struct dentry *dentry = path->dentry; /* We don't want to mount if someone's just doing a stat - * unless they're stat'ing a directory and appended a '/' to * the name. * * We do, however, want to mount if someone wants to open or * create a file of any type under the mountpoint, wants to * traverse through the mountpoint or wants to open the * mounted directory. Also, autofs may mark negative dentries * as being automount points. These will need the attentions * of the daemon to instantiate them before they can be used. 
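 *
 * Summarized (this mirrors the checks that follow): a plain stat-style
 * lookup of a positive dentry - none of LOOKUP_PARENT, LOOKUP_DIRECTORY,
 * LOOKUP_OPEN, LOOKUP_CREATE or LOOKUP_AUTOMOUNT set - returns -EISDIR
 * without triggering the automount; everything else goes on to
 * ->d_automount()/finish_automount(), subject to the MAXSYMLINKS cap
 * on *count.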
*/ if (!(lookup_flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY | LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) && dentry->d_inode) return -EISDIR; if (count && (*count)++ >= MAXSYMLINKS) return -ELOOP; return finish_automount(dentry->d_op->d_automount(path), path); } /* * mount traversal - out-of-line part. One note on ->d_flags accesses - * dentries are pinned but not locked here, so negative dentry can go * positive right under us. Use of smp_load_acquire() provides a barrier * sufficient for ->d_inode and ->d_flags consistency. */ static int __traverse_mounts(struct path *path, unsigned flags, bool *jumped, int *count, unsigned lookup_flags) { struct vfsmount *mnt = path->mnt; bool need_mntput = false; int ret = 0; while (flags & DCACHE_MANAGED_DENTRY) { /* Allow the filesystem to manage the transit without i_mutex * being held. */ if (flags & DCACHE_MANAGE_TRANSIT) { ret = path->dentry->d_op->d_manage(path, false); flags = smp_load_acquire(&path->dentry->d_flags); if (ret < 0) break; } if (flags & DCACHE_MOUNTED) { // something's mounted on it.. struct vfsmount *mounted = lookup_mnt(path); if (mounted) { // ... in our namespace dput(path->dentry); if (need_mntput) mntput(path->mnt); path->mnt = mounted; path->dentry = dget(mounted->mnt_root); // here we know it's positive flags = path->dentry->d_flags; need_mntput = true; continue; } } if (!(flags & DCACHE_NEED_AUTOMOUNT)) break; // uncovered automount point ret = follow_automount(path, count, lookup_flags); flags = smp_load_acquire(&path->dentry->d_flags); if (ret < 0) break; } if (ret == -EISDIR) ret = 0; // possible if you race with several mount --move if (need_mntput && path->mnt == mnt) mntput(path->mnt); if (!ret && unlikely(d_flags_negative(flags))) ret = -ENOENT; *jumped = need_mntput; return ret; } static inline int traverse_mounts(struct path *path, bool *jumped, int *count, unsigned lookup_flags) { unsigned flags = smp_load_acquire(&path->dentry->d_flags); /* fastpath */ if (likely(!(flags & DCACHE_MANAGED_DENTRY))) { *jumped = false; if (unlikely(d_flags_negative(flags))) return -ENOENT; return 0; } return __traverse_mounts(path, flags, jumped, count, lookup_flags); } int follow_down_one(struct path *path) { struct vfsmount *mounted; mounted = lookup_mnt(path); if (mounted) { dput(path->dentry); mntput(path->mnt); path->mnt = mounted; path->dentry = dget(mounted->mnt_root); return 1; } return 0; } EXPORT_SYMBOL(follow_down_one); /* * Follow down to the covering mount currently visible to userspace. At each * point, the filesystem owning that dentry may be queried as to whether the * caller is permitted to proceed or not. */ int follow_down(struct path *path, unsigned int flags) { struct vfsmount *mnt = path->mnt; bool jumped; int ret = traverse_mounts(path, &jumped, NULL, flags); if (path->mnt != mnt) mntput(mnt); return ret; } EXPORT_SYMBOL(follow_down); /* * Try to skip to top of mountpoint pile in rcuwalk mode. Fail if * we meet a managed dentry that would need blocking. */ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path) { struct dentry *dentry = path->dentry; unsigned int flags = dentry->d_flags; if (likely(!(flags & DCACHE_MANAGED_DENTRY))) return true; if (unlikely(nd->flags & LOOKUP_NO_XDEV)) return false; for (;;) { /* * Don't forget we might have a non-mountpoint managed dentry * that wants to block transit. 
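 *
 * The ->d_manage(path, true) return convention relied upon just below,
 * spelled out:
 *
 *	 0       - nothing to wait for, keep walking in RCU mode
 *	 -EISDIR - stop traversing managed state, treat as a plain directory
 *	 other   - would have to block, so return false and make the caller
 *	           drop out of RCU mode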
*/ if (unlikely(flags & DCACHE_MANAGE_TRANSIT)) { int res = dentry->d_op->d_manage(path, true); if (res) return res == -EISDIR; flags = dentry->d_flags; } if (flags & DCACHE_MOUNTED) { struct mount *mounted = __lookup_mnt(path->mnt, dentry); if (mounted) { path->mnt = &mounted->mnt; dentry = path->dentry = mounted->mnt.mnt_root; nd->state |= ND_JUMPED; nd->next_seq = read_seqcount_begin(&dentry->d_seq); flags = dentry->d_flags; // makes sure that non-RCU pathwalk could reach // this state. if (read_seqretry(&mount_lock, nd->m_seq)) return false; continue; } if (read_seqretry(&mount_lock, nd->m_seq)) return false; } return !(flags & DCACHE_NEED_AUTOMOUNT); } } static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry, struct path *path) { bool jumped; int ret; path->mnt = nd->path.mnt; path->dentry = dentry; if (nd->flags & LOOKUP_RCU) { unsigned int seq = nd->next_seq; if (likely(__follow_mount_rcu(nd, path))) return 0; // *path and nd->next_seq might've been clobbered path->mnt = nd->path.mnt; path->dentry = dentry; nd->next_seq = seq; if (!try_to_unlazy_next(nd, dentry)) return -ECHILD; } ret = traverse_mounts(path, &jumped, &nd->total_link_count, nd->flags); if (jumped) { if (unlikely(nd->flags & LOOKUP_NO_XDEV)) ret = -EXDEV; else nd->state |= ND_JUMPED; } if (unlikely(ret)) { dput(path->dentry); if (path->mnt != nd->path.mnt) mntput(path->mnt); } return ret; } /* * This looks up the name in dcache and possibly revalidates the found dentry. * NULL is returned if the dentry does not exist in the cache. */ static struct dentry *lookup_dcache(const struct qstr *name, struct dentry *dir, unsigned int flags) { struct dentry *dentry = d_lookup(dir, name); if (dentry) { int error = d_revalidate(dentry, flags); if (unlikely(error <= 0)) { if (!error) d_invalidate(dentry); dput(dentry); return ERR_PTR(error); } } return dentry; } /* * Parent directory has inode locked exclusive. This is one * and only case when ->lookup() gets called on non in-lookup * dentries - as the matter of fact, this only gets called * when directory is guaranteed to have no in-lookup children * at all. */ struct dentry *lookup_one_qstr_excl(const struct qstr *name, struct dentry *base, unsigned int flags) { struct dentry *dentry = lookup_dcache(name, base, flags); struct dentry *old; struct inode *dir = base->d_inode; if (dentry) return dentry; /* Don't create child dentry for a dead directory. */ if (unlikely(IS_DEADDIR(dir))) return ERR_PTR(-ENOENT); dentry = d_alloc(base, name); if (unlikely(!dentry)) return ERR_PTR(-ENOMEM); old = dir->i_op->lookup(dir, dentry, flags); if (unlikely(old)) { dput(dentry); dentry = old; } return dentry; } EXPORT_SYMBOL(lookup_one_qstr_excl); /** * lookup_fast - do fast lockless (but racy) lookup of a dentry * @nd: current nameidata * * Do a fast, but racy lookup in the dcache for the given dentry, and * revalidate it. Returns a valid dentry pointer or NULL if one wasn't * found. On error, an ERR_PTR will be returned. * * If this function returns a valid dentry and the walk is no longer * lazy, the dentry will carry a reference that must later be put. If * RCU mode is still in force, then this is not the case and the dentry * must be legitimized before use. If this returns NULL, then the walk * will no longer be in RCU mode. 
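 *
 * Caller pattern (see walk_component() further down):
 *
 *	dentry = lookup_fast(nd);
 *	if (IS_ERR(dentry))
 *		return ERR_CAST(dentry);
 *	if (!dentry)	// not in the dcache; we are out of RCU mode now
 *		dentry = lookup_slow(&nd->last, nd->path.dentry, nd->flags);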
*/ static struct dentry *lookup_fast(struct nameidata *nd) { struct dentry *dentry, *parent = nd->path.dentry; int status = 1; /* * Rename seqlock is not required here because in the off chance * of a false negative due to a concurrent rename, the caller is * going to fall back to non-racy lookup. */ if (nd->flags & LOOKUP_RCU) { dentry = __d_lookup_rcu(parent, &nd->last, &nd->next_seq); if (unlikely(!dentry)) { if (!try_to_unlazy(nd)) return ERR_PTR(-ECHILD); return NULL; } /* * This sequence count validates that the parent had no * changes while we did the lookup of the dentry above. */ if (read_seqcount_retry(&parent->d_seq, nd->seq)) return ERR_PTR(-ECHILD); status = d_revalidate(dentry, nd->flags); if (likely(status > 0)) return dentry; if (!try_to_unlazy_next(nd, dentry)) return ERR_PTR(-ECHILD); if (status == -ECHILD) /* we'd been told to redo it in non-rcu mode */ status = d_revalidate(dentry, nd->flags); } else { dentry = __d_lookup(parent, &nd->last); if (unlikely(!dentry)) return NULL; status = d_revalidate(dentry, nd->flags); } if (unlikely(status <= 0)) { if (!status) d_invalidate(dentry); dput(dentry); return ERR_PTR(status); } return dentry; } /* Fast lookup failed, do it the slow way */ static struct dentry *__lookup_slow(const struct qstr *name, struct dentry *dir, unsigned int flags) { struct dentry *dentry, *old; struct inode *inode = dir->d_inode; DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); /* Don't go there if it's already dead */ if (unlikely(IS_DEADDIR(inode))) return ERR_PTR(-ENOENT); again: dentry = d_alloc_parallel(dir, name, &wq); if (IS_ERR(dentry)) return dentry; if (unlikely(!d_in_lookup(dentry))) { int error = d_revalidate(dentry, flags); if (unlikely(error <= 0)) { if (!error) { d_invalidate(dentry); dput(dentry); goto again; } dput(dentry); dentry = ERR_PTR(error); } } else { old = inode->i_op->lookup(inode, dentry, flags); d_lookup_done(dentry); if (unlikely(old)) { dput(dentry); dentry = old; } } return dentry; } static struct dentry *lookup_slow(const struct qstr *name, struct dentry *dir, unsigned int flags) { struct inode *inode = dir->d_inode; struct dentry *res; inode_lock_shared(inode); res = __lookup_slow(name, dir, flags); inode_unlock_shared(inode); return res; } static inline int may_lookup(struct mnt_idmap *idmap, struct nameidata *restrict nd) { int err, mask; mask = nd->flags & LOOKUP_RCU ? MAY_NOT_BLOCK : 0; err = inode_permission(idmap, nd->inode, mask | MAY_EXEC); if (likely(!err)) return 0; // If we failed, and we weren't in LOOKUP_RCU, it's final if (!(nd->flags & LOOKUP_RCU)) return err; // Drop out of RCU mode to make sure it wasn't transient if (!try_to_unlazy(nd)) return -ECHILD; // redo it all non-lazy if (err != -ECHILD) // hard error return err; return inode_permission(idmap, nd->inode, MAY_EXEC); } static int reserve_stack(struct nameidata *nd, struct path *link) { if (unlikely(nd->total_link_count++ >= MAXSYMLINKS)) return -ELOOP; if (likely(nd->depth != EMBEDDED_LEVELS)) return 0; if (likely(nd->stack != nd->internal)) return 0; if (likely(nd_alloc_stack(nd))) return 0; if (nd->flags & LOOKUP_RCU) { // we need to grab link before we do unlazy. 
And we can't skip // unlazy even if we fail to grab the link - cleanup needs it bool grabbed_link = legitimize_path(nd, link, nd->next_seq); if (!try_to_unlazy(nd) || !grabbed_link) return -ECHILD; if (nd_alloc_stack(nd)) return 0; } return -ENOMEM; } enum {WALK_TRAILING = 1, WALK_MORE = 2, WALK_NOFOLLOW = 4}; static const char *pick_link(struct nameidata *nd, struct path *link, struct inode *inode, int flags) { struct saved *last; const char *res; int error = reserve_stack(nd, link); if (unlikely(error)) { if (!(nd->flags & LOOKUP_RCU)) path_put(link); return ERR_PTR(error); } last = nd->stack + nd->depth++; last->link = *link; clear_delayed_call(&last->done); last->seq = nd->next_seq; if (flags & WALK_TRAILING) { error = may_follow_link(nd, inode); if (unlikely(error)) return ERR_PTR(error); } if (unlikely(nd->flags & LOOKUP_NO_SYMLINKS) || unlikely(link->mnt->mnt_flags & MNT_NOSYMFOLLOW)) return ERR_PTR(-ELOOP); if (!(nd->flags & LOOKUP_RCU)) { touch_atime(&last->link); cond_resched(); } else if (atime_needs_update(&last->link, inode)) { if (!try_to_unlazy(nd)) return ERR_PTR(-ECHILD); touch_atime(&last->link); } error = security_inode_follow_link(link->dentry, inode, nd->flags & LOOKUP_RCU); if (unlikely(error)) return ERR_PTR(error); res = READ_ONCE(inode->i_link); if (!res) { const char * (*get)(struct dentry *, struct inode *, struct delayed_call *); get = inode->i_op->get_link; if (nd->flags & LOOKUP_RCU) { res = get(NULL, inode, &last->done); if (res == ERR_PTR(-ECHILD) && try_to_unlazy(nd)) res = get(link->dentry, inode, &last->done); } else { res = get(link->dentry, inode, &last->done); } if (!res) goto all_done; if (IS_ERR(res)) return res; } if (*res == '/') { error = nd_jump_root(nd); if (unlikely(error)) return ERR_PTR(error); while (unlikely(*++res == '/')) ; } if (*res) return res; all_done: // pure jump put_link(nd); return NULL; } /* * Do we need to follow links? We _really_ want to be able * to do this check without having to look at inode->i_op, * so we keep a cache of "no, this doesn't need follow_link" * for the common case. * * NOTE: dentry must be what nd->next_seq had been sampled from. 
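 *
 * For orientation, the flag combinations used by callers of step_into():
 *
 *	walk_component(nd, WALK_MORE)        - intermediate path component
 *	walk_component(nd, WALK_TRAILING)    - last component (lookup_last())
 *	step_into(nd, WALK_NOFOLLOW, ...)    - ".." (handle_dots()) and
 *	                                       LOOKUP_DOWN/LOOKUP_MOUNTPOINT
 *	                                       handling; never follows links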
*/ static const char *step_into(struct nameidata *nd, int flags, struct dentry *dentry) { struct path path; struct inode *inode; int err = handle_mounts(nd, dentry, &path); if (err < 0) return ERR_PTR(err); inode = path.dentry->d_inode; if (likely(!d_is_symlink(path.dentry)) || ((flags & WALK_TRAILING) && !(nd->flags & LOOKUP_FOLLOW)) || (flags & WALK_NOFOLLOW)) { /* not a symlink or should not follow */ if (nd->flags & LOOKUP_RCU) { if (read_seqcount_retry(&path.dentry->d_seq, nd->next_seq)) return ERR_PTR(-ECHILD); if (unlikely(!inode)) return ERR_PTR(-ENOENT); } else { dput(nd->path.dentry); if (nd->path.mnt != path.mnt) mntput(nd->path.mnt); } nd->path = path; nd->inode = inode; nd->seq = nd->next_seq; return NULL; } if (nd->flags & LOOKUP_RCU) { /* make sure that d_is_symlink above matches inode */ if (read_seqcount_retry(&path.dentry->d_seq, nd->next_seq)) return ERR_PTR(-ECHILD); } else { if (path.mnt == nd->path.mnt) mntget(path.mnt); } return pick_link(nd, &path, inode, flags); } static struct dentry *follow_dotdot_rcu(struct nameidata *nd) { struct dentry *parent, *old; if (path_equal(&nd->path, &nd->root)) goto in_root; if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) { struct path path; unsigned seq; if (!choose_mountpoint_rcu(real_mount(nd->path.mnt), &nd->root, &path, &seq)) goto in_root; if (unlikely(nd->flags & LOOKUP_NO_XDEV)) return ERR_PTR(-ECHILD); nd->path = path; nd->inode = path.dentry->d_inode; nd->seq = seq; // makes sure that non-RCU pathwalk could reach this state if (read_seqretry(&mount_lock, nd->m_seq)) return ERR_PTR(-ECHILD); /* we know that mountpoint was pinned */ } old = nd->path.dentry; parent = old->d_parent; nd->next_seq = read_seqcount_begin(&parent->d_seq); // makes sure that non-RCU pathwalk could reach this state if (read_seqcount_retry(&old->d_seq, nd->seq)) return ERR_PTR(-ECHILD); if (unlikely(!path_connected(nd->path.mnt, parent))) return ERR_PTR(-ECHILD); return parent; in_root: if (read_seqretry(&mount_lock, nd->m_seq)) return ERR_PTR(-ECHILD); if (unlikely(nd->flags & LOOKUP_BENEATH)) return ERR_PTR(-ECHILD); nd->next_seq = nd->seq; return nd->path.dentry; } static struct dentry *follow_dotdot(struct nameidata *nd) { struct dentry *parent; if (path_equal(&nd->path, &nd->root)) goto in_root; if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) { struct path path; if (!choose_mountpoint(real_mount(nd->path.mnt), &nd->root, &path)) goto in_root; path_put(&nd->path); nd->path = path; nd->inode = path.dentry->d_inode; if (unlikely(nd->flags & LOOKUP_NO_XDEV)) return ERR_PTR(-EXDEV); } /* rare case of legitimate dget_parent()... */ parent = dget_parent(nd->path.dentry); if (unlikely(!path_connected(nd->path.mnt, parent))) { dput(parent); return ERR_PTR(-ENOENT); } return parent; in_root: if (unlikely(nd->flags & LOOKUP_BENEATH)) return ERR_PTR(-EXDEV); return dget(nd->path.dentry); } static const char *handle_dots(struct nameidata *nd, int type) { if (type == LAST_DOTDOT) { const char *error = NULL; struct dentry *parent; if (!nd->root.mnt) { error = ERR_PTR(set_root(nd)); if (error) return error; } if (nd->flags & LOOKUP_RCU) parent = follow_dotdot_rcu(nd); else parent = follow_dotdot(nd); if (IS_ERR(parent)) return ERR_CAST(parent); error = step_into(nd, WALK_NOFOLLOW, parent); if (unlikely(error)) return error; if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) { /* * If there was a racing rename or mount along our * path, then we can't be sure that ".." hasn't jumped * above nd->root (and so userspace should retry or use * some fallback). 
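 *
 * From userspace this shows up as a transient EAGAIN on scoped lookups;
 * a sketch (error handling elided, not taken from this file):
 *
 *	struct open_how how = { .flags = O_RDONLY, .resolve = RESOLVE_IN_ROOT };
 *	fd = openat2(dirfd, "a/../b", &how, sizeof(how));
 *	// may fail with EAGAIN if a rename or mount raced with the ".."
 *	// step; the caller is expected to retry or use a fallback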
*/ smp_rmb(); if (__read_seqcount_retry(&mount_lock.seqcount, nd->m_seq)) return ERR_PTR(-EAGAIN); if (__read_seqcount_retry(&rename_lock.seqcount, nd->r_seq)) return ERR_PTR(-EAGAIN); } } return NULL; } static const char *walk_component(struct nameidata *nd, int flags) { struct dentry *dentry; /* * "." and ".." are special - ".." especially so because it has * to be able to know about the current root directory and * parent relationships. */ if (unlikely(nd->last_type != LAST_NORM)) { if (!(flags & WALK_MORE) && nd->depth) put_link(nd); return handle_dots(nd, nd->last_type); } dentry = lookup_fast(nd); if (IS_ERR(dentry)) return ERR_CAST(dentry); if (unlikely(!dentry)) { dentry = lookup_slow(&nd->last, nd->path.dentry, nd->flags); if (IS_ERR(dentry)) return ERR_CAST(dentry); } if (!(flags & WALK_MORE) && nd->depth) put_link(nd); return step_into(nd, flags, dentry); } /* * We can do the critical dentry name comparison and hashing * operations one word at a time, but we are limited to: * * - Architectures with fast unaligned word accesses. We could * do a "get_unaligned()" if this helps and is sufficiently * fast. * * - non-CONFIG_DEBUG_PAGEALLOC configurations (so that we * do not trap on the (extremely unlikely) case of a page * crossing operation. * * - Furthermore, we need an efficient 64-bit compile for the * 64-bit case in order to generate the "number of bytes in * the final mask". Again, that could be replaced with a * efficient population count instruction or similar. */ #ifdef CONFIG_DCACHE_WORD_ACCESS #include <asm/word-at-a-time.h> #ifdef HASH_MIX /* Architecture provides HASH_MIX and fold_hash() in <asm/hash.h> */ #elif defined(CONFIG_64BIT) /* * Register pressure in the mixing function is an issue, particularly * on 32-bit x86, but almost any function requires one state value and * one temporary. Instead, use a function designed for two state values * and no temporaries. * * This function cannot create a collision in only two iterations, so * we have two iterations to achieve avalanche. In those two iterations, * we have six layers of mixing, which is enough to spread one bit's * influence out to 2^6 = 64 state bits. * * Rotate constants are scored by considering either 64 one-bit input * deltas or 64*63/2 = 2016 two-bit input deltas, and finding the * probability of that delta causing a change to each of the 128 output * bits, using a sample of random initial states. * * The Shannon entropy of the computed probabilities is then summed * to produce a score. Ideally, any input change has a 50% chance of * toggling any given output bit. * * Mixing scores (in bits) for (12,45): * Input delta: 1-bit 2-bit * 1 round: 713.3 42542.6 * 2 rounds: 2753.7 140389.8 * 3 rounds: 5954.1 233458.2 * 4 rounds: 7862.6 256672.2 * Perfect: 8192 258048 * (64*128) (64*63/2 * 128) */ #define HASH_MIX(x, y, a) \ ( x ^= (a), \ y ^= x, x = rol64(x,12),\ x += y, y = rol64(y,45),\ y *= 9 ) /* * Fold two longs into one 32-bit hash value. This must be fast, but * latency isn't quite as critical, as there is a fair bit of additional * work done before the hash value is used. 
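 *
 * The (x, y) pair folded here is the running state built by HASH_MIX();
 * condensed from full_name_hash() below, the overall pattern is:
 *
 *	unsigned long x = 0, y = (unsigned long)salt;
 *	for each full word a of the name:
 *		HASH_MIX(x, y, a);
 *	x ^= last_partial_word & bytemask_from_count(len_left);
 *	return fold_hash(x, y);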
*/ static inline unsigned int fold_hash(unsigned long x, unsigned long y) { y ^= x * GOLDEN_RATIO_64; y *= GOLDEN_RATIO_64; return y >> 32; } #else /* 32-bit case */ /* * Mixing scores (in bits) for (7,20): * Input delta: 1-bit 2-bit * 1 round: 330.3 9201.6 * 2 rounds: 1246.4 25475.4 * 3 rounds: 1907.1 31295.1 * 4 rounds: 2042.3 31718.6 * Perfect: 2048 31744 * (32*64) (32*31/2 * 64) */ #define HASH_MIX(x, y, a) \ ( x ^= (a), \ y ^= x, x = rol32(x, 7),\ x += y, y = rol32(y,20),\ y *= 9 ) static inline unsigned int fold_hash(unsigned long x, unsigned long y) { /* Use arch-optimized multiply if one exists */ return __hash_32(y ^ __hash_32(x)); } #endif /* * Return the hash of a string of known length. This is carfully * designed to match hash_name(), which is the more critical function. * In particular, we must end by hashing a final word containing 0..7 * payload bytes, to match the way that hash_name() iterates until it * finds the delimiter after the name. */ unsigned int full_name_hash(const void *salt, const char *name, unsigned int len) { unsigned long a, x = 0, y = (unsigned long)salt; for (;;) { if (!len) goto done; a = load_unaligned_zeropad(name); if (len < sizeof(unsigned long)) break; HASH_MIX(x, y, a); name += sizeof(unsigned long); len -= sizeof(unsigned long); } x ^= a & bytemask_from_count(len); done: return fold_hash(x, y); } EXPORT_SYMBOL(full_name_hash); /* Return the "hash_len" (hash and length) of a null-terminated string */ u64 hashlen_string(const void *salt, const char *name) { unsigned long a = 0, x = 0, y = (unsigned long)salt; unsigned long adata, mask, len; const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS; len = 0; goto inside; do { HASH_MIX(x, y, a); len += sizeof(unsigned long); inside: a = load_unaligned_zeropad(name+len); } while (!has_zero(a, &adata, &constants)); adata = prep_zero_mask(a, adata, &constants); mask = create_zero_mask(adata); x ^= a & zero_bytemask(mask); return hashlen_create(fold_hash(x, y), len + find_zero(mask)); } EXPORT_SYMBOL(hashlen_string); /* * Calculate the length and hash of the path component, and * return the length as the result. */ static inline const char *hash_name(struct nameidata *nd, const char *name, unsigned long *lastword) { unsigned long a, b, x, y = (unsigned long)nd->path.dentry; unsigned long adata, bdata, mask, len; const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS; /* * The first iteration is special, because it can result in * '.' and '..' and has no mixing other than the final fold. 
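 *
 * For a component that fits in a single word, that word (zero-masked past
 * the terminating '/' or NUL) is handed back through *lastword, which is
 * how link_path_walk() spots "." and ".." without a byte-by-byte compare:
 *
 *	"."  -> *lastword == LAST_WORD_IS_DOT    (0x2e on little-endian)
 *	".." -> *lastword == LAST_WORD_IS_DOTDOT (0x2e2e on little-endian)
 *
 * Multi-word components report *lastword == 0, so they can never be
 * mistaken for either.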
*/ a = load_unaligned_zeropad(name); b = a ^ REPEAT_BYTE('/'); if (has_zero(a, &adata, &constants) | has_zero(b, &bdata, &constants)) { adata = prep_zero_mask(a, adata, &constants); bdata = prep_zero_mask(b, bdata, &constants); mask = create_zero_mask(adata | bdata); a &= zero_bytemask(mask); *lastword = a; len = find_zero(mask); nd->last.hash = fold_hash(a, y); nd->last.len = len; return name + len; } len = 0; x = 0; do { HASH_MIX(x, y, a); len += sizeof(unsigned long); a = load_unaligned_zeropad(name+len); b = a ^ REPEAT_BYTE('/'); } while (!(has_zero(a, &adata, &constants) | has_zero(b, &bdata, &constants))); adata = prep_zero_mask(a, adata, &constants); bdata = prep_zero_mask(b, bdata, &constants); mask = create_zero_mask(adata | bdata); a &= zero_bytemask(mask); x ^= a; len += find_zero(mask); *lastword = 0; // Multi-word components cannot be DOT or DOTDOT nd->last.hash = fold_hash(x, y); nd->last.len = len; return name + len; } /* * Note that the 'last' word is always zero-masked, but * was loaded as a possibly big-endian word. */ #ifdef __BIG_ENDIAN #define LAST_WORD_IS_DOT (0x2eul << (BITS_PER_LONG-8)) #define LAST_WORD_IS_DOTDOT (0x2e2eul << (BITS_PER_LONG-16)) #endif #else /* !CONFIG_DCACHE_WORD_ACCESS: Slow, byte-at-a-time version */ /* Return the hash of a string of known length */ unsigned int full_name_hash(const void *salt, const char *name, unsigned int len) { unsigned long hash = init_name_hash(salt); while (len--) hash = partial_name_hash((unsigned char)*name++, hash); return end_name_hash(hash); } EXPORT_SYMBOL(full_name_hash); /* Return the "hash_len" (hash and length) of a null-terminated string */ u64 hashlen_string(const void *salt, const char *name) { unsigned long hash = init_name_hash(salt); unsigned long len = 0, c; c = (unsigned char)*name; while (c) { len++; hash = partial_name_hash(c, hash); c = (unsigned char)name[len]; } return hashlen_create(end_name_hash(hash), len); } EXPORT_SYMBOL(hashlen_string); /* * We know there's a real path component here of at least * one character. */ static inline const char *hash_name(struct nameidata *nd, const char *name, unsigned long *lastword) { unsigned long hash = init_name_hash(nd->path.dentry); unsigned long len = 0, c, last = 0; c = (unsigned char)*name; do { last = (last << 8) + c; len++; hash = partial_name_hash(c, hash); c = (unsigned char)name[len]; } while (c && c != '/'); // This is reliable for DOT or DOTDOT, since the component // cannot contain NUL characters - top bits being zero means // we cannot have had any other pathnames. *lastword = last; nd->last.hash = end_name_hash(hash); nd->last.len = len; return name + len; } #endif #ifndef LAST_WORD_IS_DOT #define LAST_WORD_IS_DOT 0x2e #define LAST_WORD_IS_DOTDOT 0x2e2e #endif /* * Name resolution. * This is the basic name resolution function, turning a pathname into * the final dentry. We expect 'base' to be positive and a directory. * * Returns 0 and nd will have valid dentry and mnt on success. * Returns error and drops reference to input namei data on failure. */ static int link_path_walk(const char *name, struct nameidata *nd) { int depth = 0; // depth <= nd->depth int err; nd->last_type = LAST_ROOT; nd->flags |= LOOKUP_PARENT; if (IS_ERR(name)) return PTR_ERR(name); while (*name=='/') name++; if (!*name) { nd->dir_mode = 0; // short-circuit the 'hardening' idiocy return 0; } /* At this point we know we have a real path component. 
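 *
 * Informal trace for link_path_walk("a/b/c", nd): the first two iterations
 * hash "a" and "b" into nd->last and call walk_component(nd, WALK_MORE);
 * the last iteration classifies "c" as LAST_NORM and, at depth 0, records
 * nd->dir_mode and nd->dir_vfsuid, clears LOOKUP_PARENT and returns 0, so
 * the caller is left with nd->path at "a/b" and "c" still in nd->last.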
*/ for(;;) { struct mnt_idmap *idmap; const char *link; unsigned long lastword; idmap = mnt_idmap(nd->path.mnt); err = may_lookup(idmap, nd); if (err) return err; nd->last.name = name; name = hash_name(nd, name, &lastword); switch(lastword) { case LAST_WORD_IS_DOTDOT: nd->last_type = LAST_DOTDOT; nd->state |= ND_JUMPED; break; case LAST_WORD_IS_DOT: nd->last_type = LAST_DOT; break; default: nd->last_type = LAST_NORM; nd->state &= ~ND_JUMPED; struct dentry *parent = nd->path.dentry; if (unlikely(parent->d_flags & DCACHE_OP_HASH)) { err = parent->d_op->d_hash(parent, &nd->last); if (err < 0) return err; } } if (!*name) goto OK; /* * If it wasn't NUL, we know it was '/'. Skip that * slash, and continue until no more slashes. */ do { name++; } while (unlikely(*name == '/')); if (unlikely(!*name)) { OK: /* pathname or trailing symlink, done */ if (!depth) { nd->dir_vfsuid = i_uid_into_vfsuid(idmap, nd->inode); nd->dir_mode = nd->inode->i_mode; nd->flags &= ~LOOKUP_PARENT; return 0; } /* last component of nested symlink */ name = nd->stack[--depth].name; link = walk_component(nd, 0); } else { /* not the last component */ link = walk_component(nd, WALK_MORE); } if (unlikely(link)) { if (IS_ERR(link)) return PTR_ERR(link); /* a symlink to follow */ nd->stack[depth++].name = name; name = link; continue; } if (unlikely(!d_can_lookup(nd->path.dentry))) { if (nd->flags & LOOKUP_RCU) { if (!try_to_unlazy(nd)) return -ECHILD; } return -ENOTDIR; } } } /* must be paired with terminate_walk() */ static const char *path_init(struct nameidata *nd, unsigned flags) { int error; const char *s = nd->pathname; /* LOOKUP_CACHED requires RCU, ask caller to retry */ if ((flags & (LOOKUP_RCU | LOOKUP_CACHED)) == LOOKUP_CACHED) return ERR_PTR(-EAGAIN); if (!*s) flags &= ~LOOKUP_RCU; if (flags & LOOKUP_RCU) rcu_read_lock(); else nd->seq = nd->next_seq = 0; nd->flags = flags; nd->state |= ND_JUMPED; nd->m_seq = __read_seqcount_begin(&mount_lock.seqcount); nd->r_seq = __read_seqcount_begin(&rename_lock.seqcount); smp_rmb(); if (nd->state & ND_ROOT_PRESET) { struct dentry *root = nd->root.dentry; struct inode *inode = root->d_inode; if (*s && unlikely(!d_can_lookup(root))) return ERR_PTR(-ENOTDIR); nd->path = nd->root; nd->inode = inode; if (flags & LOOKUP_RCU) { nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); nd->root_seq = nd->seq; } else { path_get(&nd->path); } return s; } nd->root.mnt = NULL; /* Absolute pathname -- fetch the root (LOOKUP_IN_ROOT uses nd->dfd). */ if (*s == '/' && !(flags & LOOKUP_IN_ROOT)) { error = nd_jump_root(nd); if (unlikely(error)) return ERR_PTR(error); return s; } /* Relative pathname -- get the starting-point it is relative to. 
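 *
 * i.e. (summary of the two branches below):
 *
 *	nd->dfd == AT_FDCWD  - start from current->fs->pwd, sampled under
 *	                       fs->seq in RCU mode, via get_fs_pwd() otherwise
 *	nd->dfd == open fd   - start from that file's f_path, after checking
 *	                       that it is a directory whenever there is still
 *	                       a name left to walk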
*/ if (nd->dfd == AT_FDCWD) { if (flags & LOOKUP_RCU) { struct fs_struct *fs = current->fs; unsigned seq; do { seq = read_seqcount_begin(&fs->seq); nd->path = fs->pwd; nd->inode = nd->path.dentry->d_inode; nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); } while (read_seqcount_retry(&fs->seq, seq)); } else { get_fs_pwd(current->fs, &nd->path); nd->inode = nd->path.dentry->d_inode; } } else { /* Caller must check execute permissions on the starting path component */ CLASS(fd_raw, f)(nd->dfd); struct dentry *dentry; if (fd_empty(f)) return ERR_PTR(-EBADF); if (flags & LOOKUP_LINKAT_EMPTY) { if (fd_file(f)->f_cred != current_cred() && !ns_capable(fd_file(f)->f_cred->user_ns, CAP_DAC_READ_SEARCH)) return ERR_PTR(-ENOENT); } dentry = fd_file(f)->f_path.dentry; if (*s && unlikely(!d_can_lookup(dentry))) return ERR_PTR(-ENOTDIR); nd->path = fd_file(f)->f_path; if (flags & LOOKUP_RCU) { nd->inode = nd->path.dentry->d_inode; nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); } else { path_get(&nd->path); nd->inode = nd->path.dentry->d_inode; } } /* For scoped-lookups we need to set the root to the dirfd as well. */ if (flags & LOOKUP_IS_SCOPED) { nd->root = nd->path; if (flags & LOOKUP_RCU) { nd->root_seq = nd->seq; } else { path_get(&nd->root); nd->state |= ND_ROOT_GRABBED; } } return s; } static inline const char *lookup_last(struct nameidata *nd) { if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len]) nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; return walk_component(nd, WALK_TRAILING); } static int handle_lookup_down(struct nameidata *nd) { if (!(nd->flags & LOOKUP_RCU)) dget(nd->path.dentry); nd->next_seq = nd->seq; return PTR_ERR(step_into(nd, WALK_NOFOLLOW, nd->path.dentry)); } /* Returns 0 and nd will be valid on success; Returns error, otherwise. */ static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path) { const char *s = path_init(nd, flags); int err; if (unlikely(flags & LOOKUP_DOWN) && !IS_ERR(s)) { err = handle_lookup_down(nd); if (unlikely(err < 0)) s = ERR_PTR(err); } while (!(err = link_path_walk(s, nd)) && (s = lookup_last(nd)) != NULL) ; if (!err && unlikely(nd->flags & LOOKUP_MOUNTPOINT)) { err = handle_lookup_down(nd); nd->state &= ~ND_JUMPED; // no d_weak_revalidate(), please... } if (!err) err = complete_walk(nd); if (!err && nd->flags & LOOKUP_DIRECTORY) if (!d_can_lookup(nd->path.dentry)) err = -ENOTDIR; if (!err) { *path = nd->path; nd->path.mnt = NULL; nd->path.dentry = NULL; } terminate_walk(nd); return err; } int filename_lookup(int dfd, struct filename *name, unsigned flags, struct path *path, struct path *root) { int retval; struct nameidata nd; if (IS_ERR(name)) return PTR_ERR(name); set_nameidata(&nd, dfd, name, root); retval = path_lookupat(&nd, flags | LOOKUP_RCU, path); if (unlikely(retval == -ECHILD)) retval = path_lookupat(&nd, flags, path); if (unlikely(retval == -ESTALE)) retval = path_lookupat(&nd, flags | LOOKUP_REVAL, path); if (likely(!retval)) audit_inode(name, path->dentry, flags & LOOKUP_MOUNTPOINT ? AUDIT_INODE_NOEVAL : 0); restore_nameidata(); return retval; } /* Returns 0 and nd will be valid on success; Returns error, otherwise. 
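 *
 * Callers use the usual three-pass pattern, as __filename_parentat()
 * below does:
 *
 *	retval = path_parentat(&nd, flags | LOOKUP_RCU, parent);
 *	if (retval == -ECHILD)		// RCU walk was not possible
 *		retval = path_parentat(&nd, flags, parent);
 *	if (retval == -ESTALE)		// stale result, revalidate harder
 *		retval = path_parentat(&nd, flags | LOOKUP_REVAL, parent);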
*/ static int path_parentat(struct nameidata *nd, unsigned flags, struct path *parent) { const char *s = path_init(nd, flags); int err = link_path_walk(s, nd); if (!err) err = complete_walk(nd); if (!err) { *parent = nd->path; nd->path.mnt = NULL; nd->path.dentry = NULL; } terminate_walk(nd); return err; } /* Note: this does not consume "name" */ static int __filename_parentat(int dfd, struct filename *name, unsigned int flags, struct path *parent, struct qstr *last, int *type, const struct path *root) { int retval; struct nameidata nd; if (IS_ERR(name)) return PTR_ERR(name); set_nameidata(&nd, dfd, name, root); retval = path_parentat(&nd, flags | LOOKUP_RCU, parent); if (unlikely(retval == -ECHILD)) retval = path_parentat(&nd, flags, parent); if (unlikely(retval == -ESTALE)) retval = path_parentat(&nd, flags | LOOKUP_REVAL, parent); if (likely(!retval)) { *last = nd.last; *type = nd.last_type; audit_inode(name, parent->dentry, AUDIT_INODE_PARENT); } restore_nameidata(); return retval; } static int filename_parentat(int dfd, struct filename *name, unsigned int flags, struct path *parent, struct qstr *last, int *type) { return __filename_parentat(dfd, name, flags, parent, last, type, NULL); } /* does lookup, returns the object with parent locked */ static struct dentry *__kern_path_locked(int dfd, struct filename *name, struct path *path) { struct dentry *d; struct qstr last; int type, error; error = filename_parentat(dfd, name, 0, path, &last, &type); if (error) return ERR_PTR(error); if (unlikely(type != LAST_NORM)) { path_put(path); return ERR_PTR(-EINVAL); } inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT); d = lookup_one_qstr_excl(&last, path->dentry, 0); if (IS_ERR(d)) { inode_unlock(path->dentry->d_inode); path_put(path); } return d; } struct dentry *kern_path_locked(const char *name, struct path *path) { struct filename *filename = getname_kernel(name); struct dentry *res = __kern_path_locked(AT_FDCWD, filename, path); putname(filename); return res; } struct dentry *user_path_locked_at(int dfd, const char __user *name, struct path *path) { struct filename *filename = getname(name); struct dentry *res = __kern_path_locked(dfd, filename, path); putname(filename); return res; } EXPORT_SYMBOL(user_path_locked_at); int kern_path(const char *name, unsigned int flags, struct path *path) { struct filename *filename = getname_kernel(name); int ret = filename_lookup(AT_FDCWD, filename, flags, path, NULL); putname(filename); return ret; } EXPORT_SYMBOL(kern_path); /** * vfs_path_parent_lookup - lookup a parent path relative to a dentry-vfsmount pair * @filename: filename structure * @flags: lookup flags * @parent: pointer to struct path to fill * @last: last component * @type: type of the last component * @root: pointer to struct path of the base directory */ int vfs_path_parent_lookup(struct filename *filename, unsigned int flags, struct path *parent, struct qstr *last, int *type, const struct path *root) { return __filename_parentat(AT_FDCWD, filename, flags, parent, last, type, root); } EXPORT_SYMBOL(vfs_path_parent_lookup); /** * vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair * @dentry: pointer to dentry of the base directory * @mnt: pointer to vfs mount of the base directory * @name: pointer to file name * @flags: lookup flags * @path: pointer to struct path to fill */ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, const char *name, unsigned int flags, struct path *path) { struct filename *filename; struct path root = {.mnt = mnt, .dentry 
= dentry}; int ret; filename = getname_kernel(name); /* the first argument of filename_lookup() is ignored with root */ ret = filename_lookup(AT_FDCWD, filename, flags, path, &root); putname(filename); return ret; } EXPORT_SYMBOL(vfs_path_lookup); static int lookup_one_common(struct mnt_idmap *idmap, const char *name, struct dentry *base, int len, struct qstr *this) { this->name = name; this->len = len; this->hash = full_name_hash(base, name, len); if (!len) return -EACCES; if (is_dot_dotdot(name, len)) return -EACCES; while (len--) { unsigned int c = *(const unsigned char *)name++; if (c == '/' || c == '\0') return -EACCES; } /* * See if the low-level filesystem might want * to use its own hash.. */ if (base->d_flags & DCACHE_OP_HASH) { int err = base->d_op->d_hash(base, this); if (err < 0) return err; } return inode_permission(idmap, base->d_inode, MAY_EXEC); } /** * try_lookup_one_len - filesystem helper to lookup single pathname component * @name: pathname component to lookup * @base: base directory to lookup from * @len: maximum length @len should be interpreted to * * Look up a dentry by name in the dcache, returning NULL if it does not * currently exist. The function does not try to create a dentry. * * Note that this routine is purely a helper for filesystem usage and should * not be called by generic code. * * The caller must hold base->i_mutex. */ struct dentry *try_lookup_one_len(const char *name, struct dentry *base, int len) { struct qstr this; int err; WARN_ON_ONCE(!inode_is_locked(base->d_inode)); err = lookup_one_common(&nop_mnt_idmap, name, base, len, &this); if (err) return ERR_PTR(err); return lookup_dcache(&this, base, 0); } EXPORT_SYMBOL(try_lookup_one_len); /** * lookup_one_len - filesystem helper to lookup single pathname component * @name: pathname component to lookup * @base: base directory to lookup from * @len: maximum length @len should be interpreted to * * Note that this routine is purely a helper for filesystem usage and should * not be called by generic code. * * The caller must hold base->i_mutex. */ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) { struct dentry *dentry; struct qstr this; int err; WARN_ON_ONCE(!inode_is_locked(base->d_inode)); err = lookup_one_common(&nop_mnt_idmap, name, base, len, &this); if (err) return ERR_PTR(err); dentry = lookup_dcache(&this, base, 0); return dentry ? dentry : __lookup_slow(&this, base, 0); } EXPORT_SYMBOL(lookup_one_len); /** * lookup_one - filesystem helper to lookup single pathname component * @idmap: idmap of the mount the lookup is performed from * @name: pathname component to lookup * @base: base directory to lookup from * @len: maximum length @len should be interpreted to * * Note that this routine is purely a helper for filesystem usage and should * not be called by generic code. * * The caller must hold base->i_mutex. */ struct dentry *lookup_one(struct mnt_idmap *idmap, const char *name, struct dentry *base, int len) { struct dentry *dentry; struct qstr this; int err; WARN_ON_ONCE(!inode_is_locked(base->d_inode)); err = lookup_one_common(idmap, name, base, len, &this); if (err) return ERR_PTR(err); dentry = lookup_dcache(&this, base, 0); return dentry ? 
dentry : __lookup_slow(&this, base, 0); } EXPORT_SYMBOL(lookup_one); /** * lookup_one_unlocked - filesystem helper to lookup single pathname component * @idmap: idmap of the mount the lookup is performed from * @name: pathname component to lookup * @base: base directory to lookup from * @len: maximum length @len should be interpreted to * * Note that this routine is purely a helper for filesystem usage and should * not be called by generic code. * * Unlike lookup_one_len, it should be called without the parent * i_mutex held, and will take the i_mutex itself if necessary. */ struct dentry *lookup_one_unlocked(struct mnt_idmap *idmap, const char *name, struct dentry *base, int len) { struct qstr this; int err; struct dentry *ret; err = lookup_one_common(idmap, name, base, len, &this); if (err) return ERR_PTR(err); ret = lookup_dcache(&this, base, 0); if (!ret) ret = lookup_slow(&this, base, 0); return ret; } EXPORT_SYMBOL(lookup_one_unlocked); /** * lookup_one_positive_unlocked - filesystem helper to lookup single * pathname component * @idmap: idmap of the mount the lookup is performed from * @name: pathname component to lookup * @base: base directory to lookup from * @len: maximum length @len should be interpreted to * * This helper will yield ERR_PTR(-ENOENT) on negatives. The helper returns * known positive or ERR_PTR(). This is what most of the users want. * * Note that pinned negative with unlocked parent _can_ become positive at any * time, so callers of lookup_one_unlocked() need to be very careful; pinned * positives have >d_inode stable, so this one avoids such problems. * * Note that this routine is purely a helper for filesystem usage and should * not be called by generic code. * * The helper should be called without i_mutex held. */ struct dentry *lookup_one_positive_unlocked(struct mnt_idmap *idmap, const char *name, struct dentry *base, int len) { struct dentry *ret = lookup_one_unlocked(idmap, name, base, len); if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) { dput(ret); ret = ERR_PTR(-ENOENT); } return ret; } EXPORT_SYMBOL(lookup_one_positive_unlocked); /** * lookup_one_len_unlocked - filesystem helper to lookup single pathname component * @name: pathname component to lookup * @base: base directory to lookup from * @len: maximum length @len should be interpreted to * * Note that this routine is purely a helper for filesystem usage and should * not be called by generic code. * * Unlike lookup_one_len, it should be called without the parent * i_mutex held, and will take the i_mutex itself if necessary. */ struct dentry *lookup_one_len_unlocked(const char *name, struct dentry *base, int len) { return lookup_one_unlocked(&nop_mnt_idmap, name, base, len); } EXPORT_SYMBOL(lookup_one_len_unlocked); /* * Like lookup_one_len_unlocked(), except that it yields ERR_PTR(-ENOENT) * on negatives. Returns known positive or ERR_PTR(); that's what * most of the users want. Note that pinned negative with unlocked parent * _can_ become positive at any time, so callers of lookup_one_len_unlocked() * need to be very careful; pinned positives have ->d_inode stable, so * this one avoids such problems. */ struct dentry *lookup_positive_unlocked(const char *name, struct dentry *base, int len) { return lookup_one_positive_unlocked(&nop_mnt_idmap, name, base, len); } EXPORT_SYMBOL(lookup_positive_unlocked); #ifdef CONFIG_UNIX98_PTYS int path_pts(struct path *path) { /* Find something mounted on "pts" in the same directory as * the input path. 
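 *
 * E.g. given the path of /dev/ptmx this yields whatever is mounted on
 * /dev/pts (following down onto the mounted filesystem), or -ENOENT if
 * no "pts" neighbour can be found.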
*/ struct dentry *parent = dget_parent(path->dentry); struct dentry *child; struct qstr this = QSTR_INIT("pts", 3); if (unlikely(!path_connected(path->mnt, parent))) { dput(parent); return -ENOENT; } dput(path->dentry); path->dentry = parent; child = d_hash_and_lookup(parent, &this); if (IS_ERR_OR_NULL(child)) return -ENOENT; path->dentry = child; dput(parent); follow_down(path, 0); return 0; } #endif int user_path_at(int dfd, const char __user *name, unsigned flags, struct path *path) { struct filename *filename = getname_flags(name, flags); int ret = filename_lookup(dfd, filename, flags, path, NULL); putname(filename); return ret; } EXPORT_SYMBOL(user_path_at); int __check_sticky(struct mnt_idmap *idmap, struct inode *dir, struct inode *inode) { kuid_t fsuid = current_fsuid(); if (vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, inode), fsuid)) return 0; if (vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, dir), fsuid)) return 0; return !capable_wrt_inode_uidgid(idmap, inode, CAP_FOWNER); } EXPORT_SYMBOL(__check_sticky); /* * Check whether we can remove a link victim from directory dir, check * whether the type of victim is right. * 1. We can't do it if dir is read-only (done in permission()) * 2. We should have write and exec permissions on dir * 3. We can't remove anything from append-only dir * 4. We can't do anything with immutable dir (done in permission()) * 5. If the sticky bit on dir is set we should either * a. be owner of dir, or * b. be owner of victim, or * c. have CAP_FOWNER capability * 6. If the victim is append-only or immutable we can't do antyhing with * links pointing to it. * 7. If the victim has an unknown uid or gid we can't change the inode. * 8. If we were asked to remove a directory and victim isn't one - ENOTDIR. * 9. If we were asked to remove a non-directory and victim isn't one - EISDIR. * 10. We can't remove a root or mountpoint. * 11. We don't allow removal of NFS sillyrenamed files; it's handled by * nfs_async_unlink(). */ static int may_delete(struct mnt_idmap *idmap, struct inode *dir, struct dentry *victim, bool isdir) { struct inode *inode = d_backing_inode(victim); int error; if (d_is_negative(victim)) return -ENOENT; BUG_ON(!inode); BUG_ON(victim->d_parent->d_inode != dir); /* Inode writeback is not safe when the uid or gid are invalid. */ if (!vfsuid_valid(i_uid_into_vfsuid(idmap, inode)) || !vfsgid_valid(i_gid_into_vfsgid(idmap, inode))) return -EOVERFLOW; audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE); error = inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC); if (error) return error; if (IS_APPEND(dir)) return -EPERM; if (check_sticky(idmap, dir, inode) || IS_APPEND(inode) || IS_IMMUTABLE(inode) || IS_SWAPFILE(inode) || HAS_UNMAPPED_ID(idmap, inode)) return -EPERM; if (isdir) { if (!d_is_dir(victim)) return -ENOTDIR; if (IS_ROOT(victim)) return -EBUSY; } else if (d_is_dir(victim)) return -EISDIR; if (IS_DEADDIR(dir)) return -ENOENT; if (victim->d_flags & DCACHE_NFSFS_RENAMED) return -EBUSY; return 0; } /* Check whether we can create an object with dentry child in directory * dir. * 1. We can't do it if child already exists (open has special treatment for * this case, but since we are inlined it's OK) * 2. We can't do it if dir is read-only (done in permission()) * 3. We can't do it if the fs can't represent the fsuid or fsgid. * 4. We should have write and exec permissions on dir * 5. 
We can't do it if dir is immutable (done in permission()) */ static inline int may_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *child) { audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE); if (child->d_inode) return -EEXIST; if (IS_DEADDIR(dir)) return -ENOENT; if (!fsuidgid_has_mapping(dir->i_sb, idmap)) return -EOVERFLOW; return inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC); } // p1 != p2, both are on the same filesystem, ->s_vfs_rename_mutex is held static struct dentry *lock_two_directories(struct dentry *p1, struct dentry *p2) { struct dentry *p = p1, *q = p2, *r; while ((r = p->d_parent) != p2 && r != p) p = r; if (r == p2) { // p is a child of p2 and an ancestor of p1 or p1 itself inode_lock_nested(p2->d_inode, I_MUTEX_PARENT); inode_lock_nested(p1->d_inode, I_MUTEX_PARENT2); return p; } // p is the root of connected component that contains p1 // p2 does not occur on the path from p to p1 while ((r = q->d_parent) != p1 && r != p && r != q) q = r; if (r == p1) { // q is a child of p1 and an ancestor of p2 or p2 itself inode_lock_nested(p1->d_inode, I_MUTEX_PARENT); inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2); return q; } else if (likely(r == p)) { // both p2 and p1 are descendents of p inode_lock_nested(p1->d_inode, I_MUTEX_PARENT); inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2); return NULL; } else { // no common ancestor at the time we'd been called mutex_unlock(&p1->d_sb->s_vfs_rename_mutex); return ERR_PTR(-EXDEV); } } /* * p1 and p2 should be directories on the same fs. */ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2) { if (p1 == p2) { inode_lock_nested(p1->d_inode, I_MUTEX_PARENT); return NULL; } mutex_lock(&p1->d_sb->s_vfs_rename_mutex); return lock_two_directories(p1, p2); } EXPORT_SYMBOL(lock_rename); /* * c1 and p2 should be on the same fs. */ struct dentry *lock_rename_child(struct dentry *c1, struct dentry *p2) { if (READ_ONCE(c1->d_parent) == p2) { /* * hopefully won't need to touch ->s_vfs_rename_mutex at all. */ inode_lock_nested(p2->d_inode, I_MUTEX_PARENT); /* * now that p2 is locked, nobody can move in or out of it, * so the test below is safe. */ if (likely(c1->d_parent == p2)) return NULL; /* * c1 got moved out of p2 while we'd been taking locks; * unlock and fall back to slow case. */ inode_unlock(p2->d_inode); } mutex_lock(&c1->d_sb->s_vfs_rename_mutex); /* * nobody can move out of any directories on this fs. */ if (likely(c1->d_parent != p2)) return lock_two_directories(c1->d_parent, p2); /* * c1 got moved into p2 while we were taking locks; * we need p2 locked and ->s_vfs_rename_mutex unlocked, * for consistency with lock_rename(). */ inode_lock_nested(p2->d_inode, I_MUTEX_PARENT); mutex_unlock(&c1->d_sb->s_vfs_rename_mutex); return NULL; } EXPORT_SYMBOL(lock_rename_child); void unlock_rename(struct dentry *p1, struct dentry *p2) { inode_unlock(p1->d_inode); if (p1 != p2) { inode_unlock(p2->d_inode); mutex_unlock(&p1->d_sb->s_vfs_rename_mutex); } } EXPORT_SYMBOL(unlock_rename); /** * vfs_prepare_mode - prepare the mode to be used for a new inode * @idmap: idmap of the mount the inode was found from * @dir: parent directory of the new inode * @mode: mode of the new inode * @mask_perms: allowed permission by the vfs * @type: type of file to be created * * This helper consolidates and enforces vfs restrictions on the @mode of a new * object to be created. * * Umask stripping depends on whether the filesystem supports POSIX ACLs (see * the kernel documentation for mode_strip_umask()). 
Moving umask stripping * after setgid stripping allows the same ordering for both non-POSIX ACL and * POSIX ACL supporting filesystems. * * Note that it's currently valid for @type to be 0 if a directory is created. * Filesystems raise that flag individually and we need to check whether each * filesystem can deal with receiving S_IFDIR from the vfs before we enforce a * non-zero type. * * Returns: mode to be passed to the filesystem */ static inline umode_t vfs_prepare_mode(struct mnt_idmap *idmap, const struct inode *dir, umode_t mode, umode_t mask_perms, umode_t type) { mode = mode_strip_sgid(idmap, dir, mode); mode = mode_strip_umask(dir, mode); /* * Apply the vfs mandated allowed permission mask and set the type of * file to be created before we call into the filesystem. */ mode &= (mask_perms & ~S_IFMT); mode |= (type & S_IFMT); return mode; } /** * vfs_create - create new file * @idmap: idmap of the mount the inode was found from * @dir: inode of the parent directory * @dentry: dentry of the child file * @mode: mode of the child file * @want_excl: whether the file must not yet exist * * Create a new file. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. */ int vfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool want_excl) { int error; error = may_create(idmap, dir, dentry); if (error) return error; if (!dir->i_op->create) return -EACCES; /* shouldn't it be ENOSYS? */ mode = vfs_prepare_mode(idmap, dir, mode, S_IALLUGO, S_IFREG); error = security_inode_create(dir, dentry, mode); if (error) return error; error = dir->i_op->create(idmap, dir, dentry, mode, want_excl); if (!error) fsnotify_create(dir, dentry); return error; } EXPORT_SYMBOL(vfs_create); int vfs_mkobj(struct dentry *dentry, umode_t mode, int (*f)(struct dentry *, umode_t, void *), void *arg) { struct inode *dir = dentry->d_parent->d_inode; int error = may_create(&nop_mnt_idmap, dir, dentry); if (error) return error; mode &= S_IALLUGO; mode |= S_IFREG; error = security_inode_create(dir, dentry, mode); if (error) return error; error = f(dentry, mode, arg); if (!error) fsnotify_create(dir, dentry); return error; } EXPORT_SYMBOL(vfs_mkobj); bool may_open_dev(const struct path *path) { return !(path->mnt->mnt_flags & MNT_NODEV) && !(path->mnt->mnt_sb->s_iflags & SB_I_NODEV); } static int may_open(struct mnt_idmap *idmap, const struct path *path, int acc_mode, int flag) { struct dentry *dentry = path->dentry; struct inode *inode = dentry->d_inode; int error; if (!inode) return -ENOENT; switch (inode->i_mode & S_IFMT) { case S_IFLNK: return -ELOOP; case S_IFDIR: if (acc_mode & MAY_WRITE) return -EISDIR; if (acc_mode & MAY_EXEC) return -EACCES; break; case S_IFBLK: case S_IFCHR: if (!may_open_dev(path)) return -EACCES; fallthrough; case S_IFIFO: case S_IFSOCK: if (acc_mode & MAY_EXEC) return -EACCES; flag &= ~O_TRUNC; break; case S_IFREG: if ((acc_mode & MAY_EXEC) && path_noexec(path)) return -EACCES; break; } error = inode_permission(idmap, inode, MAY_OPEN | acc_mode); if (error) return error; /* * An append-only file must be opened in append mode for writing. 
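 *
 * Illustrative outcomes of the IS_APPEND() checks below, assuming the
 * ordinary permission checks would otherwise succeed:
 *
 *	open(f, O_WRONLY)                     -> -EPERM
 *	open(f, O_WRONLY | O_APPEND)          -> allowed
 *	open(f, O_RDWR | O_APPEND | O_TRUNC)  -> -EPERM (O_TRUNC never allowed)
 *	open(f, O_RDONLY)                     -> allowed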
*/ if (IS_APPEND(inode)) { if ((flag & O_ACCMODE) != O_RDONLY && !(flag & O_APPEND)) return -EPERM; if (flag & O_TRUNC) return -EPERM; } /* O_NOATIME can only be set by the owner or superuser */ if (flag & O_NOATIME && !inode_owner_or_capable(idmap, inode)) return -EPERM; return 0; } static int handle_truncate(struct mnt_idmap *idmap, struct file *filp) { const struct path *path = &filp->f_path; struct inode *inode = path->dentry->d_inode; int error = get_write_access(inode); if (error) return error; error = security_file_truncate(filp); if (!error) { error = do_truncate(idmap, path->dentry, 0, ATTR_MTIME|ATTR_CTIME|ATTR_OPEN, filp); } put_write_access(inode); return error; } static inline int open_to_namei_flags(int flag) { if ((flag & O_ACCMODE) == 3) flag--; return flag; } static int may_o_create(struct mnt_idmap *idmap, const struct path *dir, struct dentry *dentry, umode_t mode) { int error = security_path_mknod(dir, dentry, mode, 0); if (error) return error; if (!fsuidgid_has_mapping(dir->dentry->d_sb, idmap)) return -EOVERFLOW; error = inode_permission(idmap, dir->dentry->d_inode, MAY_WRITE | MAY_EXEC); if (error) return error; return security_inode_create(dir->dentry->d_inode, dentry, mode); } /* * Attempt to atomically look up, create and open a file from a negative * dentry. * * Returns 0 if successful. The file will have been created and attached to * @file by the filesystem calling finish_open(). * * If the file was looked up only or didn't need creating, FMODE_OPENED won't * be set. The caller will need to perform the open themselves. @path will * have been updated to point to the new dentry. This may be negative. * * Returns an error code otherwise. */ static struct dentry *atomic_open(struct nameidata *nd, struct dentry *dentry, struct file *file, int open_flag, umode_t mode) { struct dentry *const DENTRY_NOT_SET = (void *) -1UL; struct inode *dir = nd->path.dentry->d_inode; int error; if (nd->flags & LOOKUP_DIRECTORY) open_flag |= O_DIRECTORY; file->f_path.dentry = DENTRY_NOT_SET; file->f_path.mnt = nd->path.mnt; error = dir->i_op->atomic_open(dir, dentry, file, open_to_namei_flags(open_flag), mode); d_lookup_done(dentry); if (!error) { if (file->f_mode & FMODE_OPENED) { if (unlikely(dentry != file->f_path.dentry)) { dput(dentry); dentry = dget(file->f_path.dentry); } } else if (WARN_ON(file->f_path.dentry == DENTRY_NOT_SET)) { error = -EIO; } else { if (file->f_path.dentry) { dput(dentry); dentry = file->f_path.dentry; } if (unlikely(d_is_negative(dentry))) error = -ENOENT; } } if (error) { dput(dentry); dentry = ERR_PTR(error); } return dentry; } /* * Look up and maybe create and open the last component. * * Must be called with parent locked (exclusive in O_CREAT case). * * Returns 0 on success, that is, if * the file was successfully atomically created (if necessary) and opened, or * the file was not completely opened at this time, though lookups and * creations were performed. * These case are distinguished by presence of FMODE_OPENED on file->f_mode. * In the latter case dentry returned in @path might be negative if O_CREAT * hadn't been specified. * * An error code is returned on failure. 
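 *
 * In the code below, success hands back the looked-up or created dentry
 * and failures come back as ERR_PTR(); the outcomes above are told apart
 * as:
 *
 *	FMODE_OPENED set    - ->atomic_open() opened the file itself
 *	FMODE_CREATED set   - a new file was created
 *	neither             - plain lookup; the caller finishes the open, and
 *	                      the dentry may be negative if O_CREAT was not set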
*/ static struct dentry *lookup_open(struct nameidata *nd, struct file *file, const struct open_flags *op, bool got_write) { struct mnt_idmap *idmap; struct dentry *dir = nd->path.dentry; struct inode *dir_inode = dir->d_inode; int open_flag = op->open_flag; struct dentry *dentry; int error, create_error = 0; umode_t mode = op->mode; DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); if (unlikely(IS_DEADDIR(dir_inode))) return ERR_PTR(-ENOENT); file->f_mode &= ~FMODE_CREATED; dentry = d_lookup(dir, &nd->last); for (;;) { if (!dentry) { dentry = d_alloc_parallel(dir, &nd->last, &wq); if (IS_ERR(dentry)) return dentry; } if (d_in_lookup(dentry)) break; error = d_revalidate(dentry, nd->flags); if (likely(error > 0)) break; if (error) goto out_dput; d_invalidate(dentry); dput(dentry); dentry = NULL; } if (dentry->d_inode) { /* Cached positive dentry: will open in f_op->open */ return dentry; } if (open_flag & O_CREAT) audit_inode(nd->name, dir, AUDIT_INODE_PARENT); /* * Checking write permission is tricky, because we don't know if we are * going to actually need it: O_CREAT opens should work as long as the * file exists. But checking existence breaks atomicity. The trick is * to check access and if not granted clear O_CREAT from the flags. * * Another problem is returning the "right" error value (e.g. for an * O_EXCL open we want to return EEXIST not EROFS). */ if (unlikely(!got_write)) open_flag &= ~O_TRUNC; idmap = mnt_idmap(nd->path.mnt); if (open_flag & O_CREAT) { if (open_flag & O_EXCL) open_flag &= ~O_TRUNC; mode = vfs_prepare_mode(idmap, dir->d_inode, mode, mode, mode); if (likely(got_write)) create_error = may_o_create(idmap, &nd->path, dentry, mode); else create_error = -EROFS; } if (create_error) open_flag &= ~O_CREAT; if (dir_inode->i_op->atomic_open) { dentry = atomic_open(nd, dentry, file, open_flag, mode); if (unlikely(create_error) && dentry == ERR_PTR(-ENOENT)) dentry = ERR_PTR(create_error); return dentry; } if (d_in_lookup(dentry)) { struct dentry *res = dir_inode->i_op->lookup(dir_inode, dentry, nd->flags); d_lookup_done(dentry); if (unlikely(res)) { if (IS_ERR(res)) { error = PTR_ERR(res); goto out_dput; } dput(dentry); dentry = res; } } /* Negative dentry, just create the file */ if (!dentry->d_inode && (open_flag & O_CREAT)) { file->f_mode |= FMODE_CREATED; audit_inode_child(dir_inode, dentry, AUDIT_TYPE_CHILD_CREATE); if (!dir_inode->i_op->create) { error = -EACCES; goto out_dput; } error = dir_inode->i_op->create(idmap, dir_inode, dentry, mode, open_flag & O_EXCL); if (error) goto out_dput; } if (unlikely(create_error) && !dentry->d_inode) { error = create_error; goto out_dput; } return dentry; out_dput: dput(dentry); return ERR_PTR(error); } static inline bool trailing_slashes(struct nameidata *nd) { return (bool)nd->last.name[nd->last.len]; } static struct dentry *lookup_fast_for_open(struct nameidata *nd, int open_flag) { struct dentry *dentry; if (open_flag & O_CREAT) { if (trailing_slashes(nd)) return ERR_PTR(-EISDIR); /* Don't bother on an O_EXCL create */ if (open_flag & O_EXCL) return NULL; } if (trailing_slashes(nd)) nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; dentry = lookup_fast(nd); if (IS_ERR_OR_NULL(dentry)) return dentry; if (open_flag & O_CREAT) { /* Discard negative dentries.
Need inode_lock to do the create */ if (!dentry->d_inode) { if (!(nd->flags & LOOKUP_RCU)) dput(dentry); dentry = NULL; } } return dentry; } static const char *open_last_lookups(struct nameidata *nd, struct file *file, const struct open_flags *op) { struct dentry *dir = nd->path.dentry; int open_flag = op->open_flag; bool got_write = false; struct dentry *dentry; const char *res; nd->flags |= op->intent; if (nd->last_type != LAST_NORM) { if (nd->depth) put_link(nd); return handle_dots(nd, nd->last_type); } /* We _can_ be in RCU mode here */ dentry = lookup_fast_for_open(nd, open_flag); if (IS_ERR(dentry)) return ERR_CAST(dentry); if (likely(dentry)) goto finish_lookup; if (!(open_flag & O_CREAT)) { if (WARN_ON_ONCE(nd->flags & LOOKUP_RCU)) return ERR_PTR(-ECHILD); } else { if (nd->flags & LOOKUP_RCU) { if (!try_to_unlazy(nd)) return ERR_PTR(-ECHILD); } } if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) { got_write = !mnt_want_write(nd->path.mnt); /* * do _not_ fail yet - we might not need that or fail with * a different error; let lookup_open() decide; we'll be * dropping this one anyway. */ } if (open_flag & O_CREAT) inode_lock(dir->d_inode); else inode_lock_shared(dir->d_inode); dentry = lookup_open(nd, file, op, got_write); if (!IS_ERR(dentry)) { if (file->f_mode & FMODE_CREATED) fsnotify_create(dir->d_inode, dentry); if (file->f_mode & FMODE_OPENED) fsnotify_open(file); } if (open_flag & O_CREAT) inode_unlock(dir->d_inode); else inode_unlock_shared(dir->d_inode); if (got_write) mnt_drop_write(nd->path.mnt); if (IS_ERR(dentry)) return ERR_CAST(dentry); if (file->f_mode & (FMODE_OPENED | FMODE_CREATED)) { dput(nd->path.dentry); nd->path.dentry = dentry; return NULL; } finish_lookup: if (nd->depth) put_link(nd); res = step_into(nd, WALK_TRAILING, dentry); if (unlikely(res)) nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL); return res; } /* * Handle the last step of open() */ static int do_open(struct nameidata *nd, struct file *file, const struct open_flags *op) { struct mnt_idmap *idmap; int open_flag = op->open_flag; bool do_truncate; int acc_mode; int error; if (!(file->f_mode & (FMODE_OPENED | FMODE_CREATED))) { error = complete_walk(nd); if (error) return error; } if (!(file->f_mode & FMODE_CREATED)) audit_inode(nd->name, nd->path.dentry, 0); idmap = mnt_idmap(nd->path.mnt); if (open_flag & O_CREAT) { if ((open_flag & O_EXCL) && !(file->f_mode & FMODE_CREATED)) return -EEXIST; if (d_is_dir(nd->path.dentry)) return -EISDIR; error = may_create_in_sticky(idmap, nd, d_backing_inode(nd->path.dentry)); if (unlikely(error)) return error; } if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry)) return -ENOTDIR; do_truncate = false; acc_mode = op->acc_mode; if (file->f_mode & FMODE_CREATED) { /* Don't check for write permission, don't truncate */ open_flag &= ~O_TRUNC; acc_mode = 0; } else if (d_is_reg(nd->path.dentry) && open_flag & O_TRUNC) { error = mnt_want_write(nd->path.mnt); if (error) return error; do_truncate = true; } error = may_open(idmap, &nd->path, acc_mode, open_flag); if (!error && !(file->f_mode & FMODE_OPENED)) error = vfs_open(&nd->path, file); if (!error) error = security_file_post_open(file, op->acc_mode); if (!error && do_truncate) error = handle_truncate(idmap, file); if (unlikely(error > 0)) { WARN_ON(1); error = -EINVAL; } if (do_truncate) mnt_drop_write(nd->path.mnt); return error; } /** * vfs_tmpfile - create tmpfile * @idmap: idmap of the mount the inode was found from * @parentpath: pointer to the path of the base directory * @file: 
file descriptor of the new tmpfile * @mode: mode of the new tmpfile * * Create a temporary file. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. */ int vfs_tmpfile(struct mnt_idmap *idmap, const struct path *parentpath, struct file *file, umode_t mode) { struct dentry *child; struct inode *dir = d_inode(parentpath->dentry); struct inode *inode; int error; int open_flag = file->f_flags; /* we want directory to be writable */ error = inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC); if (error) return error; if (!dir->i_op->tmpfile) return -EOPNOTSUPP; child = d_alloc(parentpath->dentry, &slash_name); if (unlikely(!child)) return -ENOMEM; file->f_path.mnt = parentpath->mnt; file->f_path.dentry = child; mode = vfs_prepare_mode(idmap, dir, mode, mode, mode); error = dir->i_op->tmpfile(idmap, dir, file, mode); dput(child); if (file->f_mode & FMODE_OPENED) fsnotify_open(file); if (error) return error; /* Don't check for other permissions, the inode was just created */ error = may_open(idmap, &file->f_path, 0, file->f_flags); if (error) return error; inode = file_inode(file); if (!(open_flag & O_EXCL)) { spin_lock(&inode->i_lock); inode->i_state |= I_LINKABLE; spin_unlock(&inode->i_lock); } security_inode_post_create_tmpfile(idmap, inode); return 0; } /** * kernel_tmpfile_open - open a tmpfile for kernel internal use * @idmap: idmap of the mount the inode was found from * @parentpath: path of the base directory * @mode: mode of the new tmpfile * @open_flag: flags * @cred: credentials for open * * Create and open a temporary file. The file is not accounted in nr_files, * hence this is only for kernel internal use, and must not be installed into * file tables or such. 
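 *
 * A rough sketch of an in-kernel caller (error handling trimmed; the code
 * that supplies @parentpath and later consumes the file is assumed and not
 * shown; the mode and flags are illustrative):
 *
 *	struct file *f;
 *
 *	f = kernel_tmpfile_open(&nop_mnt_idmap, parentpath,
 *				S_IFREG | 0600, O_RDWR, current_cred());
 *	if (IS_ERR(f))
 *		return PTR_ERR(f);
 *	...
 *	fput(f);
 *
 * The returned file can then be used with kernel_read()/kernel_write();
 * fput() releases it when the caller is done.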
*/ struct file *kernel_tmpfile_open(struct mnt_idmap *idmap, const struct path *parentpath, umode_t mode, int open_flag, const struct cred *cred) { struct file *file; int error; file = alloc_empty_file_noaccount(open_flag, cred); if (IS_ERR(file)) return file; error = vfs_tmpfile(idmap, parentpath, file, mode); if (error) { fput(file); file = ERR_PTR(error); } return file; } EXPORT_SYMBOL(kernel_tmpfile_open); static int do_tmpfile(struct nameidata *nd, unsigned flags, const struct open_flags *op, struct file *file) { struct path path; int error = path_lookupat(nd, flags | LOOKUP_DIRECTORY, &path); if (unlikely(error)) return error; error = mnt_want_write(path.mnt); if (unlikely(error)) goto out; error = vfs_tmpfile(mnt_idmap(path.mnt), &path, file, op->mode); if (error) goto out2; audit_inode(nd->name, file->f_path.dentry, 0); out2: mnt_drop_write(path.mnt); out: path_put(&path); return error; } static int do_o_path(struct nameidata *nd, unsigned flags, struct file *file) { struct path path; int error = path_lookupat(nd, flags, &path); if (!error) { audit_inode(nd->name, path.dentry, 0); error = vfs_open(&path, file); path_put(&path); } return error; } static struct file *path_openat(struct nameidata *nd, const struct open_flags *op, unsigned flags) { struct file *file; int error; file = alloc_empty_file(op->open_flag, current_cred()); if (IS_ERR(file)) return file; if (unlikely(file->f_flags & __O_TMPFILE)) { error = do_tmpfile(nd, flags, op, file); } else if (unlikely(file->f_flags & O_PATH)) { error = do_o_path(nd, flags, file); } else { const char *s = path_init(nd, flags); while (!(error = link_path_walk(s, nd)) && (s = open_last_lookups(nd, file, op)) != NULL) ; if (!error) error = do_open(nd, file, op); terminate_walk(nd); } if (likely(!error)) { if (likely(file->f_mode & FMODE_OPENED)) return file; WARN_ON(1); error = -EINVAL; } fput(file); if (error == -EOPENSTALE) { if (flags & LOOKUP_RCU) error = -ECHILD; else error = -ESTALE; } return ERR_PTR(error); } struct file *do_filp_open(int dfd, struct filename *pathname, const struct open_flags *op) { struct nameidata nd; int flags = op->lookup_flags; struct file *filp; set_nameidata(&nd, dfd, pathname, NULL); filp = path_openat(&nd, op, flags | LOOKUP_RCU); if (unlikely(filp == ERR_PTR(-ECHILD))) filp = path_openat(&nd, op, flags); if (unlikely(filp == ERR_PTR(-ESTALE))) filp = path_openat(&nd, op, flags | LOOKUP_REVAL); restore_nameidata(); return filp; } struct file *do_file_open_root(const struct path *root, const char *name, const struct open_flags *op) { struct nameidata nd; struct file *file; struct filename *filename; int flags = op->lookup_flags; if (d_is_symlink(root->dentry) && op->intent & LOOKUP_OPEN) return ERR_PTR(-ELOOP); filename = getname_kernel(name); if (IS_ERR(filename)) return ERR_CAST(filename); set_nameidata(&nd, -1, filename, root); file = path_openat(&nd, op, flags | LOOKUP_RCU); if (unlikely(file == ERR_PTR(-ECHILD))) file = path_openat(&nd, op, flags); if (unlikely(file == ERR_PTR(-ESTALE))) file = path_openat(&nd, op, flags | LOOKUP_REVAL); restore_nameidata(); putname(filename); return file; } static struct dentry *filename_create(int dfd, struct filename *name, struct path *path, unsigned int lookup_flags) { struct dentry *dentry = ERR_PTR(-EEXIST); struct qstr last; bool want_dir = lookup_flags & LOOKUP_DIRECTORY; unsigned int reval_flag = lookup_flags & LOOKUP_REVAL; unsigned int create_flags = LOOKUP_CREATE | LOOKUP_EXCL; int type; int err2; int error; error = filename_parentat(dfd, name, reval_flag, 
path, &last, &type); if (error) return ERR_PTR(error); /* * Yucky last component or no last component at all? * (foo/., foo/.., /////) */ if (unlikely(type != LAST_NORM)) goto out; /* don't fail immediately if it's r/o, at least try to report other errors */ err2 = mnt_want_write(path->mnt); /* * Do the final lookup. Suppress 'create' if there is a trailing * '/', and a directory wasn't requested. */ if (last.name[last.len] && !want_dir) create_flags = 0; inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT); dentry = lookup_one_qstr_excl(&last, path->dentry, reval_flag | create_flags); if (IS_ERR(dentry)) goto unlock; error = -EEXIST; if (d_is_positive(dentry)) goto fail; /* * Special case - lookup gave negative, but... we had foo/bar/ * From the vfs_mknod() POV we just have a negative dentry - * all is fine. Let's be bastards - you had / on the end, you've * been asking for (non-existent) directory. -ENOENT for you. */ if (unlikely(!create_flags)) { error = -ENOENT; goto fail; } if (unlikely(err2)) { error = err2; goto fail; } return dentry; fail: dput(dentry); dentry = ERR_PTR(error); unlock: inode_unlock(path->dentry->d_inode); if (!err2) mnt_drop_write(path->mnt); out: path_put(path); return dentry; } struct dentry *kern_path_create(int dfd, const char *pathname, struct path *path, unsigned int lookup_flags) { struct filename *filename = getname_kernel(pathname); struct dentry *res = filename_create(dfd, filename, path, lookup_flags); putname(filename); return res; } EXPORT_SYMBOL(kern_path_create); void done_path_create(struct path *path, struct dentry *dentry) { dput(dentry); inode_unlock(path->dentry->d_inode); mnt_drop_write(path->mnt); path_put(path); } EXPORT_SYMBOL(done_path_create); inline struct dentry *user_path_create(int dfd, const char __user *pathname, struct path *path, unsigned int lookup_flags) { struct filename *filename = getname(pathname); struct dentry *res = filename_create(dfd, filename, path, lookup_flags); putname(filename); return res; } EXPORT_SYMBOL(user_path_create); /** * vfs_mknod - create device node or file * @idmap: idmap of the mount the inode was found from * @dir: inode of the parent directory * @dentry: dentry of the child device node * @mode: mode of the child device node * @dev: device number of device to create * * Create a device node or file. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. 
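 *
 * A minimal sketch of the usual calling pattern (mirroring do_mknodat()
 * below; the "/dev/foo" path and the mode/dev values are purely
 * illustrative):
 *
 *	struct path path;
 *	struct dentry *dentry;
 *	int err;
 *
 *	dentry = kern_path_create(AT_FDCWD, "/dev/foo", &path, 0);
 *	if (IS_ERR(dentry))
 *		return PTR_ERR(dentry);
 *	err = vfs_mknod(mnt_idmap(path.mnt), path.dentry->d_inode, dentry,
 *			S_IFCHR | 0600, MKDEV(1, 3));
 *	done_path_create(&path, dentry);
 *	return err;
 *
 * kern_path_create() returns with the parent locked and write access to the
 * mount held; done_path_create() undoes both.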
*/ int vfs_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { bool is_whiteout = S_ISCHR(mode) && dev == WHITEOUT_DEV; int error = may_create(idmap, dir, dentry); if (error) return error; if ((S_ISCHR(mode) || S_ISBLK(mode)) && !is_whiteout && !capable(CAP_MKNOD)) return -EPERM; if (!dir->i_op->mknod) return -EPERM; mode = vfs_prepare_mode(idmap, dir, mode, mode, mode); error = devcgroup_inode_mknod(mode, dev); if (error) return error; error = security_inode_mknod(dir, dentry, mode, dev); if (error) return error; error = dir->i_op->mknod(idmap, dir, dentry, mode, dev); if (!error) fsnotify_create(dir, dentry); return error; } EXPORT_SYMBOL(vfs_mknod); static int may_mknod(umode_t mode) { switch (mode & S_IFMT) { case S_IFREG: case S_IFCHR: case S_IFBLK: case S_IFIFO: case S_IFSOCK: case 0: /* zero mode translates to S_IFREG */ return 0; case S_IFDIR: return -EPERM; default: return -EINVAL; } } static int do_mknodat(int dfd, struct filename *name, umode_t mode, unsigned int dev) { struct mnt_idmap *idmap; struct dentry *dentry; struct path path; int error; unsigned int lookup_flags = 0; error = may_mknod(mode); if (error) goto out1; retry: dentry = filename_create(dfd, name, &path, lookup_flags); error = PTR_ERR(dentry); if (IS_ERR(dentry)) goto out1; error = security_path_mknod(&path, dentry, mode_strip_umask(path.dentry->d_inode, mode), dev); if (error) goto out2; idmap = mnt_idmap(path.mnt); switch (mode & S_IFMT) { case 0: case S_IFREG: error = vfs_create(idmap, path.dentry->d_inode, dentry, mode, true); if (!error) security_path_post_mknod(idmap, dentry); break; case S_IFCHR: case S_IFBLK: error = vfs_mknod(idmap, path.dentry->d_inode, dentry, mode, new_decode_dev(dev)); break; case S_IFIFO: case S_IFSOCK: error = vfs_mknod(idmap, path.dentry->d_inode, dentry, mode, 0); break; } out2: done_path_create(&path, dentry); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } out1: putname(name); return error; } SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode, unsigned int, dev) { return do_mknodat(dfd, getname(filename), mode, dev); } SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, dev) { return do_mknodat(AT_FDCWD, getname(filename), mode, dev); } /** * vfs_mkdir - create directory * @idmap: idmap of the mount the inode was found from * @dir: inode of the parent directory * @dentry: dentry of the child directory * @mode: mode of the child directory * * Create a directory. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. 
*/ int vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { int error; unsigned max_links = dir->i_sb->s_max_links; error = may_create(idmap, dir, dentry); if (error) return error; if (!dir->i_op->mkdir) return -EPERM; mode = vfs_prepare_mode(idmap, dir, mode, S_IRWXUGO | S_ISVTX, 0); error = security_inode_mkdir(dir, dentry, mode); if (error) return error; if (max_links && dir->i_nlink >= max_links) return -EMLINK; error = dir->i_op->mkdir(idmap, dir, dentry, mode); if (!error) fsnotify_mkdir(dir, dentry); return error; } EXPORT_SYMBOL(vfs_mkdir); int do_mkdirat(int dfd, struct filename *name, umode_t mode) { struct dentry *dentry; struct path path; int error; unsigned int lookup_flags = LOOKUP_DIRECTORY; retry: dentry = filename_create(dfd, name, &path, lookup_flags); error = PTR_ERR(dentry); if (IS_ERR(dentry)) goto out_putname; error = security_path_mkdir(&path, dentry, mode_strip_umask(path.dentry->d_inode, mode)); if (!error) { error = vfs_mkdir(mnt_idmap(path.mnt), path.dentry->d_inode, dentry, mode); } done_path_create(&path, dentry); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } out_putname: putname(name); return error; } SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode) { return do_mkdirat(dfd, getname(pathname), mode); } SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode) { return do_mkdirat(AT_FDCWD, getname(pathname), mode); } /** * vfs_rmdir - remove directory * @idmap: idmap of the mount the inode was found from * @dir: inode of the parent directory * @dentry: dentry of the child directory * * Remove a directory. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. 
*/ int vfs_rmdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry) { int error = may_delete(idmap, dir, dentry, 1); if (error) return error; if (!dir->i_op->rmdir) return -EPERM; dget(dentry); inode_lock(dentry->d_inode); error = -EBUSY; if (is_local_mountpoint(dentry) || (dentry->d_inode->i_flags & S_KERNEL_FILE)) goto out; error = security_inode_rmdir(dir, dentry); if (error) goto out; error = dir->i_op->rmdir(dir, dentry); if (error) goto out; shrink_dcache_parent(dentry); dentry->d_inode->i_flags |= S_DEAD; dont_mount(dentry); detach_mounts(dentry); out: inode_unlock(dentry->d_inode); dput(dentry); if (!error) d_delete_notify(dir, dentry); return error; } EXPORT_SYMBOL(vfs_rmdir); int do_rmdir(int dfd, struct filename *name) { int error; struct dentry *dentry; struct path path; struct qstr last; int type; unsigned int lookup_flags = 0; retry: error = filename_parentat(dfd, name, lookup_flags, &path, &last, &type); if (error) goto exit1; switch (type) { case LAST_DOTDOT: error = -ENOTEMPTY; goto exit2; case LAST_DOT: error = -EINVAL; goto exit2; case LAST_ROOT: error = -EBUSY; goto exit2; } error = mnt_want_write(path.mnt); if (error) goto exit2; inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT); dentry = lookup_one_qstr_excl(&last, path.dentry, lookup_flags); error = PTR_ERR(dentry); if (IS_ERR(dentry)) goto exit3; if (!dentry->d_inode) { error = -ENOENT; goto exit4; } error = security_path_rmdir(&path, dentry); if (error) goto exit4; error = vfs_rmdir(mnt_idmap(path.mnt), path.dentry->d_inode, dentry); exit4: dput(dentry); exit3: inode_unlock(path.dentry->d_inode); mnt_drop_write(path.mnt); exit2: path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } exit1: putname(name); return error; } SYSCALL_DEFINE1(rmdir, const char __user *, pathname) { return do_rmdir(AT_FDCWD, getname(pathname)); } /** * vfs_unlink - unlink a filesystem object * @idmap: idmap of the mount the inode was found from * @dir: parent directory * @dentry: victim * @delegated_inode: returns victim inode, if the inode is delegated. * * The caller must hold dir->i_mutex. * * If vfs_unlink discovers a delegation, it will return -EWOULDBLOCK and * return a reference to the inode in delegated_inode. The caller * should then break the delegation on that inode and retry. Because * breaking a delegation may take a long time, the caller should drop * dir->i_mutex before doing so. * * Alternatively, a caller may pass NULL for delegated_inode. This may * be appropriate for callers that expect the underlying filesystem not * to be NFS exported. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. 
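 *
 * A condensed sketch of the delegation-aware retry pattern (adapted from
 * do_unlinkat() below; the lookup of @dentry and the surrounding error
 * handling are assumed and elided):
 *
 *	struct inode *delegated_inode = NULL;
 *	int err;
 *
 *	retry_deleg:
 *	inode_lock_nested(dir, I_MUTEX_PARENT);
 *	... look up dentry under the lock ...
 *	err = vfs_unlink(idmap, dir, dentry, &delegated_inode);
 *	inode_unlock(dir);
 *	if (delegated_inode) {
 *		err = break_deleg_wait(&delegated_inode);
 *		if (!err)
 *			goto retry_deleg;
 *	}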
*/ int vfs_unlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, struct inode **delegated_inode) { struct inode *target = dentry->d_inode; int error = may_delete(idmap, dir, dentry, 0); if (error) return error; if (!dir->i_op->unlink) return -EPERM; inode_lock(target); if (IS_SWAPFILE(target)) error = -EPERM; else if (is_local_mountpoint(dentry)) error = -EBUSY; else { error = security_inode_unlink(dir, dentry); if (!error) { error = try_break_deleg(target, delegated_inode); if (error) goto out; error = dir->i_op->unlink(dir, dentry); if (!error) { dont_mount(dentry); detach_mounts(dentry); } } } out: inode_unlock(target); /* We don't d_delete() NFS sillyrenamed files--they still exist. */ if (!error && dentry->d_flags & DCACHE_NFSFS_RENAMED) { fsnotify_unlink(dir, dentry); } else if (!error) { fsnotify_link_count(target); d_delete_notify(dir, dentry); } return error; } EXPORT_SYMBOL(vfs_unlink); /* * Make sure that the actual truncation of the file will occur outside its * directory's i_mutex. Truncate can take a long time if there is a lot of * writeout happening, and we don't want to prevent access to the directory * while waiting on the I/O. */ int do_unlinkat(int dfd, struct filename *name) { int error; struct dentry *dentry; struct path path; struct qstr last; int type; struct inode *inode = NULL; struct inode *delegated_inode = NULL; unsigned int lookup_flags = 0; retry: error = filename_parentat(dfd, name, lookup_flags, &path, &last, &type); if (error) goto exit1; error = -EISDIR; if (type != LAST_NORM) goto exit2; error = mnt_want_write(path.mnt); if (error) goto exit2; retry_deleg: inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT); dentry = lookup_one_qstr_excl(&last, path.dentry, lookup_flags); error = PTR_ERR(dentry); if (!IS_ERR(dentry)) { /* Why not before? Because we want correct error value */ if (last.name[last.len] || d_is_negative(dentry)) goto slashes; inode = dentry->d_inode; ihold(inode); error = security_path_unlink(&path, dentry); if (error) goto exit3; error = vfs_unlink(mnt_idmap(path.mnt), path.dentry->d_inode, dentry, &delegated_inode); exit3: dput(dentry); } inode_unlock(path.dentry->d_inode); if (inode) iput(inode); /* truncate the inode here */ inode = NULL; if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; } mnt_drop_write(path.mnt); exit2: path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; inode = NULL; goto retry; } exit1: putname(name); return error; slashes: if (d_is_negative(dentry)) error = -ENOENT; else if (d_is_dir(dentry)) error = -EISDIR; else error = -ENOTDIR; goto exit3; } SYSCALL_DEFINE3(unlinkat, int, dfd, const char __user *, pathname, int, flag) { if ((flag & ~AT_REMOVEDIR) != 0) return -EINVAL; if (flag & AT_REMOVEDIR) return do_rmdir(dfd, getname(pathname)); return do_unlinkat(dfd, getname(pathname)); } SYSCALL_DEFINE1(unlink, const char __user *, pathname) { return do_unlinkat(AT_FDCWD, getname(pathname)); } /** * vfs_symlink - create symlink * @idmap: idmap of the mount the inode was found from * @dir: inode of the parent directory * @dentry: dentry of the child symlink file * @oldname: name of the file to link to * * Create a symlink. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. 
* On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. */ int vfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *oldname) { int error; error = may_create(idmap, dir, dentry); if (error) return error; if (!dir->i_op->symlink) return -EPERM; error = security_inode_symlink(dir, dentry, oldname); if (error) return error; error = dir->i_op->symlink(idmap, dir, dentry, oldname); if (!error) fsnotify_create(dir, dentry); return error; } EXPORT_SYMBOL(vfs_symlink); int do_symlinkat(struct filename *from, int newdfd, struct filename *to) { int error; struct dentry *dentry; struct path path; unsigned int lookup_flags = 0; if (IS_ERR(from)) { error = PTR_ERR(from); goto out_putnames; } retry: dentry = filename_create(newdfd, to, &path, lookup_flags); error = PTR_ERR(dentry); if (IS_ERR(dentry)) goto out_putnames; error = security_path_symlink(&path, dentry, from->name); if (!error) error = vfs_symlink(mnt_idmap(path.mnt), path.dentry->d_inode, dentry, from->name); done_path_create(&path, dentry); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } out_putnames: putname(to); putname(from); return error; } SYSCALL_DEFINE3(symlinkat, const char __user *, oldname, int, newdfd, const char __user *, newname) { return do_symlinkat(getname(oldname), newdfd, getname(newname)); } SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newname) { return do_symlinkat(getname(oldname), AT_FDCWD, getname(newname)); } /** * vfs_link - create a new link * @old_dentry: object to be linked * @idmap: idmap of the mount * @dir: new parent * @new_dentry: where to create the new link * @delegated_inode: returns inode needing a delegation break * * The caller must hold dir->i_mutex * * If vfs_link discovers a delegation on the to-be-linked file in need * of breaking, it will return -EWOULDBLOCK and return a reference to the * inode in delegated_inode. The caller should then break the delegation * and retry. Because breaking a delegation may take a long time, the * caller should drop the i_mutex before doing so. * * Alternatively, a caller may pass NULL for delegated_inode. This may * be appropriate for callers that expect the underlying filesystem not * to be NFS exported. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. */ int vfs_link(struct dentry *old_dentry, struct mnt_idmap *idmap, struct inode *dir, struct dentry *new_dentry, struct inode **delegated_inode) { struct inode *inode = old_dentry->d_inode; unsigned max_links = dir->i_sb->s_max_links; int error; if (!inode) return -ENOENT; error = may_create(idmap, dir, new_dentry); if (error) return error; if (dir->i_sb != inode->i_sb) return -EXDEV; /* * A link to an append-only or immutable file cannot be created. */ if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) return -EPERM; /* * Updating the link count will likely cause i_uid and i_gid to * be written back improperly if their true value is unknown to * the vfs.
*/ if (HAS_UNMAPPED_ID(idmap, inode)) return -EPERM; if (!dir->i_op->link) return -EPERM; if (S_ISDIR(inode->i_mode)) return -EPERM; error = security_inode_link(old_dentry, dir, new_dentry); if (error) return error; inode_lock(inode); /* Make sure we don't allow creating hardlink to an unlinked file */ if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE)) error = -ENOENT; else if (max_links && inode->i_nlink >= max_links) error = -EMLINK; else { error = try_break_deleg(inode, delegated_inode); if (!error) error = dir->i_op->link(old_dentry, dir, new_dentry); } if (!error && (inode->i_state & I_LINKABLE)) { spin_lock(&inode->i_lock); inode->i_state &= ~I_LINKABLE; spin_unlock(&inode->i_lock); } inode_unlock(inode); if (!error) fsnotify_link(dir, inode, new_dentry); return error; } EXPORT_SYMBOL(vfs_link); /* * Hardlinks are often used in delicate situations. We avoid * security-related surprises by not following symlinks on the * newname. --KAB * * We don't follow them on the oldname either to be compatible * with linux 2.0, and to avoid hard-linking to directories * and other special files. --ADM */ int do_linkat(int olddfd, struct filename *old, int newdfd, struct filename *new, int flags) { struct mnt_idmap *idmap; struct dentry *new_dentry; struct path old_path, new_path; struct inode *delegated_inode = NULL; int how = 0; int error; if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0) { error = -EINVAL; goto out_putnames; } /* * To use null names we require CAP_DAC_READ_SEARCH or * that the open-time creds of the dfd matches current. * This ensures that not everyone will be able to create * a hardlink using the passed file descriptor. */ if (flags & AT_EMPTY_PATH) how |= LOOKUP_LINKAT_EMPTY; if (flags & AT_SYMLINK_FOLLOW) how |= LOOKUP_FOLLOW; retry: error = filename_lookup(olddfd, old, how, &old_path, NULL); if (error) goto out_putnames; new_dentry = filename_create(newdfd, new, &new_path, (how & LOOKUP_REVAL)); error = PTR_ERR(new_dentry); if (IS_ERR(new_dentry)) goto out_putpath; error = -EXDEV; if (old_path.mnt != new_path.mnt) goto out_dput; idmap = mnt_idmap(new_path.mnt); error = may_linkat(idmap, &old_path); if (unlikely(error)) goto out_dput; error = security_path_link(old_path.dentry, &new_path, new_dentry); if (error) goto out_dput; error = vfs_link(old_path.dentry, idmap, new_path.dentry->d_inode, new_dentry, &delegated_inode); out_dput: done_path_create(&new_path, new_dentry); if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) { path_put(&old_path); goto retry; } } if (retry_estale(error, how)) { path_put(&old_path); how |= LOOKUP_REVAL; goto retry; } out_putpath: path_put(&old_path); out_putnames: putname(old); putname(new); return error; } SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname, int, newdfd, const char __user *, newname, int, flags) { return do_linkat(olddfd, getname_uflags(oldname, flags), newdfd, getname(newname), flags); } SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname) { return do_linkat(AT_FDCWD, getname(oldname), AT_FDCWD, getname(newname), 0); } /** * vfs_rename - rename a filesystem object * @rd: pointer to &struct renamedata info * * The caller must hold multiple mutexes--see lock_rename()). * * If vfs_rename discovers a delegation in need of breaking at either * the source or destination, it will return -EWOULDBLOCK and return a * reference to the inode in delegated_inode. The caller should then * break the delegation and retry. 
Because breaking a delegation may * take a long time, the caller should drop all locks before doing * so. * * Alternatively, a caller may pass NULL for delegated_inode. This may * be appropriate for callers that expect the underlying filesystem not * to be NFS exported. * * The worst of all namespace operations - renaming directory. "Perverted" * doesn't even start to describe it. Somebody in UCB had a heck of a trip... * Problems: * * a) we can get into loop creation. * b) race potential - two innocent renames can create a loop together. * That's where 4.4BSD screws up. Current fix: serialization on * sb->s_vfs_rename_mutex. We might be more accurate, but that's another * story. * c) we may have to lock up to _four_ objects - parents and victim (if it exists), * and source (if it's a non-directory or a subdirectory that moves to * different parent). * And that - after we got ->i_mutex on parents (until then we don't know * whether the target exists). Solution: try to be smart with locking * order for inodes. We rely on the fact that tree topology may change * only under ->s_vfs_rename_mutex _and_ that parent of the object we * move will be locked. Thus we can rank directories by the tree * (ancestors first) and rank all non-directories after them. * That works since everybody except rename does "lock parent, lookup, * lock child" and rename is under ->s_vfs_rename_mutex. * HOWEVER, it relies on the assumption that any object with ->lookup() * has no more than 1 dentry. If "hybrid" objects will ever appear, * we'd better make sure that there's no link(2) for them. * d) conversion from fhandle to dentry may come in the wrong moment - when * we are removing the target. Solution: we will have to grab ->i_mutex * in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on * ->i_mutex on parents, which works but leads to some truly excessive * locking]. */ int vfs_rename(struct renamedata *rd) { int error; struct inode *old_dir = rd->old_dir, *new_dir = rd->new_dir; struct dentry *old_dentry = rd->old_dentry; struct dentry *new_dentry = rd->new_dentry; struct inode **delegated_inode = rd->delegated_inode; unsigned int flags = rd->flags; bool is_dir = d_is_dir(old_dentry); struct inode *source = old_dentry->d_inode; struct inode *target = new_dentry->d_inode; bool new_is_dir = false; unsigned max_links = new_dir->i_sb->s_max_links; struct name_snapshot old_name; bool lock_old_subdir, lock_new_subdir; if (source == target) return 0; error = may_delete(rd->old_mnt_idmap, old_dir, old_dentry, is_dir); if (error) return error; if (!target) { error = may_create(rd->new_mnt_idmap, new_dir, new_dentry); } else { new_is_dir = d_is_dir(new_dentry); if (!(flags & RENAME_EXCHANGE)) error = may_delete(rd->new_mnt_idmap, new_dir, new_dentry, is_dir); else error = may_delete(rd->new_mnt_idmap, new_dir, new_dentry, new_is_dir); } if (error) return error; if (!old_dir->i_op->rename) return -EPERM; /* * If we are going to change the parent - check write permissions, * we'll need to flip '..'. */ if (new_dir != old_dir) { if (is_dir) { error = inode_permission(rd->old_mnt_idmap, source, MAY_WRITE); if (error) return error; } if ((flags & RENAME_EXCHANGE) && new_is_dir) { error = inode_permission(rd->new_mnt_idmap, target, MAY_WRITE); if (error) return error; } } error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry, flags); if (error) return error; take_dentry_name_snapshot(&old_name, old_dentry); dget(new_dentry); /* * Lock children. 
* The source subdirectory needs to be locked on cross-directory * rename or cross-directory exchange since its parent changes. * The target subdirectory needs to be locked on cross-directory * exchange due to parent change and on any rename due to becoming * a victim. * Non-directories need locking in all cases (for NFS reasons); * they get locked after any subdirectories (in inode address order). * * NOTE: WE ONLY LOCK UNRELATED DIRECTORIES IN CROSS-DIRECTORY CASE. * NEVER, EVER DO THAT WITHOUT ->s_vfs_rename_mutex. */ lock_old_subdir = new_dir != old_dir; lock_new_subdir = new_dir != old_dir || !(flags & RENAME_EXCHANGE); if (is_dir) { if (lock_old_subdir) inode_lock_nested(source, I_MUTEX_CHILD); if (target && (!new_is_dir || lock_new_subdir)) inode_lock(target); } else if (new_is_dir) { if (lock_new_subdir) inode_lock_nested(target, I_MUTEX_CHILD); inode_lock(source); } else { lock_two_nondirectories(source, target); } error = -EPERM; if (IS_SWAPFILE(source) || (target && IS_SWAPFILE(target))) goto out; error = -EBUSY; if (is_local_mountpoint(old_dentry) || is_local_mountpoint(new_dentry)) goto out; if (max_links && new_dir != old_dir) { error = -EMLINK; if (is_dir && !new_is_dir && new_dir->i_nlink >= max_links) goto out; if ((flags & RENAME_EXCHANGE) && !is_dir && new_is_dir && old_dir->i_nlink >= max_links) goto out; } if (!is_dir) { error = try_break_deleg(source, delegated_inode); if (error) goto out; } if (target && !new_is_dir) { error = try_break_deleg(target, delegated_inode); if (error) goto out; } error = old_dir->i_op->rename(rd->new_mnt_idmap, old_dir, old_dentry, new_dir, new_dentry, flags); if (error) goto out; if (!(flags & RENAME_EXCHANGE) && target) { if (is_dir) { shrink_dcache_parent(new_dentry); target->i_flags |= S_DEAD; } dont_mount(new_dentry); detach_mounts(new_dentry); } if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) { if (!(flags & RENAME_EXCHANGE)) d_move(old_dentry, new_dentry); else d_exchange(old_dentry, new_dentry); } out: if (!is_dir || lock_old_subdir) inode_unlock(source); if (target && (!new_is_dir || lock_new_subdir)) inode_unlock(target); dput(new_dentry); if (!error) { fsnotify_move(old_dir, new_dir, &old_name.name, is_dir, !(flags & RENAME_EXCHANGE) ? 
target : NULL, old_dentry); if (flags & RENAME_EXCHANGE) { fsnotify_move(new_dir, old_dir, &old_dentry->d_name, new_is_dir, NULL, new_dentry); } } release_dentry_name_snapshot(&old_name); return error; } EXPORT_SYMBOL(vfs_rename); int do_renameat2(int olddfd, struct filename *from, int newdfd, struct filename *to, unsigned int flags) { struct renamedata rd; struct dentry *old_dentry, *new_dentry; struct dentry *trap; struct path old_path, new_path; struct qstr old_last, new_last; int old_type, new_type; struct inode *delegated_inode = NULL; unsigned int lookup_flags = 0, target_flags = LOOKUP_RENAME_TARGET; bool should_retry = false; int error = -EINVAL; if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) goto put_names; if ((flags & (RENAME_NOREPLACE | RENAME_WHITEOUT)) && (flags & RENAME_EXCHANGE)) goto put_names; if (flags & RENAME_EXCHANGE) target_flags = 0; retry: error = filename_parentat(olddfd, from, lookup_flags, &old_path, &old_last, &old_type); if (error) goto put_names; error = filename_parentat(newdfd, to, lookup_flags, &new_path, &new_last, &new_type); if (error) goto exit1; error = -EXDEV; if (old_path.mnt != new_path.mnt) goto exit2; error = -EBUSY; if (old_type != LAST_NORM) goto exit2; if (flags & RENAME_NOREPLACE) error = -EEXIST; if (new_type != LAST_NORM) goto exit2; error = mnt_want_write(old_path.mnt); if (error) goto exit2; retry_deleg: trap = lock_rename(new_path.dentry, old_path.dentry); if (IS_ERR(trap)) { error = PTR_ERR(trap); goto exit_lock_rename; } old_dentry = lookup_one_qstr_excl(&old_last, old_path.dentry, lookup_flags); error = PTR_ERR(old_dentry); if (IS_ERR(old_dentry)) goto exit3; /* source must exist */ error = -ENOENT; if (d_is_negative(old_dentry)) goto exit4; new_dentry = lookup_one_qstr_excl(&new_last, new_path.dentry, lookup_flags | target_flags); error = PTR_ERR(new_dentry); if (IS_ERR(new_dentry)) goto exit4; error = -EEXIST; if ((flags & RENAME_NOREPLACE) && d_is_positive(new_dentry)) goto exit5; if (flags & RENAME_EXCHANGE) { error = -ENOENT; if (d_is_negative(new_dentry)) goto exit5; if (!d_is_dir(new_dentry)) { error = -ENOTDIR; if (new_last.name[new_last.len]) goto exit5; } } /* unless the source is a directory trailing slashes give -ENOTDIR */ if (!d_is_dir(old_dentry)) { error = -ENOTDIR; if (old_last.name[old_last.len]) goto exit5; if (!(flags & RENAME_EXCHANGE) && new_last.name[new_last.len]) goto exit5; } /* source should not be ancestor of target */ error = -EINVAL; if (old_dentry == trap) goto exit5; /* target should not be an ancestor of source */ if (!(flags & RENAME_EXCHANGE)) error = -ENOTEMPTY; if (new_dentry == trap) goto exit5; error = security_path_rename(&old_path, old_dentry, &new_path, new_dentry, flags); if (error) goto exit5; rd.old_dir = old_path.dentry->d_inode; rd.old_dentry = old_dentry; rd.old_mnt_idmap = mnt_idmap(old_path.mnt); rd.new_dir = new_path.dentry->d_inode; rd.new_dentry = new_dentry; rd.new_mnt_idmap = mnt_idmap(new_path.mnt); rd.delegated_inode = &delegated_inode; rd.flags = flags; error = vfs_rename(&rd); exit5: dput(new_dentry); exit4: dput(old_dentry); exit3: unlock_rename(new_path.dentry, old_path.dentry); exit_lock_rename: if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; } mnt_drop_write(old_path.mnt); exit2: if (retry_estale(error, lookup_flags)) should_retry = true; path_put(&new_path); exit1: path_put(&old_path); if (should_retry) { should_retry = false; lookup_flags |= LOOKUP_REVAL; goto retry; } put_names: putname(from); 
putname(to); return error; } SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname, int, newdfd, const char __user *, newname, unsigned int, flags) { return do_renameat2(olddfd, getname(oldname), newdfd, getname(newname), flags); } SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname, int, newdfd, const char __user *, newname) { return do_renameat2(olddfd, getname(oldname), newdfd, getname(newname), 0); } SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname) { return do_renameat2(AT_FDCWD, getname(oldname), AT_FDCWD, getname(newname), 0); } int readlink_copy(char __user *buffer, int buflen, const char *link) { int len = PTR_ERR(link); if (IS_ERR(link)) goto out; len = strlen(link); if (len > (unsigned) buflen) len = buflen; if (copy_to_user(buffer, link, len)) len = -EFAULT; out: return len; } /** * vfs_readlink - copy symlink body into userspace buffer * @dentry: dentry on which to get symbolic link * @buffer: user memory pointer * @buflen: size of buffer * * Does not touch atime. That's up to the caller if necessary * * Does not call security hook. */ int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen) { struct inode *inode = d_inode(dentry); DEFINE_DELAYED_CALL(done); const char *link; int res; if (unlikely(!(inode->i_opflags & IOP_DEFAULT_READLINK))) { if (unlikely(inode->i_op->readlink)) return inode->i_op->readlink(dentry, buffer, buflen); if (!d_is_symlink(dentry)) return -EINVAL; spin_lock(&inode->i_lock); inode->i_opflags |= IOP_DEFAULT_READLINK; spin_unlock(&inode->i_lock); } link = READ_ONCE(inode->i_link); if (!link) { link = inode->i_op->get_link(dentry, inode, &done); if (IS_ERR(link)) return PTR_ERR(link); } res = readlink_copy(buffer, buflen, link); do_delayed_call(&done); return res; } EXPORT_SYMBOL(vfs_readlink); /** * vfs_get_link - get symlink body * @dentry: dentry on which to get symbolic link * @done: caller needs to free returned data with this * * Calls security hook and i_op->get_link() on the supplied inode. * * It does not touch atime. That's up to the caller if necessary. 
* * Does not work on "special" symlinks like /proc/$$/fd/N */ const char *vfs_get_link(struct dentry *dentry, struct delayed_call *done) { const char *res = ERR_PTR(-EINVAL); struct inode *inode = d_inode(dentry); if (d_is_symlink(dentry)) { res = ERR_PTR(security_inode_readlink(dentry)); if (!res) res = inode->i_op->get_link(dentry, inode, done); } return res; } EXPORT_SYMBOL(vfs_get_link); /* get the link contents into pagecache */ const char *page_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *callback) { char *kaddr; struct page *page; struct address_space *mapping = inode->i_mapping; if (!dentry) { page = find_get_page(mapping, 0); if (!page) return ERR_PTR(-ECHILD); if (!PageUptodate(page)) { put_page(page); return ERR_PTR(-ECHILD); } } else { page = read_mapping_page(mapping, 0, NULL); if (IS_ERR(page)) return (char*)page; } set_delayed_call(callback, page_put_link, page); BUG_ON(mapping_gfp_mask(mapping) & __GFP_HIGHMEM); kaddr = page_address(page); nd_terminate_link(kaddr, inode->i_size, PAGE_SIZE - 1); return kaddr; } EXPORT_SYMBOL(page_get_link); void page_put_link(void *arg) { put_page(arg); } EXPORT_SYMBOL(page_put_link); int page_readlink(struct dentry *dentry, char __user *buffer, int buflen) { DEFINE_DELAYED_CALL(done); int res = readlink_copy(buffer, buflen, page_get_link(dentry, d_inode(dentry), &done)); do_delayed_call(&done); return res; } EXPORT_SYMBOL(page_readlink); int page_symlink(struct inode *inode, const char *symname, int len) { struct address_space *mapping = inode->i_mapping; const struct address_space_operations *aops = mapping->a_ops; bool nofs = !mapping_gfp_constraint(mapping, __GFP_FS); struct folio *folio; void *fsdata = NULL; int err; unsigned int flags; retry: if (nofs) flags = memalloc_nofs_save(); err = aops->write_begin(NULL, mapping, 0, len-1, &folio, &fsdata); if (nofs) memalloc_nofs_restore(flags); if (err) goto fail; memcpy(folio_address(folio), symname, len - 1); err = aops->write_end(NULL, mapping, 0, len - 1, len - 1, folio, fsdata); if (err < 0) goto fail; if (err < len-1) goto retry; mark_inode_dirty(inode); return 0; fail: return err; } EXPORT_SYMBOL(page_symlink); const struct inode_operations page_symlink_inode_operations = { .get_link = page_get_link, }; EXPORT_SYMBOL(page_symlink_inode_operations);
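/*
 * A condensed sketch of how a simple pagecache-backed filesystem typically
 * wires up the helpers above when creating a symlink. "myfs_symlink" and
 * "myfs_new_inode" are illustrative names, not part of this file; a real
 * ->symlink() method also has to do its own directory bookkeeping.
 */
static int myfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
			struct dentry *dentry, const char *symname)
{
	struct inode *inode;
	int err;

	inode = myfs_new_inode(dir, S_IFLNK | 0777);	/* hypothetical helper */
	if (IS_ERR(inode))
		return PTR_ERR(inode);

	/* readlink/get_link are then served straight from the pagecache */
	inode->i_op = &page_symlink_inode_operations;
	inode_nohighmem(inode);

	/* store the target; the length includes the terminating NUL */
	err = page_symlink(inode, symname, strlen(symname) + 1);
	if (err) {
		iput(inode);
		return err;
	}
	d_instantiate(dentry, inode);
	return 0;
}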
// SPDX-License-Identifier: GPL-2.0 /* net/atm/svc.c - ATM SVC sockets */ /* Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA */ #define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__ #include <linux/string.h> #include <linux/net.h> /* struct socket, struct proto_ops */ #include <linux/errno.h> /* error codes */ #include <linux/kernel.h> /* printk */ #include <linux/skbuff.h> #include <linux/wait.h> #include <linux/sched/signal.h> #include <linux/fcntl.h> /* O_NONBLOCK */ #include <linux/init.h> #include <linux/atm.h> /* ATM stuff */ #include <linux/atmsap.h> #include <linux/atmsvc.h> #include <linux/atmdev.h> #include <linux/bitops.h> #include <net/sock.h> /* for sock_no_* */ #include <linux/uaccess.h> #include <linux/export.h> #include "resources.h" #include "common.h" /* common for PVCs and SVCs */ #include "signaling.h" #include "addr.h" #ifdef
CONFIG_COMPAT /* It actually takes struct sockaddr_atmsvc, not struct atm_iobuf */ #define COMPAT_ATM_ADDPARTY _IOW('a', ATMIOC_SPECIAL + 4, struct compat_atm_iobuf) #endif static int svc_create(struct net *net, struct socket *sock, int protocol, int kern); /* * Note: since all this is still nicely synchronized with the signaling demon, * there's no need to protect sleep loops with clis. If signaling is * moved into the kernel, that would change. */ static int svc_shutdown(struct socket *sock, int how) { return 0; } static void svc_disconnect(struct atm_vcc *vcc) { DEFINE_WAIT(wait); struct sk_buff *skb; struct sock *sk = sk_atm(vcc); pr_debug("%p\n", vcc); if (test_bit(ATM_VF_REGIS, &vcc->flags)) { sigd_enq(vcc, as_close, NULL, NULL, NULL); for (;;) { prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE); if (test_bit(ATM_VF_RELEASED, &vcc->flags) || !sigd) break; schedule(); } finish_wait(sk_sleep(sk), &wait); } /* beware - socket is still in use by atmsigd until the last as_indicate has been answered */ while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) { atm_return(vcc, skb->truesize); pr_debug("LISTEN REL\n"); sigd_enq2(NULL, as_reject, vcc, NULL, NULL, &vcc->qos, 0); dev_kfree_skb(skb); } clear_bit(ATM_VF_REGIS, &vcc->flags); /* ... may retry later */ } static int svc_release(struct socket *sock) { struct sock *sk = sock->sk; struct atm_vcc *vcc; if (sk) { vcc = ATM_SD(sock); pr_debug("%p\n", vcc); clear_bit(ATM_VF_READY, &vcc->flags); /* * VCC pointer is used as a reference, * so we must not free it (thereby subjecting it to re-use) * before all pending connections are closed */ svc_disconnect(vcc); vcc_release(sock); } return 0; } static int svc_bind(struct socket *sock, struct sockaddr *sockaddr, int sockaddr_len) { DEFINE_WAIT(wait); struct sock *sk = sock->sk; struct sockaddr_atmsvc *addr; struct atm_vcc *vcc; int error; if (sockaddr_len != sizeof(struct sockaddr_atmsvc)) return -EINVAL; lock_sock(sk); if (sock->state == SS_CONNECTED) { error = -EISCONN; goto out; } if (sock->state != SS_UNCONNECTED) { error = -EINVAL; goto out; } vcc = ATM_SD(sock); addr = (struct sockaddr_atmsvc *) sockaddr; if (addr->sas_family != AF_ATMSVC) { error = -EAFNOSUPPORT; goto out; } clear_bit(ATM_VF_BOUND, &vcc->flags); /* failing rebind will kill old binding */ /* @@@ check memory (de)allocation on rebind */ if (!test_bit(ATM_VF_HASQOS, &vcc->flags)) { error = -EBADFD; goto out; } vcc->local = *addr; set_bit(ATM_VF_WAITING, &vcc->flags); sigd_enq(vcc, as_bind, NULL, NULL, &vcc->local); for (;;) { prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE); if (!test_bit(ATM_VF_WAITING, &vcc->flags) || !sigd) break; schedule(); } finish_wait(sk_sleep(sk), &wait); clear_bit(ATM_VF_REGIS, &vcc->flags); /* doesn't count */ if (!sigd) { error = -EUNATCH; goto out; } if (!sk->sk_err) set_bit(ATM_VF_BOUND, &vcc->flags); error = -sk->sk_err; out: release_sock(sk); return error; } static int svc_connect(struct socket *sock, struct sockaddr *sockaddr, int sockaddr_len, int flags) { DEFINE_WAIT(wait); struct sock *sk = sock->sk; struct sockaddr_atmsvc *addr; struct atm_vcc *vcc = ATM_SD(sock); int error; pr_debug("%p\n", vcc); lock_sock(sk); if (sockaddr_len != sizeof(struct sockaddr_atmsvc)) { error = -EINVAL; goto out; } switch (sock->state) { default: error = -EINVAL; goto out; case SS_CONNECTED: error = -EISCONN; goto out; case SS_CONNECTING: if (test_bit(ATM_VF_WAITING, &vcc->flags)) { error = -EALREADY; goto out; } sock->state = SS_UNCONNECTED; if (sk->sk_err) { error = -sk->sk_err; goto 
out; } break; case SS_UNCONNECTED: addr = (struct sockaddr_atmsvc *) sockaddr; if (addr->sas_family != AF_ATMSVC) { error = -EAFNOSUPPORT; goto out; } if (!test_bit(ATM_VF_HASQOS, &vcc->flags)) { error = -EBADFD; goto out; } if (vcc->qos.txtp.traffic_class == ATM_ANYCLASS || vcc->qos.rxtp.traffic_class == ATM_ANYCLASS) { error = -EINVAL; goto out; } if (!vcc->qos.txtp.traffic_class && !vcc->qos.rxtp.traffic_class) { error = -EINVAL; goto out; } vcc->remote = *addr; set_bit(ATM_VF_WAITING, &vcc->flags); sigd_enq(vcc, as_connect, NULL, NULL, &vcc->remote); if (flags & O_NONBLOCK) { sock->state = SS_CONNECTING; error = -EINPROGRESS; goto out; } error = 0; prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); while (test_bit(ATM_VF_WAITING, &vcc->flags) && sigd) { schedule(); if (!signal_pending(current)) { prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); continue; } pr_debug("*ABORT*\n"); /* * This is tricky: * Kernel ---close--> Demon * Kernel <--close--- Demon * or * Kernel ---close--> Demon * Kernel <--error--- Demon * or * Kernel ---close--> Demon * Kernel <--okay---- Demon * Kernel <--close--- Demon */ sigd_enq(vcc, as_close, NULL, NULL, NULL); while (test_bit(ATM_VF_WAITING, &vcc->flags) && sigd) { prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); schedule(); } if (!sk->sk_err) while (!test_bit(ATM_VF_RELEASED, &vcc->flags) && sigd) { prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); schedule(); } clear_bit(ATM_VF_REGIS, &vcc->flags); clear_bit(ATM_VF_RELEASED, &vcc->flags); clear_bit(ATM_VF_CLOSE, &vcc->flags); /* we're gone now but may connect later */ error = -EINTR; break; } finish_wait(sk_sleep(sk), &wait); if (error) goto out; if (!sigd) { error = -EUNATCH; goto out; } if (sk->sk_err) { error = -sk->sk_err; goto out; } } vcc->qos.txtp.max_pcr = SELECT_TOP_PCR(vcc->qos.txtp); vcc->qos.txtp.pcr = 0; vcc->qos.txtp.min_pcr = 0; error = vcc_connect(sock, vcc->itf, vcc->vpi, vcc->vci); if (!error) sock->state = SS_CONNECTED; else (void)svc_disconnect(vcc); out: release_sock(sk); return error; } static int svc_listen(struct socket *sock, int backlog) { DEFINE_WAIT(wait); struct sock *sk = sock->sk; struct atm_vcc *vcc = ATM_SD(sock); int error; pr_debug("%p\n", vcc); lock_sock(sk); /* let server handle listen on unbound sockets */ if (test_bit(ATM_VF_SESSION, &vcc->flags)) { error = -EINVAL; goto out; } if (test_bit(ATM_VF_LISTEN, &vcc->flags)) { error = -EADDRINUSE; goto out; } set_bit(ATM_VF_WAITING, &vcc->flags); sigd_enq(vcc, as_listen, NULL, NULL, &vcc->local); for (;;) { prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE); if (!test_bit(ATM_VF_WAITING, &vcc->flags) || !sigd) break; schedule(); } finish_wait(sk_sleep(sk), &wait); if (!sigd) { error = -EUNATCH; goto out; } set_bit(ATM_VF_LISTEN, &vcc->flags); vcc_insert_socket(sk); sk->sk_max_ack_backlog = backlog > 0 ? 
backlog : ATM_BACKLOG_DEFAULT; error = -sk->sk_err; out: release_sock(sk); return error; } static int svc_accept(struct socket *sock, struct socket *newsock, struct proto_accept_arg *arg) { struct sock *sk = sock->sk; struct sk_buff *skb; struct atmsvc_msg *msg; struct atm_vcc *old_vcc = ATM_SD(sock); struct atm_vcc *new_vcc; int error; lock_sock(sk); error = svc_create(sock_net(sk), newsock, 0, arg->kern); if (error) goto out; new_vcc = ATM_SD(newsock); pr_debug("%p -> %p\n", old_vcc, new_vcc); while (1) { DEFINE_WAIT(wait); prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); while (!(skb = skb_dequeue(&sk->sk_receive_queue)) && sigd) { if (test_bit(ATM_VF_RELEASED, &old_vcc->flags)) break; if (test_bit(ATM_VF_CLOSE, &old_vcc->flags)) { error = -sk->sk_err; break; } if (arg->flags & O_NONBLOCK) { error = -EAGAIN; break; } release_sock(sk); schedule(); lock_sock(sk); if (signal_pending(current)) { error = -ERESTARTSYS; break; } prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); } finish_wait(sk_sleep(sk), &wait); if (error) goto out; if (!skb) { error = -EUNATCH; goto out; } msg = (struct atmsvc_msg *)skb->data; new_vcc->qos = msg->qos; set_bit(ATM_VF_HASQOS, &new_vcc->flags); new_vcc->remote = msg->svc; new_vcc->local = msg->local; new_vcc->sap = msg->sap; error = vcc_connect(newsock, msg->pvc.sap_addr.itf, msg->pvc.sap_addr.vpi, msg->pvc.sap_addr.vci); dev_kfree_skb(skb); sk_acceptq_removed(sk); if (error) { sigd_enq2(NULL, as_reject, old_vcc, NULL, NULL, &old_vcc->qos, error); error = error == -EAGAIN ? -EBUSY : error; goto out; } /* wait should be short, so we ignore the non-blocking flag */ set_bit(ATM_VF_WAITING, &new_vcc->flags); sigd_enq(new_vcc, as_accept, old_vcc, NULL, NULL); for (;;) { prepare_to_wait(sk_sleep(sk_atm(new_vcc)), &wait, TASK_UNINTERRUPTIBLE); if (!test_bit(ATM_VF_WAITING, &new_vcc->flags) || !sigd) break; release_sock(sk); schedule(); lock_sock(sk); } finish_wait(sk_sleep(sk_atm(new_vcc)), &wait); if (!sigd) { error = -EUNATCH; goto out; } if (!sk_atm(new_vcc)->sk_err) break; if (sk_atm(new_vcc)->sk_err != ERESTARTSYS) { error = -sk_atm(new_vcc)->sk_err; goto out; } } newsock->state = SS_CONNECTED; out: release_sock(sk); return error; } static int svc_getname(struct socket *sock, struct sockaddr *sockaddr, int peer) { struct sockaddr_atmsvc *addr; addr = (struct sockaddr_atmsvc *) sockaddr; memcpy(addr, peer ? 
&ATM_SD(sock)->remote : &ATM_SD(sock)->local, sizeof(struct sockaddr_atmsvc)); return sizeof(struct sockaddr_atmsvc); } int svc_change_qos(struct atm_vcc *vcc, struct atm_qos *qos) { struct sock *sk = sk_atm(vcc); DEFINE_WAIT(wait); set_bit(ATM_VF_WAITING, &vcc->flags); sigd_enq2(vcc, as_modify, NULL, NULL, &vcc->local, qos, 0); for (;;) { prepare_to_wait(sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE); if (!test_bit(ATM_VF_WAITING, &vcc->flags) || test_bit(ATM_VF_RELEASED, &vcc->flags) || !sigd) { break; } schedule(); } finish_wait(sk_sleep(sk), &wait); if (!sigd) return -EUNATCH; return -sk->sk_err; } static int svc_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct atm_vcc *vcc = ATM_SD(sock); int value, error = 0; lock_sock(sk); switch (optname) { case SO_ATMSAP: if (level != SOL_ATM || optlen != sizeof(struct atm_sap)) { error = -EINVAL; goto out; } if (copy_from_sockptr(&vcc->sap, optval, optlen)) { error = -EFAULT; goto out; } set_bit(ATM_VF_HASSAP, &vcc->flags); break; case SO_MULTIPOINT: if (level != SOL_ATM || optlen != sizeof(int)) { error = -EINVAL; goto out; } if (copy_from_sockptr(&value, optval, sizeof(int))) { error = -EFAULT; goto out; } if (value == 1) set_bit(ATM_VF_SESSION, &vcc->flags); else if (value == 0) clear_bit(ATM_VF_SESSION, &vcc->flags); else error = -EINVAL; break; default: error = vcc_setsockopt(sock, level, optname, optval, optlen); } out: release_sock(sk); return error; } static int svc_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; int error = 0, len; lock_sock(sk); if (!__SO_LEVEL_MATCH(optname, level) || optname != SO_ATMSAP) { error = vcc_getsockopt(sock, level, optname, optval, optlen); goto out; } if (get_user(len, optlen)) { error = -EFAULT; goto out; } if (len != sizeof(struct atm_sap)) { error = -EINVAL; goto out; } if (copy_to_user(optval, &ATM_SD(sock)->sap, sizeof(struct atm_sap))) { error = -EFAULT; goto out; } out: release_sock(sk); return error; } static int svc_addparty(struct socket *sock, struct sockaddr *sockaddr, int sockaddr_len, int flags) { DEFINE_WAIT(wait); struct sock *sk = sock->sk; struct atm_vcc *vcc = ATM_SD(sock); int error; lock_sock(sk); set_bit(ATM_VF_WAITING, &vcc->flags); sigd_enq(vcc, as_addparty, NULL, NULL, (struct sockaddr_atmsvc *) sockaddr); if (flags & O_NONBLOCK) { error = -EINPROGRESS; goto out; } pr_debug("added wait queue\n"); for (;;) { prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); if (!test_bit(ATM_VF_WAITING, &vcc->flags) || !sigd) break; schedule(); } finish_wait(sk_sleep(sk), &wait); error = -xchg(&sk->sk_err_soft, 0); out: release_sock(sk); return error; } static int svc_dropparty(struct socket *sock, int ep_ref) { DEFINE_WAIT(wait); struct sock *sk = sock->sk; struct atm_vcc *vcc = ATM_SD(sock); int error; lock_sock(sk); set_bit(ATM_VF_WAITING, &vcc->flags); sigd_enq2(vcc, as_dropparty, NULL, NULL, NULL, NULL, ep_ref); for (;;) { prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); if (!test_bit(ATM_VF_WAITING, &vcc->flags) || !sigd) break; schedule(); } finish_wait(sk_sleep(sk), &wait); if (!sigd) { error = -EUNATCH; goto out; } error = -xchg(&sk->sk_err_soft, 0); out: release_sock(sk); return error; } static int svc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { int error, ep_ref; struct sockaddr_atmsvc sa; struct atm_vcc *vcc = ATM_SD(sock); switch (cmd) { case ATM_ADDPARTY: if (!test_bit(ATM_VF_SESSION, 
&vcc->flags)) return -EINVAL; if (copy_from_user(&sa, (void __user *) arg, sizeof(sa))) return -EFAULT; error = svc_addparty(sock, (struct sockaddr *)&sa, sizeof(sa), 0); break; case ATM_DROPPARTY: if (!test_bit(ATM_VF_SESSION, &vcc->flags)) return -EINVAL; if (copy_from_user(&ep_ref, (void __user *) arg, sizeof(int))) return -EFAULT; error = svc_dropparty(sock, ep_ref); break; default: error = vcc_ioctl(sock, cmd, arg); } return error; } #ifdef CONFIG_COMPAT static int svc_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { /* The definition of ATM_ADDPARTY uses the size of struct atm_iobuf. But actually it takes a struct sockaddr_atmsvc, which doesn't need compat handling. So all we have to do is fix up cmd... */ if (cmd == COMPAT_ATM_ADDPARTY) cmd = ATM_ADDPARTY; if (cmd == ATM_ADDPARTY || cmd == ATM_DROPPARTY) return svc_ioctl(sock, cmd, arg); else return vcc_compat_ioctl(sock, cmd, arg); } #endif /* CONFIG_COMPAT */ static const struct proto_ops svc_proto_ops = { .family = PF_ATMSVC, .owner = THIS_MODULE, .release = svc_release, .bind = svc_bind, .connect = svc_connect, .socketpair = sock_no_socketpair, .accept = svc_accept, .getname = svc_getname, .poll = vcc_poll, .ioctl = svc_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = svc_compat_ioctl, #endif .gettstamp = sock_gettstamp, .listen = svc_listen, .shutdown = svc_shutdown, .setsockopt = svc_setsockopt, .getsockopt = svc_getsockopt, .sendmsg = vcc_sendmsg, .recvmsg = vcc_recvmsg, .mmap = sock_no_mmap, }; static int svc_create(struct net *net, struct socket *sock, int protocol, int kern) { int error; if (!net_eq(net, &init_net)) return -EAFNOSUPPORT; sock->ops = &svc_proto_ops; error = vcc_create(net, sock, protocol, AF_ATMSVC, kern); if (error) return error; ATM_SD(sock)->local.sas_family = AF_ATMSVC; ATM_SD(sock)->remote.sas_family = AF_ATMSVC; return 0; } static const struct net_proto_family svc_family_ops = { .family = PF_ATMSVC, .create = svc_create, .owner = THIS_MODULE, }; /* * Initialize the ATM SVC protocol family */ int __init atmsvc_init(void) { return sock_register(&svc_family_ops); } void atmsvc_exit(void) { sock_unregister(PF_ATMSVC); }
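/*
 * Illustrative userspace sketch, not part of this file: how the svc_create()/
 * svc_bind()/svc_connect() paths above are typically driven from user space.
 * It assumes the uapi definitions from <linux/atm.h> (struct sockaddr_atmsvc,
 * struct atm_qos, SOL_ATM, SO_ATMQOS); the NSAP bytes passed in are a
 * placeholder rather than a routable address, and error handling is minimal.
 */
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/atm.h>

static int open_svc(const unsigned char nsap[ATM_ESA_LEN])
{
	struct sockaddr_atmsvc addr;
	struct atm_qos qos;
	int fd = socket(AF_ATMSVC, SOCK_DGRAM, 0);

	if (fd < 0)
		return -1;

	/* svc_connect() rejects sockets without QoS (ATM_VF_HASQOS) with -EBADFD */
	memset(&qos, 0, sizeof(qos));
	qos.aal = ATM_AAL5;
	qos.txtp.traffic_class = ATM_UBR;
	qos.rxtp.traffic_class = ATM_UBR;
	if (setsockopt(fd, SOL_ATM, SO_ATMQOS, &qos, sizeof(qos)) < 0) {
		close(fd);
		return -1;
	}

	memset(&addr, 0, sizeof(addr));
	addr.sas_family = AF_ATMSVC;
	memcpy(addr.sas_addr.prv, nsap, ATM_ESA_LEN);

	/* blocks until atmsigd answers, mirroring the wait loop in svc_connect() */
	if (connect(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
		close(fd);
		return -1;
	}
	return fd;
}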
// SPDX-License-Identifier: GPL-2.0-or-later /* * * Copyright Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) * Copyright Darryl Miles G7LED (dlm@g7led.demon.co.uk) */ #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/kernel.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/slab.h> #include <net/ax25.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <net/sock.h> #include <linux/uaccess.h> #include <linux/fcntl.h> #include <linux/mm.h> #include <linux/interrupt.h> #include <net/netrom.h> /* * This is where all NET/ROM frames pass, except for IP-over-NET/ROM which * cannot be fragmented in this manner. */ void nr_output(struct sock *sk, struct sk_buff *skb) { struct sk_buff *skbn; unsigned char transport[NR_TRANSPORT_LEN]; int err, frontlen, len; if (skb->len - NR_TRANSPORT_LEN > NR_MAX_PACKET_SIZE) { /* Save a copy of the Transport Header */ skb_copy_from_linear_data(skb, transport, NR_TRANSPORT_LEN); skb_pull(skb, NR_TRANSPORT_LEN); frontlen = skb_headroom(skb); while (skb->len > 0) { if ((skbn = sock_alloc_send_skb(sk, frontlen + NR_MAX_PACKET_SIZE, 0, &err)) == NULL) return; skb_reserve(skbn, frontlen); len = (NR_MAX_PACKET_SIZE > skb->len) ? skb->len : NR_MAX_PACKET_SIZE; /* Copy the user data */ skb_copy_from_linear_data(skb, skb_put(skbn, len), len); skb_pull(skb, len); /* Duplicate the Transport Header */ skb_push(skbn, NR_TRANSPORT_LEN); skb_copy_to_linear_data(skbn, transport, NR_TRANSPORT_LEN); if (skb->len > 0) skbn->data[4] |= NR_MORE_FLAG; skb_queue_tail(&sk->sk_write_queue, skbn); /* Throw it on the queue */ } kfree_skb(skb); } else { skb_queue_tail(&sk->sk_write_queue, skb); /* Throw it on the queue */ } nr_kick(sk); } /* * This procedure is passed a buffer descriptor for an iframe. It builds * the rest of the control part of the frame and then writes it out. 
*/ static void nr_send_iframe(struct sock *sk, struct sk_buff *skb) { struct nr_sock *nr = nr_sk(sk); if (skb == NULL) return; skb->data[2] = nr->vs; skb->data[3] = nr->vr; if (nr->condition & NR_COND_OWN_RX_BUSY) skb->data[4] |= NR_CHOKE_FLAG; nr_start_idletimer(sk); nr_transmit_buffer(sk, skb); } void nr_send_nak_frame(struct sock *sk) { struct sk_buff *skb, *skbn; struct nr_sock *nr = nr_sk(sk); if ((skb = skb_peek(&nr->ack_queue)) == NULL) return; if ((skbn = skb_clone(skb, GFP_ATOMIC)) == NULL) return; skbn->data[2] = nr->va; skbn->data[3] = nr->vr; if (nr->condition & NR_COND_OWN_RX_BUSY) skbn->data[4] |= NR_CHOKE_FLAG; nr_transmit_buffer(sk, skbn); nr->condition &= ~NR_COND_ACK_PENDING; nr->vl = nr->vr; nr_stop_t1timer(sk); } void nr_kick(struct sock *sk) { struct nr_sock *nr = nr_sk(sk); struct sk_buff *skb, *skbn; unsigned short start, end; if (nr->state != NR_STATE_3) return; if (nr->condition & NR_COND_PEER_RX_BUSY) return; if (!skb_peek(&sk->sk_write_queue)) return; start = (skb_peek(&nr->ack_queue) == NULL) ? nr->va : nr->vs; end = (nr->va + nr->window) % NR_MODULUS; if (start == end) return; nr->vs = start; /* * Transmit data until either we're out of data to send or * the window is full. */ /* * Dequeue the frame and copy it. */ skb = skb_dequeue(&sk->sk_write_queue); do { if ((skbn = skb_clone(skb, GFP_ATOMIC)) == NULL) { skb_queue_head(&sk->sk_write_queue, skb); break; } skb_set_owner_w(skbn, sk); /* * Transmit the frame copy. */ nr_send_iframe(sk, skbn); nr->vs = (nr->vs + 1) % NR_MODULUS; /* * Requeue the original data frame. */ skb_queue_tail(&nr->ack_queue, skb); } while (nr->vs != end && (skb = skb_dequeue(&sk->sk_write_queue)) != NULL); nr->vl = nr->vr; nr->condition &= ~NR_COND_ACK_PENDING; if (!nr_t1timer_running(sk)) nr_start_t1timer(sk); } void nr_transmit_buffer(struct sock *sk, struct sk_buff *skb) { struct nr_sock *nr = nr_sk(sk); unsigned char *dptr; /* * Add the protocol byte and network header. */ dptr = skb_push(skb, NR_NETWORK_LEN); memcpy(dptr, &nr->source_addr, AX25_ADDR_LEN); dptr[6] &= ~AX25_CBIT; dptr[6] &= ~AX25_EBIT; dptr[6] |= AX25_SSSID_SPARE; dptr += AX25_ADDR_LEN; memcpy(dptr, &nr->dest_addr, AX25_ADDR_LEN); dptr[6] &= ~AX25_CBIT; dptr[6] |= AX25_EBIT; dptr[6] |= AX25_SSSID_SPARE; dptr += AX25_ADDR_LEN; *dptr++ = READ_ONCE(sysctl_netrom_network_ttl_initialiser); if (!nr_route_frame(skb, NULL)) { kfree_skb(skb); nr_disconnect(sk, ENETUNREACH); } } /* * The following routines are taken from page 170 of the 7th ARRL Computer * Networking Conference paper, as is the whole state machine. */ void nr_establish_data_link(struct sock *sk) { struct nr_sock *nr = nr_sk(sk); nr->condition = 0x00; nr->n2count = 0; nr_write_internal(sk, NR_CONNREQ); nr_stop_t2timer(sk); nr_stop_t4timer(sk); nr_stop_idletimer(sk); nr_start_t1timer(sk); } /* * Never send a NAK when we are CHOKEd. */ void nr_enquiry_response(struct sock *sk) { struct nr_sock *nr = nr_sk(sk); int frametype = NR_INFOACK; if (nr->condition & NR_COND_OWN_RX_BUSY) { frametype |= NR_CHOKE_FLAG; } else { if (skb_peek(&nr->reseq_queue) != NULL) frametype |= NR_NAK_FLAG; } nr_write_internal(sk, frametype); nr->vl = nr->vr; nr->condition &= ~NR_COND_ACK_PENDING; } void nr_check_iframes_acked(struct sock *sk, unsigned short nr) { struct nr_sock *nrom = nr_sk(sk); if (nrom->vs == nr) { nr_frames_acked(sk, nr); nr_stop_t1timer(sk); nrom->n2count = 0; } else { if (nrom->va != nr) { nr_frames_acked(sk, nr); nr_start_t1timer(sk); } } }
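/*
 * Illustrative sketch, not part of this file: the modulo-NR_MODULUS window
 * arithmetic that nr_kick() above depends on.  nr_kick() stops transmitting
 * once the next send sequence number reaches (va + window) mod NR_MODULUS;
 * the helpers below, which rely on the struct nr_sock and NR_MODULUS
 * definitions already pulled in via <net/netrom.h>, express the same
 * occupancy test explicitly.
 */
static inline unsigned short nr_frames_outstanding(unsigned short vs,
						   unsigned short va)
{
	/* send and acknowledge state variables wrap at NR_MODULUS (256) */
	return (vs - va + NR_MODULUS) % NR_MODULUS;
}

static inline int nr_window_full(const struct nr_sock *nr)
{
	/* true once a full window of I-frames is awaiting acknowledgement */
	return nr_frames_outstanding(nr->vs, nr->va) >= nr->window;
}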
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PARAVIRT_H #define _ASM_X86_PARAVIRT_H /* Various instructions on x86 need to be replaced for * para-virtualization: those hooks are defined here. 
*/ #include <asm/paravirt_types.h> #ifndef __ASSEMBLY__ struct mm_struct; #endif #ifdef CONFIG_PARAVIRT #include <asm/pgtable_types.h> #include <asm/asm.h> #include <asm/nospec-branch.h> #ifndef __ASSEMBLY__ #include <linux/bug.h> #include <linux/types.h> #include <linux/cpumask.h> #include <linux/static_call_types.h> #include <asm/frame.h> u64 dummy_steal_clock(int cpu); u64 dummy_sched_clock(void); DECLARE_STATIC_CALL(pv_steal_clock, dummy_steal_clock); DECLARE_STATIC_CALL(pv_sched_clock, dummy_sched_clock); void paravirt_set_sched_clock(u64 (*func)(void)); static __always_inline u64 paravirt_sched_clock(void) { return static_call(pv_sched_clock)(); } struct static_key; extern struct static_key paravirt_steal_enabled; extern struct static_key paravirt_steal_rq_enabled; __visible void __native_queued_spin_unlock(struct qspinlock *lock); bool pv_is_native_spin_unlock(void); __visible bool __native_vcpu_is_preempted(long cpu); bool pv_is_native_vcpu_is_preempted(void); static inline u64 paravirt_steal_clock(int cpu) { return static_call(pv_steal_clock)(cpu); } #ifdef CONFIG_PARAVIRT_SPINLOCKS void __init paravirt_set_cap(void); #endif /* The paravirtualized I/O functions */ static inline void slow_down_io(void) { PVOP_VCALL0(cpu.io_delay); #ifdef REALLY_SLOW_IO PVOP_VCALL0(cpu.io_delay); PVOP_VCALL0(cpu.io_delay); PVOP_VCALL0(cpu.io_delay); #endif } void native_flush_tlb_local(void); void native_flush_tlb_global(void); void native_flush_tlb_one_user(unsigned long addr); void native_flush_tlb_multi(const struct cpumask *cpumask, const struct flush_tlb_info *info); static inline void __flush_tlb_local(void) { PVOP_VCALL0(mmu.flush_tlb_user); } static inline void __flush_tlb_global(void) { PVOP_VCALL0(mmu.flush_tlb_kernel); } static inline void __flush_tlb_one_user(unsigned long addr) { PVOP_VCALL1(mmu.flush_tlb_one_user, addr); } static inline void __flush_tlb_multi(const struct cpumask *cpumask, const struct flush_tlb_info *info) { PVOP_VCALL2(mmu.flush_tlb_multi, cpumask, info); } static inline void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table) { PVOP_VCALL2(mmu.tlb_remove_table, tlb, table); } static inline void paravirt_arch_exit_mmap(struct mm_struct *mm) { PVOP_VCALL1(mmu.exit_mmap, mm); } static inline void notify_page_enc_status_changed(unsigned long pfn, int npages, bool enc) { PVOP_VCALL3(mmu.notify_page_enc_status_changed, pfn, npages, enc); } #ifdef CONFIG_PARAVIRT_XXL static inline void load_sp0(unsigned long sp0) { PVOP_VCALL1(cpu.load_sp0, sp0); } /* The paravirtualized CPUID instruction. 
*/ static inline void __cpuid(unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) { PVOP_VCALL4(cpu.cpuid, eax, ebx, ecx, edx); } /* * These special macros can be used to get or set a debugging register */ static __always_inline unsigned long paravirt_get_debugreg(int reg) { return PVOP_CALL1(unsigned long, cpu.get_debugreg, reg); } #define get_debugreg(var, reg) var = paravirt_get_debugreg(reg) static __always_inline void set_debugreg(unsigned long val, int reg) { PVOP_VCALL2(cpu.set_debugreg, reg, val); } static inline unsigned long read_cr0(void) { return PVOP_CALL0(unsigned long, cpu.read_cr0); } static inline void write_cr0(unsigned long x) { PVOP_VCALL1(cpu.write_cr0, x); } static __always_inline unsigned long read_cr2(void) { return PVOP_ALT_CALLEE0(unsigned long, mmu.read_cr2, "mov %%cr2, %%rax;", ALT_NOT_XEN); } static __always_inline void write_cr2(unsigned long x) { PVOP_VCALL1(mmu.write_cr2, x); } static inline unsigned long __read_cr3(void) { return PVOP_ALT_CALL0(unsigned long, mmu.read_cr3, "mov %%cr3, %%rax;", ALT_NOT_XEN); } static inline void write_cr3(unsigned long x) { PVOP_ALT_VCALL1(mmu.write_cr3, x, "mov %%rdi, %%cr3", ALT_NOT_XEN); } static inline void __write_cr4(unsigned long x) { PVOP_VCALL1(cpu.write_cr4, x); } static __always_inline void arch_safe_halt(void) { PVOP_VCALL0(irq.safe_halt); } static inline void halt(void) { PVOP_VCALL0(irq.halt); } extern noinstr void pv_native_wbinvd(void); static __always_inline void wbinvd(void) { PVOP_ALT_VCALL0(cpu.wbinvd, "wbinvd", ALT_NOT_XEN); } static inline u64 paravirt_read_msr(unsigned msr) { return PVOP_CALL1(u64, cpu.read_msr, msr); } static inline void paravirt_write_msr(unsigned msr, unsigned low, unsigned high) { PVOP_VCALL3(cpu.write_msr, msr, low, high); } static inline u64 paravirt_read_msr_safe(unsigned msr, int *err) { return PVOP_CALL2(u64, cpu.read_msr_safe, msr, err); } static inline int paravirt_write_msr_safe(unsigned msr, unsigned low, unsigned high) { return PVOP_CALL3(int, cpu.write_msr_safe, msr, low, high); } #define rdmsr(msr, val1, val2) \ do { \ u64 _l = paravirt_read_msr(msr); \ val1 = (u32)_l; \ val2 = _l >> 32; \ } while (0) #define wrmsr(msr, val1, val2) \ do { \ paravirt_write_msr(msr, val1, val2); \ } while (0) #define rdmsrl(msr, val) \ do { \ val = paravirt_read_msr(msr); \ } while (0) static inline void wrmsrl(unsigned msr, u64 val) { wrmsr(msr, (u32)val, (u32)(val>>32)); } #define wrmsr_safe(msr, a, b) paravirt_write_msr_safe(msr, a, b) /* rdmsr with exception handling */ #define rdmsr_safe(msr, a, b) \ ({ \ int _err; \ u64 _l = paravirt_read_msr_safe(msr, &_err); \ (*a) = (u32)_l; \ (*b) = _l >> 32; \ _err; \ }) static inline int rdmsrl_safe(unsigned msr, unsigned long long *p) { int err; *p = paravirt_read_msr_safe(msr, &err); return err; } static inline unsigned long long paravirt_read_pmc(int counter) { return PVOP_CALL1(u64, cpu.read_pmc, counter); } #define rdpmc(counter, low, high) \ do { \ u64 _l = paravirt_read_pmc(counter); \ low = (u32)_l; \ high = _l >> 32; \ } while (0) #define rdpmcl(counter, val) ((val) = paravirt_read_pmc(counter)) static inline void paravirt_alloc_ldt(struct desc_struct *ldt, unsigned entries) { PVOP_VCALL2(cpu.alloc_ldt, ldt, entries); } static inline void paravirt_free_ldt(struct desc_struct *ldt, unsigned entries) { PVOP_VCALL2(cpu.free_ldt, ldt, entries); } static inline void load_TR_desc(void) { PVOP_VCALL0(cpu.load_tr_desc); } static inline void load_gdt(const struct desc_ptr *dtr) { PVOP_VCALL1(cpu.load_gdt, dtr); } static 
inline void load_idt(const struct desc_ptr *dtr) { PVOP_VCALL1(cpu.load_idt, dtr); } static inline void set_ldt(const void *addr, unsigned entries) { PVOP_VCALL2(cpu.set_ldt, addr, entries); } static inline unsigned long paravirt_store_tr(void) { return PVOP_CALL0(unsigned long, cpu.store_tr); } #define store_tr(tr) ((tr) = paravirt_store_tr()) static inline void load_TLS(struct thread_struct *t, unsigned cpu) { PVOP_VCALL2(cpu.load_tls, t, cpu); } static inline void load_gs_index(unsigned int gs) { PVOP_VCALL1(cpu.load_gs_index, gs); } static inline void write_ldt_entry(struct desc_struct *dt, int entry, const void *desc) { PVOP_VCALL3(cpu.write_ldt_entry, dt, entry, desc); } static inline void write_gdt_entry(struct desc_struct *dt, int entry, void *desc, int type) { PVOP_VCALL4(cpu.write_gdt_entry, dt, entry, desc, type); } static inline void write_idt_entry(gate_desc *dt, int entry, const gate_desc *g) { PVOP_VCALL3(cpu.write_idt_entry, dt, entry, g); } #ifdef CONFIG_X86_IOPL_IOPERM static inline void tss_invalidate_io_bitmap(void) { PVOP_VCALL0(cpu.invalidate_io_bitmap); } static inline void tss_update_io_bitmap(void) { PVOP_VCALL0(cpu.update_io_bitmap); } #endif static inline void paravirt_enter_mmap(struct mm_struct *next) { PVOP_VCALL1(mmu.enter_mmap, next); } static inline int paravirt_pgd_alloc(struct mm_struct *mm) { return PVOP_CALL1(int, mmu.pgd_alloc, mm); } static inline void paravirt_pgd_free(struct mm_struct *mm, pgd_t *pgd) { PVOP_VCALL2(mmu.pgd_free, mm, pgd); } static inline void paravirt_alloc_pte(struct mm_struct *mm, unsigned long pfn) { PVOP_VCALL2(mmu.alloc_pte, mm, pfn); } static inline void paravirt_release_pte(unsigned long pfn) { PVOP_VCALL1(mmu.release_pte, pfn); } static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn) { PVOP_VCALL2(mmu.alloc_pmd, mm, pfn); } static inline void paravirt_release_pmd(unsigned long pfn) { PVOP_VCALL1(mmu.release_pmd, pfn); } static inline void paravirt_alloc_pud(struct mm_struct *mm, unsigned long pfn) { PVOP_VCALL2(mmu.alloc_pud, mm, pfn); } static inline void paravirt_release_pud(unsigned long pfn) { PVOP_VCALL1(mmu.release_pud, pfn); } static inline void paravirt_alloc_p4d(struct mm_struct *mm, unsigned long pfn) { PVOP_VCALL2(mmu.alloc_p4d, mm, pfn); } static inline void paravirt_release_p4d(unsigned long pfn) { PVOP_VCALL1(mmu.release_p4d, pfn); } static inline pte_t __pte(pteval_t val) { return (pte_t) { PVOP_ALT_CALLEE1(pteval_t, mmu.make_pte, val, "mov %%rdi, %%rax", ALT_NOT_XEN) }; } static inline pteval_t pte_val(pte_t pte) { return PVOP_ALT_CALLEE1(pteval_t, mmu.pte_val, pte.pte, "mov %%rdi, %%rax", ALT_NOT_XEN); } static inline pgd_t __pgd(pgdval_t val) { return (pgd_t) { PVOP_ALT_CALLEE1(pgdval_t, mmu.make_pgd, val, "mov %%rdi, %%rax", ALT_NOT_XEN) }; } static inline pgdval_t pgd_val(pgd_t pgd) { return PVOP_ALT_CALLEE1(pgdval_t, mmu.pgd_val, pgd.pgd, "mov %%rdi, %%rax", ALT_NOT_XEN); } #define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION static inline pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { pteval_t ret; ret = PVOP_CALL3(pteval_t, mmu.ptep_modify_prot_start, vma, addr, ptep); return (pte_t) { .pte = ret }; } static inline void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t old_pte, pte_t pte) { PVOP_VCALL4(mmu.ptep_modify_prot_commit, vma, addr, ptep, pte.pte); } static inline void set_pte(pte_t *ptep, pte_t pte) { PVOP_VCALL2(mmu.set_pte, ptep, pte.pte); } static inline void set_pmd(pmd_t *pmdp, pmd_t 
pmd) { PVOP_VCALL2(mmu.set_pmd, pmdp, native_pmd_val(pmd)); } static inline pmd_t __pmd(pmdval_t val) { return (pmd_t) { PVOP_ALT_CALLEE1(pmdval_t, mmu.make_pmd, val, "mov %%rdi, %%rax", ALT_NOT_XEN) }; } static inline pmdval_t pmd_val(pmd_t pmd) { return PVOP_ALT_CALLEE1(pmdval_t, mmu.pmd_val, pmd.pmd, "mov %%rdi, %%rax", ALT_NOT_XEN); } static inline void set_pud(pud_t *pudp, pud_t pud) { PVOP_VCALL2(mmu.set_pud, pudp, native_pud_val(pud)); } static inline pud_t __pud(pudval_t val) { pudval_t ret; ret = PVOP_ALT_CALLEE1(pudval_t, mmu.make_pud, val, "mov %%rdi, %%rax", ALT_NOT_XEN); return (pud_t) { ret }; } static inline pudval_t pud_val(pud_t pud) { return PVOP_ALT_CALLEE1(pudval_t, mmu.pud_val, pud.pud, "mov %%rdi, %%rax", ALT_NOT_XEN); } static inline void pud_clear(pud_t *pudp) { set_pud(pudp, native_make_pud(0)); } static inline void set_p4d(p4d_t *p4dp, p4d_t p4d) { p4dval_t val = native_p4d_val(p4d); PVOP_VCALL2(mmu.set_p4d, p4dp, val); } #if CONFIG_PGTABLE_LEVELS >= 5 static inline p4d_t __p4d(p4dval_t val) { p4dval_t ret = PVOP_ALT_CALLEE1(p4dval_t, mmu.make_p4d, val, "mov %%rdi, %%rax", ALT_NOT_XEN); return (p4d_t) { ret }; } static inline p4dval_t p4d_val(p4d_t p4d) { return PVOP_ALT_CALLEE1(p4dval_t, mmu.p4d_val, p4d.p4d, "mov %%rdi, %%rax", ALT_NOT_XEN); } static inline void __set_pgd(pgd_t *pgdp, pgd_t pgd) { PVOP_VCALL2(mmu.set_pgd, pgdp, native_pgd_val(pgd)); } #define set_pgd(pgdp, pgdval) do { \ if (pgtable_l5_enabled()) \ __set_pgd(pgdp, pgdval); \ else \ set_p4d((p4d_t *)(pgdp), (p4d_t) { (pgdval).pgd }); \ } while (0) #define pgd_clear(pgdp) do { \ if (pgtable_l5_enabled()) \ set_pgd(pgdp, native_make_pgd(0)); \ } while (0) #endif /* CONFIG_PGTABLE_LEVELS == 5 */ static inline void p4d_clear(p4d_t *p4dp) { set_p4d(p4dp, native_make_p4d(0)); } static inline void set_pte_atomic(pte_t *ptep, pte_t pte) { set_pte(ptep, pte); } static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { set_pte(ptep, native_make_pte(0)); } static inline void pmd_clear(pmd_t *pmdp) { set_pmd(pmdp, native_make_pmd(0)); } #define __HAVE_ARCH_START_CONTEXT_SWITCH static inline void arch_start_context_switch(struct task_struct *prev) { PVOP_VCALL1(cpu.start_context_switch, prev); } static inline void arch_end_context_switch(struct task_struct *next) { PVOP_VCALL1(cpu.end_context_switch, next); } #define __HAVE_ARCH_ENTER_LAZY_MMU_MODE static inline void arch_enter_lazy_mmu_mode(void) { PVOP_VCALL0(mmu.lazy_mode.enter); } static inline void arch_leave_lazy_mmu_mode(void) { PVOP_VCALL0(mmu.lazy_mode.leave); } static inline void arch_flush_lazy_mmu_mode(void) { PVOP_VCALL0(mmu.lazy_mode.flush); } static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx, phys_addr_t phys, pgprot_t flags) { pv_ops.mmu.set_fixmap(idx, phys, flags); } #endif #if defined(CONFIG_SMP) && defined(CONFIG_PARAVIRT_SPINLOCKS) static __always_inline void pv_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) { PVOP_VCALL2(lock.queued_spin_lock_slowpath, lock, val); } static __always_inline void pv_queued_spin_unlock(struct qspinlock *lock) { PVOP_ALT_VCALLEE1(lock.queued_spin_unlock, lock, "movb $0, (%%" _ASM_ARG1 ");", ALT_NOT(X86_FEATURE_PVUNLOCK)); } static __always_inline void pv_wait(u8 *ptr, u8 val) { PVOP_VCALL2(lock.wait, ptr, val); } static __always_inline void pv_kick(int cpu) { PVOP_VCALL1(lock.kick, cpu); } static __always_inline bool pv_vcpu_is_preempted(long cpu) { return PVOP_ALT_CALLEE1(bool, lock.vcpu_is_preempted, cpu, "xor %%" _ASM_AX ", %%" _ASM_AX ";", 
ALT_NOT(X86_FEATURE_VCPUPREEMPT)); } void __raw_callee_save___native_queued_spin_unlock(struct qspinlock *lock); bool __raw_callee_save___native_vcpu_is_preempted(long cpu); #endif /* SMP && PARAVIRT_SPINLOCKS */ #ifdef CONFIG_X86_32 /* save and restore all caller-save registers, except return value */ #define PV_SAVE_ALL_CALLER_REGS "pushl %ecx;" #define PV_RESTORE_ALL_CALLER_REGS "popl %ecx;" #else /* save and restore all caller-save registers, except return value */ #define PV_SAVE_ALL_CALLER_REGS \ "push %rcx;" \ "push %rdx;" \ "push %rsi;" \ "push %rdi;" \ "push %r8;" \ "push %r9;" \ "push %r10;" \ "push %r11;" #define PV_RESTORE_ALL_CALLER_REGS \ "pop %r11;" \ "pop %r10;" \ "pop %r9;" \ "pop %r8;" \ "pop %rdi;" \ "pop %rsi;" \ "pop %rdx;" \ "pop %rcx;" #endif /* * Generate a thunk around a function which saves all caller-save * registers except for the return value. This allows C functions to * be called from assembler code where fewer than normal registers are * available. It may also help code generation around calls from C * code if the common case doesn't use many registers. * * When a callee is wrapped in a thunk, the caller can assume that all * arg regs and all scratch registers are preserved across the * call. The return value in rax/eax will not be saved, even for void * functions. */ #define PV_THUNK_NAME(func) "__raw_callee_save_" #func #define __PV_CALLEE_SAVE_REGS_THUNK(func, section) \ extern typeof(func) __raw_callee_save_##func; \ \ asm(".pushsection " section ", \"ax\";" \ ".globl " PV_THUNK_NAME(func) ";" \ ".type " PV_THUNK_NAME(func) ", @function;" \ ASM_FUNC_ALIGN \ PV_THUNK_NAME(func) ":" \ ASM_ENDBR \ FRAME_BEGIN \ PV_SAVE_ALL_CALLER_REGS \ "call " #func ";" \ PV_RESTORE_ALL_CALLER_REGS \ FRAME_END \ ASM_RET \ ".size " PV_THUNK_NAME(func) ", .-" PV_THUNK_NAME(func) ";" \ ".popsection") #define PV_CALLEE_SAVE_REGS_THUNK(func) \ __PV_CALLEE_SAVE_REGS_THUNK(func, ".text") /* Get a reference to a callee-save function */ #define PV_CALLEE_SAVE(func) \ ((struct paravirt_callee_save) { __raw_callee_save_##func }) /* Promise that "func" already uses the right calling convention */ #define __PV_IS_CALLEE_SAVE(func) \ ((struct paravirt_callee_save) { func }) #ifdef CONFIG_PARAVIRT_XXL static __always_inline unsigned long arch_local_save_flags(void) { return PVOP_ALT_CALLEE0(unsigned long, irq.save_fl, "pushf; pop %%rax;", ALT_NOT_XEN); } static __always_inline void arch_local_irq_disable(void) { PVOP_ALT_VCALLEE0(irq.irq_disable, "cli;", ALT_NOT_XEN); } static __always_inline void arch_local_irq_enable(void) { PVOP_ALT_VCALLEE0(irq.irq_enable, "sti;", ALT_NOT_XEN); } static __always_inline unsigned long arch_local_irq_save(void) { unsigned long f; f = arch_local_save_flags(); arch_local_irq_disable(); return f; } #endif /* Make sure as little as possible of this mess escapes. 
*/ #undef PARAVIRT_CALL #undef __PVOP_CALL #undef __PVOP_VCALL #undef PVOP_VCALL0 #undef PVOP_CALL0 #undef PVOP_VCALL1 #undef PVOP_CALL1 #undef PVOP_VCALL2 #undef PVOP_CALL2 #undef PVOP_VCALL3 #undef PVOP_CALL3 #undef PVOP_VCALL4 #undef PVOP_CALL4 extern void default_banner(void); void native_pv_lock_init(void) __init; #else /* __ASSEMBLY__ */ #ifdef CONFIG_X86_64 #ifdef CONFIG_PARAVIRT_XXL #ifdef CONFIG_DEBUG_ENTRY #define PARA_INDIRECT(addr) *addr(%rip) .macro PARA_IRQ_save_fl ANNOTATE_RETPOLINE_SAFE; call PARA_INDIRECT(pv_ops+PV_IRQ_save_fl); .endm #define SAVE_FLAGS ALTERNATIVE_2 "PARA_IRQ_save_fl;", \ "ALT_CALL_INSTR;", ALT_CALL_ALWAYS, \ "pushf; pop %rax;", ALT_NOT_XEN #endif #endif /* CONFIG_PARAVIRT_XXL */ #endif /* CONFIG_X86_64 */ #endif /* __ASSEMBLY__ */ #else /* CONFIG_PARAVIRT */ # define default_banner x86_init_noop #ifndef __ASSEMBLY__ static inline void native_pv_lock_init(void) { } #endif #endif /* !CONFIG_PARAVIRT */ #ifndef __ASSEMBLY__ #ifndef CONFIG_PARAVIRT_XXL static inline void paravirt_enter_mmap(struct mm_struct *mm) { } #endif #ifndef CONFIG_PARAVIRT static inline void paravirt_arch_exit_mmap(struct mm_struct *mm) { } #endif #ifndef CONFIG_PARAVIRT_SPINLOCKS static inline void paravirt_set_cap(void) { } #endif #endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_PARAVIRT_H */
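/*
 * Illustrative sketch, not part of this header: how a guest platform wraps a
 * C helper with the callee-save thunk machinery above and installs it as a
 * paravirt op.  The function and init-routine names are made up for the
 * example; KVM and Xen follow the same pattern, and the lock.vcpu_is_preempted
 * slot only exists with CONFIG_SMP and CONFIG_PARAVIRT_SPINLOCKS enabled.
 */
#include <asm/paravirt.h>

__visible bool my_vcpu_is_preempted(long cpu)
{
	/* a real implementation would consult state shared with the hypervisor */
	return false;
}
PV_CALLEE_SAVE_REGS_THUNK(my_vcpu_is_preempted);

static void __init my_guest_spinlock_init(void)
{
	/* hand the callee-save wrapper to the paravirt spinlock code */
	pv_ops.lock.vcpu_is_preempted = PV_CALLEE_SAVE(my_vcpu_is_preempted);
}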
/* SPDX-License-Identifier: GPL-2.0+ */ #ifndef _LINUX_XARRAY_H #define _LINUX_XARRAY_H /* * eXtensible Arrays * Copyright (c) 2017 Microsoft Corporation * Author: Matthew Wilcox <willy@infradead.org> * * See Documentation/core-api/xarray.rst for how to use the XArray. */ #include <linux/bitmap.h> #include <linux/bug.h> #include <linux/compiler.h> #include <linux/err.h> #include <linux/gfp.h> #include <linux/kconfig.h> #include <linux/limits.h> #include <linux/lockdep.h> #include <linux/rcupdate.h> #include <linux/sched/mm.h> #include <linux/spinlock.h> #include <linux/types.h> struct list_lru; /* * The bottom two bits of the entry determine how the XArray interprets * the contents: * * 00: Pointer entry * 10: Internal entry * x1: Value entry or tagged pointer * * Attempting to store internal entries in the XArray is a bug. * * Most internal entries are pointers to the next node in the tree. * The following internal entries have a special meaning: * * 0-62: Sibling entries * 256: Retry entry * 257: Zero entry * * Errors are also represented as internal entries, but use the negative * space (-4094 to -2). They're never stored in the slots array; only * returned by the normal API. */ #define BITS_PER_XA_VALUE (BITS_PER_LONG - 1) /** * xa_mk_value() - Create an XArray entry from an integer. * @v: Value to store in XArray. * * Context: Any context. * Return: An entry suitable for storing in the XArray. */ static inline void *xa_mk_value(unsigned long v) { WARN_ON((long)v < 0); return (void *)((v << 1) | 1); } /** * xa_to_value() - Get value stored in an XArray entry. * @entry: XArray entry. * * Context: Any context. * Return: The value stored in the XArray entry. */ static inline unsigned long xa_to_value(const void *entry) { return (unsigned long)entry >> 1; } /** * xa_is_value() - Determine if an entry is a value. * @entry: XArray entry. * * Context: Any context. * Return: True if the entry is a value, false if it is a pointer. */ static inline bool xa_is_value(const void *entry) { return (unsigned long)entry & 1; } /** * xa_tag_pointer() - Create an XArray entry for a tagged pointer. * @p: Plain pointer. * @tag: Tag value (0, 1 or 3). 
* * If the user of the XArray prefers, they can tag their pointers instead * of storing value entries. Three tags are available (0, 1 and 3). * These are distinct from the xa_mark_t as they are not replicated up * through the array and cannot be searched for. * * Context: Any context. * Return: An XArray entry. */ static inline void *xa_tag_pointer(void *p, unsigned long tag) { return (void *)((unsigned long)p | tag); } /** * xa_untag_pointer() - Turn an XArray entry into a plain pointer. * @entry: XArray entry. * * If you have stored a tagged pointer in the XArray, call this function * to get the untagged version of the pointer. * * Context: Any context. * Return: A pointer. */ static inline void *xa_untag_pointer(void *entry) { return (void *)((unsigned long)entry & ~3UL); } /** * xa_pointer_tag() - Get the tag stored in an XArray entry. * @entry: XArray entry. * * If you have stored a tagged pointer in the XArray, call this function * to get the tag of that pointer. * * Context: Any context. * Return: A tag. */ static inline unsigned int xa_pointer_tag(void *entry) { return (unsigned long)entry & 3UL; } /* * xa_mk_internal() - Create an internal entry. * @v: Value to turn into an internal entry. * * Internal entries are used for a number of purposes. Entries 0-255 are * used for sibling entries (only 0-62 are used by the current code). 256 * is used for the retry entry. 257 is used for the reserved / zero entry. * Negative internal entries are used to represent errnos. Node pointers * are also tagged as internal entries in some situations. * * Context: Any context. * Return: An XArray internal entry corresponding to this value. */ static inline void *xa_mk_internal(unsigned long v) { return (void *)((v << 2) | 2); } /* * xa_to_internal() - Extract the value from an internal entry. * @entry: XArray entry. * * Context: Any context. * Return: The value which was stored in the internal entry. */ static inline unsigned long xa_to_internal(const void *entry) { return (unsigned long)entry >> 2; } /* * xa_is_internal() - Is the entry an internal entry? * @entry: XArray entry. * * Context: Any context. * Return: %true if the entry is an internal entry. */ static inline bool xa_is_internal(const void *entry) { return ((unsigned long)entry & 3) == 2; } #define XA_ZERO_ENTRY xa_mk_internal(257) /** * xa_is_zero() - Is the entry a zero entry? * @entry: Entry retrieved from the XArray * * The normal API will return NULL as the contents of a slot containing * a zero entry. You can only see zero entries by using the advanced API. * * Return: %true if the entry is a zero entry. */ static inline bool xa_is_zero(const void *entry) { return unlikely(entry == XA_ZERO_ENTRY); } /** * xa_is_err() - Report whether an XArray operation returned an error * @entry: Result from calling an XArray function * * If an XArray operation cannot complete an operation, it will return * a special value indicating an error. This function tells you * whether an error occurred; xa_err() tells you which error occurred. * * Context: Any context. * Return: %true if the entry indicates an error. */ static inline bool xa_is_err(const void *entry) { return unlikely(xa_is_internal(entry) && entry >= xa_mk_internal(-MAX_ERRNO)); } /** * xa_err() - Turn an XArray result into an errno. * @entry: Result from calling an XArray function. * * If an XArray operation cannot complete an operation, it will return * a special pointer value which encodes an errno. 
This function extracts * the errno from the pointer value, or returns 0 if the pointer does not * represent an errno. * * Context: Any context. * Return: A negative errno or 0. */ static inline int xa_err(void *entry) { /* xa_to_internal() would not do sign extension. */ if (xa_is_err(entry)) return (long)entry >> 2; return 0; } /** * struct xa_limit - Represents a range of IDs. * @min: The lowest ID to allocate (inclusive). * @max: The maximum ID to allocate (inclusive). * * This structure is used either directly or via the XA_LIMIT() macro * to communicate the range of IDs that are valid for allocation. * Three common ranges are predefined for you: * * xa_limit_32b - [0 - UINT_MAX] * * xa_limit_31b - [0 - INT_MAX] * * xa_limit_16b - [0 - USHRT_MAX] */ struct xa_limit { u32 max; u32 min; }; #define XA_LIMIT(_min, _max) (struct xa_limit) { .min = _min, .max = _max } #define xa_limit_32b XA_LIMIT(0, UINT_MAX) #define xa_limit_31b XA_LIMIT(0, INT_MAX) #define xa_limit_16b XA_LIMIT(0, USHRT_MAX) typedef unsigned __bitwise xa_mark_t; #define XA_MARK_0 ((__force xa_mark_t)0U) #define XA_MARK_1 ((__force xa_mark_t)1U) #define XA_MARK_2 ((__force xa_mark_t)2U) #define XA_PRESENT ((__force xa_mark_t)8U) #define XA_MARK_MAX XA_MARK_2 #define XA_FREE_MARK XA_MARK_0 enum xa_lock_type { XA_LOCK_IRQ = 1, XA_LOCK_BH = 2, }; /* * Values for xa_flags. The radix tree stores its GFP flags in the xa_flags, * and we remain compatible with that. */ #define XA_FLAGS_LOCK_IRQ ((__force gfp_t)XA_LOCK_IRQ) #define XA_FLAGS_LOCK_BH ((__force gfp_t)XA_LOCK_BH) #define XA_FLAGS_TRACK_FREE ((__force gfp_t)4U) #define XA_FLAGS_ZERO_BUSY ((__force gfp_t)8U) #define XA_FLAGS_ALLOC_WRAPPED ((__force gfp_t)16U) #define XA_FLAGS_ACCOUNT ((__force gfp_t)32U) #define XA_FLAGS_MARK(mark) ((__force gfp_t)((1U << __GFP_BITS_SHIFT) << \ (__force unsigned)(mark))) /* ALLOC is for a normal 0-based alloc. ALLOC1 is for an 1-based alloc */ #define XA_FLAGS_ALLOC (XA_FLAGS_TRACK_FREE | XA_FLAGS_MARK(XA_FREE_MARK)) #define XA_FLAGS_ALLOC1 (XA_FLAGS_TRACK_FREE | XA_FLAGS_ZERO_BUSY) /** * struct xarray - The anchor of the XArray. * @xa_lock: Lock that protects the contents of the XArray. * * To use the xarray, define it statically or embed it in your data structure. * It is a very small data structure, so it does not usually make sense to * allocate it separately and keep a pointer to it in your data structure. * * You may use the xa_lock to protect your own data structures as well. */ /* * If all of the entries in the array are NULL, @xa_head is a NULL pointer. * If the only non-NULL entry in the array is at index 0, @xa_head is that * entry. If any other entry in the array is non-NULL, @xa_head points * to an @xa_node. */ struct xarray { spinlock_t xa_lock; /* private: The rest of the data structure is not to be used directly. */ gfp_t xa_flags; void __rcu * xa_head; }; #define XARRAY_INIT(name, flags) { \ .xa_lock = __SPIN_LOCK_UNLOCKED(name.xa_lock), \ .xa_flags = flags, \ .xa_head = NULL, \ } /** * DEFINE_XARRAY_FLAGS() - Define an XArray with custom flags. * @name: A string that names your XArray. * @flags: XA_FLAG values. * * This is intended for file scope definitions of XArrays. It declares * and initialises an empty XArray with the chosen name and flags. It is * equivalent to calling xa_init_flags() on the array, but it does the * initialisation at compiletime instead of runtime. */ #define DEFINE_XARRAY_FLAGS(name, flags) \ struct xarray name = XARRAY_INIT(name, flags) /** * DEFINE_XARRAY() - Define an XArray. 
* @name: A string that names your XArray. * * This is intended for file scope definitions of XArrays. It declares * and initialises an empty XArray with the chosen name. It is equivalent * to calling xa_init() on the array, but it does the initialisation at * compiletime instead of runtime. */ #define DEFINE_XARRAY(name) DEFINE_XARRAY_FLAGS(name, 0) /** * DEFINE_XARRAY_ALLOC() - Define an XArray which allocates IDs starting at 0. * @name: A string that names your XArray. * * This is intended for file scope definitions of allocating XArrays. * See also DEFINE_XARRAY(). */ #define DEFINE_XARRAY_ALLOC(name) DEFINE_XARRAY_FLAGS(name, XA_FLAGS_ALLOC) /** * DEFINE_XARRAY_ALLOC1() - Define an XArray which allocates IDs starting at 1. * @name: A string that names your XArray. * * This is intended for file scope definitions of allocating XArrays. * See also DEFINE_XARRAY(). */ #define DEFINE_XARRAY_ALLOC1(name) DEFINE_XARRAY_FLAGS(name, XA_FLAGS_ALLOC1) void *xa_load(struct xarray *, unsigned long index); void *xa_store(struct xarray *, unsigned long index, void *entry, gfp_t); void *xa_erase(struct xarray *, unsigned long index); void *xa_store_range(struct xarray *, unsigned long first, unsigned long last, void *entry, gfp_t); bool xa_get_mark(struct xarray *, unsigned long index, xa_mark_t); void xa_set_mark(struct xarray *, unsigned long index, xa_mark_t); void xa_clear_mark(struct xarray *, unsigned long index, xa_mark_t); void *xa_find(struct xarray *xa, unsigned long *index, unsigned long max, xa_mark_t) __attribute__((nonnull(2))); void *xa_find_after(struct xarray *xa, unsigned long *index, unsigned long max, xa_mark_t) __attribute__((nonnull(2))); unsigned int xa_extract(struct xarray *, void **dst, unsigned long start, unsigned long max, unsigned int n, xa_mark_t); void xa_destroy(struct xarray *); /** * xa_init_flags() - Initialise an empty XArray with flags. * @xa: XArray. * @flags: XA_FLAG values. * * If you need to initialise an XArray with special flags (eg you need * to take the lock from interrupt context), use this function instead * of xa_init(). * * Context: Any context. */ static inline void xa_init_flags(struct xarray *xa, gfp_t flags) { spin_lock_init(&xa->xa_lock); xa->xa_flags = flags; xa->xa_head = NULL; } /** * xa_init() - Initialise an empty XArray. * @xa: XArray. * * An empty XArray is full of NULL entries. * * Context: Any context. */ static inline void xa_init(struct xarray *xa) { xa_init_flags(xa, 0); } /** * xa_empty() - Determine if an array has any present entries. * @xa: XArray. * * Context: Any context. * Return: %true if the array contains only NULL pointers. */ static inline bool xa_empty(const struct xarray *xa) { return xa->xa_head == NULL; } /** * xa_marked() - Inquire whether any entry in this array has a mark set * @xa: Array * @mark: Mark value * * Context: Any context. * Return: %true if any entry has this mark set. */ static inline bool xa_marked(const struct xarray *xa, xa_mark_t mark) { return xa->xa_flags & XA_FLAGS_MARK(mark); } /** * xa_for_each_range() - Iterate over a portion of an XArray. * @xa: XArray. * @index: Index of @entry. * @entry: Entry retrieved from array. * @start: First index to retrieve from array. * @last: Last index to retrieve from array. * * During the iteration, @entry will have the value of the entry stored * in @xa at @index. You may modify @index during the iteration if you * want to skip or reprocess indices. It is safe to modify the array * during the iteration. 
At the end of the iteration, @entry will be set * to NULL and @index will have a value less than or equal to max. * * xa_for_each_range() is O(n.log(n)) while xas_for_each() is O(n). You have * to handle your own locking with xas_for_each(), and if you have to unlock * after each iteration, it will also end up being O(n.log(n)). * xa_for_each_range() will spin if it hits a retry entry; if you intend to * see retry entries, you should use the xas_for_each() iterator instead. * The xas_for_each() iterator will expand into more inline code than * xa_for_each_range(). * * Context: Any context. Takes and releases the RCU lock. */ #define xa_for_each_range(xa, index, entry, start, last) \ for (index = start, \ entry = xa_find(xa, &index, last, XA_PRESENT); \ entry; \ entry = xa_find_after(xa, &index, last, XA_PRESENT)) /** * xa_for_each_start() - Iterate over a portion of an XArray. * @xa: XArray. * @index: Index of @entry. * @entry: Entry retrieved from array. * @start: First index to retrieve from array. * * During the iteration, @entry will have the value of the entry stored * in @xa at @index. You may modify @index during the iteration if you * want to skip or reprocess indices. It is safe to modify the array * during the iteration. At the end of the iteration, @entry will be set * to NULL and @index will have a value less than or equal to max. * * xa_for_each_start() is O(n.log(n)) while xas_for_each() is O(n). You have * to handle your own locking with xas_for_each(), and if you have to unlock * after each iteration, it will also end up being O(n.log(n)). * xa_for_each_start() will spin if it hits a retry entry; if you intend to * see retry entries, you should use the xas_for_each() iterator instead. * The xas_for_each() iterator will expand into more inline code than * xa_for_each_start(). * * Context: Any context. Takes and releases the RCU lock. */ #define xa_for_each_start(xa, index, entry, start) \ xa_for_each_range(xa, index, entry, start, ULONG_MAX) /** * xa_for_each() - Iterate over present entries in an XArray. * @xa: XArray. * @index: Index of @entry. * @entry: Entry retrieved from array. * * During the iteration, @entry will have the value of the entry stored * in @xa at @index. You may modify @index during the iteration if you want * to skip or reprocess indices. It is safe to modify the array during the * iteration. At the end of the iteration, @entry will be set to NULL and * @index will have a value less than or equal to max. * * xa_for_each() is O(n.log(n)) while xas_for_each() is O(n). You have * to handle your own locking with xas_for_each(), and if you have to unlock * after each iteration, it will also end up being O(n.log(n)). xa_for_each() * will spin if it hits a retry entry; if you intend to see retry entries, * you should use the xas_for_each() iterator instead. The xas_for_each() * iterator will expand into more inline code than xa_for_each(). * * Context: Any context. Takes and releases the RCU lock. */ #define xa_for_each(xa, index, entry) \ xa_for_each_start(xa, index, entry, 0) /** * xa_for_each_marked() - Iterate over marked entries in an XArray. * @xa: XArray. * @index: Index of @entry. * @entry: Entry retrieved from array. * @filter: Selection criterion. * * During the iteration, @entry will have the value of the entry stored * in @xa at @index. The iteration will skip all entries in the array * which do not match @filter. You may modify @index during the iteration * if you want to skip or reprocess indices. 
It is safe to modify the array * during the iteration. At the end of the iteration, @entry will be set to * NULL and @index will have a value less than or equal to max. * * xa_for_each_marked() is O(n.log(n)) while xas_for_each_marked() is O(n). * You have to handle your own locking with xas_for_each(), and if you have * to unlock after each iteration, it will also end up being O(n.log(n)). * xa_for_each_marked() will spin if it hits a retry entry; if you intend to * see retry entries, you should use the xas_for_each_marked() iterator * instead. The xas_for_each_marked() iterator will expand into more inline * code than xa_for_each_marked(). * * Context: Any context. Takes and releases the RCU lock. */ #define xa_for_each_marked(xa, index, entry, filter) \ for (index = 0, entry = xa_find(xa, &index, ULONG_MAX, filter); \ entry; entry = xa_find_after(xa, &index, ULONG_MAX, filter)) #define xa_trylock(xa) spin_trylock(&(xa)->xa_lock) #define xa_lock(xa) spin_lock(&(xa)->xa_lock) #define xa_unlock(xa) spin_unlock(&(xa)->xa_lock) #define xa_lock_bh(xa) spin_lock_bh(&(xa)->xa_lock) #define xa_unlock_bh(xa) spin_unlock_bh(&(xa)->xa_lock) #define xa_lock_irq(xa) spin_lock_irq(&(xa)->xa_lock) #define xa_unlock_irq(xa) spin_unlock_irq(&(xa)->xa_lock) #define xa_lock_irqsave(xa, flags) \ spin_lock_irqsave(&(xa)->xa_lock, flags) #define xa_unlock_irqrestore(xa, flags) \ spin_unlock_irqrestore(&(xa)->xa_lock, flags) #define xa_lock_nested(xa, subclass) \ spin_lock_nested(&(xa)->xa_lock, subclass) #define xa_lock_bh_nested(xa, subclass) \ spin_lock_bh_nested(&(xa)->xa_lock, subclass) #define xa_lock_irq_nested(xa, subclass) \ spin_lock_irq_nested(&(xa)->xa_lock, subclass) #define xa_lock_irqsave_nested(xa, flags, subclass) \ spin_lock_irqsave_nested(&(xa)->xa_lock, flags, subclass) /* * Versions of the normal API which require the caller to hold the * xa_lock. If the GFP flags allow it, they will drop the lock to * allocate memory, then reacquire it afterwards. These functions * may also re-enable interrupts if the XArray flags indicate the * locking should be interrupt safe. */ void *__xa_erase(struct xarray *, unsigned long index); void *__xa_store(struct xarray *, unsigned long index, void *entry, gfp_t); void *__xa_cmpxchg(struct xarray *, unsigned long index, void *old, void *entry, gfp_t); int __must_check __xa_insert(struct xarray *, unsigned long index, void *entry, gfp_t); int __must_check __xa_alloc(struct xarray *, u32 *id, void *entry, struct xa_limit, gfp_t); int __must_check __xa_alloc_cyclic(struct xarray *, u32 *id, void *entry, struct xa_limit, u32 *next, gfp_t); void __xa_set_mark(struct xarray *, unsigned long index, xa_mark_t); void __xa_clear_mark(struct xarray *, unsigned long index, xa_mark_t); /** * xa_store_bh() - Store this entry in the XArray. * @xa: XArray. * @index: Index into array. * @entry: New entry. * @gfp: Memory allocation flags. * * This function is like calling xa_store() except it disables softirqs * while holding the array lock. * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. * Return: The old entry at this index or xa_err() if an error happened. */ static inline void *xa_store_bh(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) { void *curr; might_alloc(gfp); xa_lock_bh(xa); curr = __xa_store(xa, index, entry, gfp); xa_unlock_bh(xa); return curr; } /** * xa_store_irq() - Store this entry in the XArray. * @xa: XArray. * @index: Index into array. * @entry: New entry. * @gfp: Memory allocation flags. 
* * This function is like calling xa_store() except it disables interrupts * while holding the array lock. * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. * Return: The old entry at this index or xa_err() if an error happened. */ static inline void *xa_store_irq(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) { void *curr; might_alloc(gfp); xa_lock_irq(xa); curr = __xa_store(xa, index, entry, gfp); xa_unlock_irq(xa); return curr; } /** * xa_erase_bh() - Erase this entry from the XArray. * @xa: XArray. * @index: Index of entry. * * After this function returns, loading from @index will return %NULL. * If the index is part of a multi-index entry, all indices will be erased * and none of the entries will be part of a multi-index entry. * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. * Return: The entry which used to be at this index. */ static inline void *xa_erase_bh(struct xarray *xa, unsigned long index) { void *entry; xa_lock_bh(xa); entry = __xa_erase(xa, index); xa_unlock_bh(xa); return entry; } /** * xa_erase_irq() - Erase this entry from the XArray. * @xa: XArray. * @index: Index of entry. * * After this function returns, loading from @index will return %NULL. * If the index is part of a multi-index entry, all indices will be erased * and none of the entries will be part of a multi-index entry. * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. * Return: The entry which used to be at this index. */ static inline void *xa_erase_irq(struct xarray *xa, unsigned long index) { void *entry; xa_lock_irq(xa); entry = __xa_erase(xa, index); xa_unlock_irq(xa); return entry; } /** * xa_cmpxchg() - Conditionally replace an entry in the XArray. * @xa: XArray. * @index: Index into array. * @old: Old value to test against. * @entry: New value to place in array. * @gfp: Memory allocation flags. * * If the entry at @index is the same as @old, replace it with @entry. * If the return value is equal to @old, then the exchange was successful. * * Context: Any context. Takes and releases the xa_lock. May sleep * if the @gfp flags permit. * Return: The old value at this index or xa_err() if an error happened. */ static inline void *xa_cmpxchg(struct xarray *xa, unsigned long index, void *old, void *entry, gfp_t gfp) { void *curr; might_alloc(gfp); xa_lock(xa); curr = __xa_cmpxchg(xa, index, old, entry, gfp); xa_unlock(xa); return curr; } /** * xa_cmpxchg_bh() - Conditionally replace an entry in the XArray. * @xa: XArray. * @index: Index into array. * @old: Old value to test against. * @entry: New value to place in array. * @gfp: Memory allocation flags. * * This function is like calling xa_cmpxchg() except it disables softirqs * while holding the array lock. * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. May sleep if the @gfp flags permit. * Return: The old value at this index or xa_err() if an error happened. */ static inline void *xa_cmpxchg_bh(struct xarray *xa, unsigned long index, void *old, void *entry, gfp_t gfp) { void *curr; might_alloc(gfp); xa_lock_bh(xa); curr = __xa_cmpxchg(xa, index, old, entry, gfp); xa_unlock_bh(xa); return curr; } /** * xa_cmpxchg_irq() - Conditionally replace an entry in the XArray. * @xa: XArray. * @index: Index into array. * @old: Old value to test against. * @entry: New value to place in array. * @gfp: Memory allocation flags. 
* * This function is like calling xa_cmpxchg() except it disables interrupts * while holding the array lock. * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. May sleep if the @gfp flags permit. * Return: The old value at this index or xa_err() if an error happened. */ static inline void *xa_cmpxchg_irq(struct xarray *xa, unsigned long index, void *old, void *entry, gfp_t gfp) { void *curr; might_alloc(gfp); xa_lock_irq(xa); curr = __xa_cmpxchg(xa, index, old, entry, gfp); xa_unlock_irq(xa); return curr; } /** * xa_insert() - Store this entry in the XArray unless another entry is * already present. * @xa: XArray. * @index: Index into array. * @entry: New entry. * @gfp: Memory allocation flags. * * Inserting a NULL entry will store a reserved entry (like xa_reserve()) * if no entry is present. Inserting will fail if a reserved entry is * present, even though loading from this index will return NULL. * * Context: Any context. Takes and releases the xa_lock. May sleep if * the @gfp flags permit. * Return: 0 if the store succeeded. -EBUSY if another entry was present. * -ENOMEM if memory could not be allocated. */ static inline int __must_check xa_insert(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) { int err; might_alloc(gfp); xa_lock(xa); err = __xa_insert(xa, index, entry, gfp); xa_unlock(xa); return err; } /** * xa_insert_bh() - Store this entry in the XArray unless another entry is * already present. * @xa: XArray. * @index: Index into array. * @entry: New entry. * @gfp: Memory allocation flags. * * Inserting a NULL entry will store a reserved entry (like xa_reserve()) * if no entry is present. Inserting will fail if a reserved entry is * present, even though loading from this index will return NULL. * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. May sleep if the @gfp flags permit. * Return: 0 if the store succeeded. -EBUSY if another entry was present. * -ENOMEM if memory could not be allocated. */ static inline int __must_check xa_insert_bh(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) { int err; might_alloc(gfp); xa_lock_bh(xa); err = __xa_insert(xa, index, entry, gfp); xa_unlock_bh(xa); return err; } /** * xa_insert_irq() - Store this entry in the XArray unless another entry is * already present. * @xa: XArray. * @index: Index into array. * @entry: New entry. * @gfp: Memory allocation flags. * * Inserting a NULL entry will store a reserved entry (like xa_reserve()) * if no entry is present. Inserting will fail if a reserved entry is * present, even though loading from this index will return NULL. * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. May sleep if the @gfp flags permit. * Return: 0 if the store succeeded. -EBUSY if another entry was present. * -ENOMEM if memory could not be allocated. */ static inline int __must_check xa_insert_irq(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) { int err; might_alloc(gfp); xa_lock_irq(xa); err = __xa_insert(xa, index, entry, gfp); xa_unlock_irq(xa); return err; } /** * xa_alloc() - Find somewhere to store this entry in the XArray. * @xa: XArray. * @id: Pointer to ID. * @entry: New entry. * @limit: Range of ID to allocate. * @gfp: Memory allocation flags. * * Finds an empty entry in @xa between @limit.min and @limit.max, * stores the index into the @id pointer, then stores the entry at * that index. A concurrent lookup will not see an uninitialised @id. 
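 *
 * For example (a minimal sketch; foo_xa is assumed to have been defined
 * with DEFINE_XARRAY_ALLOC() and foo to be the object being inserted):
 *
 *	u32 id;
 *	int err;
 *
 *	err = xa_alloc(&foo_xa, &id, foo, xa_limit_32b, GFP_KERNEL);
 *	if (err)
 *		return err;
 *	foo->id = id;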
* * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set * in xa_init_flags(). * * Context: Any context. Takes and releases the xa_lock. May sleep if * the @gfp flags permit. * Return: 0 on success, -ENOMEM if memory could not be allocated or * -EBUSY if there are no free entries in @limit. */ static inline __must_check int xa_alloc(struct xarray *xa, u32 *id, void *entry, struct xa_limit limit, gfp_t gfp) { int err; might_alloc(gfp); xa_lock(xa); err = __xa_alloc(xa, id, entry, limit, gfp); xa_unlock(xa); return err; } /** * xa_alloc_bh() - Find somewhere to store this entry in the XArray. * @xa: XArray. * @id: Pointer to ID. * @entry: New entry. * @limit: Range of ID to allocate. * @gfp: Memory allocation flags. * * Finds an empty entry in @xa between @limit.min and @limit.max, * stores the index into the @id pointer, then stores the entry at * that index. A concurrent lookup will not see an uninitialised @id. * * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set * in xa_init_flags(). * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. May sleep if the @gfp flags permit. * Return: 0 on success, -ENOMEM if memory could not be allocated or * -EBUSY if there are no free entries in @limit. */ static inline int __must_check xa_alloc_bh(struct xarray *xa, u32 *id, void *entry, struct xa_limit limit, gfp_t gfp) { int err; might_alloc(gfp); xa_lock_bh(xa); err = __xa_alloc(xa, id, entry, limit, gfp); xa_unlock_bh(xa); return err; } /** * xa_alloc_irq() - Find somewhere to store this entry in the XArray. * @xa: XArray. * @id: Pointer to ID. * @entry: New entry. * @limit: Range of ID to allocate. * @gfp: Memory allocation flags. * * Finds an empty entry in @xa between @limit.min and @limit.max, * stores the index into the @id pointer, then stores the entry at * that index. A concurrent lookup will not see an uninitialised @id. * * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set * in xa_init_flags(). * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. May sleep if the @gfp flags permit. * Return: 0 on success, -ENOMEM if memory could not be allocated or * -EBUSY if there are no free entries in @limit. */ static inline int __must_check xa_alloc_irq(struct xarray *xa, u32 *id, void *entry, struct xa_limit limit, gfp_t gfp) { int err; might_alloc(gfp); xa_lock_irq(xa); err = __xa_alloc(xa, id, entry, limit, gfp); xa_unlock_irq(xa); return err; } /** * xa_alloc_cyclic() - Find somewhere to store this entry in the XArray. * @xa: XArray. * @id: Pointer to ID. * @entry: New entry. * @limit: Range of allocated ID. * @next: Pointer to next ID to allocate. * @gfp: Memory allocation flags. * * Finds an empty entry in @xa between @limit.min and @limit.max, * stores the index into the @id pointer, then stores the entry at * that index. A concurrent lookup will not see an uninitialised @id. * The search for an empty entry will start at @next and will wrap * around if necessary. * * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set * in xa_init_flags(). * * Context: Any context. Takes and releases the xa_lock. May sleep if * the @gfp flags permit. * Return: 0 if the allocation succeeded without wrapping. 1 if the * allocation succeeded after wrapping, -ENOMEM if memory could not be * allocated or -EBUSY if there are no free entries in @limit. 
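 *
 * For example (a sketch; foo_xa is assumed to be an allocating XArray and
 * foo_next_id a cursor remembered between calls):
 *
 *	static u32 foo_next_id;
 *	u32 id;
 *	int ret;
 *
 *	ret = xa_alloc_cyclic(&foo_xa, &id, foo, xa_limit_32b,
 *			      &foo_next_id, GFP_KERNEL);
 *	if (ret < 0)
 *		return ret;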
*/ static inline int xa_alloc_cyclic(struct xarray *xa, u32 *id, void *entry, struct xa_limit limit, u32 *next, gfp_t gfp) { int err; might_alloc(gfp); xa_lock(xa); err = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp); xa_unlock(xa); return err; } /** * xa_alloc_cyclic_bh() - Find somewhere to store this entry in the XArray. * @xa: XArray. * @id: Pointer to ID. * @entry: New entry. * @limit: Range of allocated ID. * @next: Pointer to next ID to allocate. * @gfp: Memory allocation flags. * * Finds an empty entry in @xa between @limit.min and @limit.max, * stores the index into the @id pointer, then stores the entry at * that index. A concurrent lookup will not see an uninitialised @id. * The search for an empty entry will start at @next and will wrap * around if necessary. * * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set * in xa_init_flags(). * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. May sleep if the @gfp flags permit. * Return: 0 if the allocation succeeded without wrapping. 1 if the * allocation succeeded after wrapping, -ENOMEM if memory could not be * allocated or -EBUSY if there are no free entries in @limit. */ static inline int xa_alloc_cyclic_bh(struct xarray *xa, u32 *id, void *entry, struct xa_limit limit, u32 *next, gfp_t gfp) { int err; might_alloc(gfp); xa_lock_bh(xa); err = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp); xa_unlock_bh(xa); return err; } /** * xa_alloc_cyclic_irq() - Find somewhere to store this entry in the XArray. * @xa: XArray. * @id: Pointer to ID. * @entry: New entry. * @limit: Range of allocated ID. * @next: Pointer to next ID to allocate. * @gfp: Memory allocation flags. * * Finds an empty entry in @xa between @limit.min and @limit.max, * stores the index into the @id pointer, then stores the entry at * that index. A concurrent lookup will not see an uninitialised @id. * The search for an empty entry will start at @next and will wrap * around if necessary. * * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set * in xa_init_flags(). * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. May sleep if the @gfp flags permit. * Return: 0 if the allocation succeeded without wrapping. 1 if the * allocation succeeded after wrapping, -ENOMEM if memory could not be * allocated or -EBUSY if there are no free entries in @limit. */ static inline int xa_alloc_cyclic_irq(struct xarray *xa, u32 *id, void *entry, struct xa_limit limit, u32 *next, gfp_t gfp) { int err; might_alloc(gfp); xa_lock_irq(xa); err = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp); xa_unlock_irq(xa); return err; } /** * xa_reserve() - Reserve this index in the XArray. * @xa: XArray. * @index: Index into array. * @gfp: Memory allocation flags. * * Ensures there is somewhere to store an entry at @index in the array. * If there is already something stored at @index, this function does * nothing. If there was nothing there, the entry is marked as reserved. * Loading from a reserved entry returns a %NULL pointer. * * If you do not use the entry that you have reserved, call xa_release() * or xa_erase() to free any unnecessary memory. * * Context: Any context. Takes and releases the xa_lock. * May sleep if the @gfp flags permit. * Return: 0 if the reservation succeeded or -ENOMEM if it failed. 
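 *
 * A typical pattern (a sketch; my_xa and the setup_succeeded decision are
 * hypothetical, and error handling of the final store is omitted)
 * reserves the slot first, then either stores into it or gives it back:
 *
 *	err = xa_reserve(&my_xa, index, GFP_KERNEL);
 *	if (err)
 *		return err;
 *	if (setup_succeeded)
 *		xa_store(&my_xa, index, entry, GFP_KERNEL);
 *	else
 *		xa_release(&my_xa, index);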
*/ static inline __must_check int xa_reserve(struct xarray *xa, unsigned long index, gfp_t gfp) { return xa_err(xa_cmpxchg(xa, index, NULL, XA_ZERO_ENTRY, gfp)); } /** * xa_reserve_bh() - Reserve this index in the XArray. * @xa: XArray. * @index: Index into array. * @gfp: Memory allocation flags. * * A softirq-disabling version of xa_reserve(). * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. * Return: 0 if the reservation succeeded or -ENOMEM if it failed. */ static inline __must_check int xa_reserve_bh(struct xarray *xa, unsigned long index, gfp_t gfp) { return xa_err(xa_cmpxchg_bh(xa, index, NULL, XA_ZERO_ENTRY, gfp)); } /** * xa_reserve_irq() - Reserve this index in the XArray. * @xa: XArray. * @index: Index into array. * @gfp: Memory allocation flags. * * An interrupt-disabling version of xa_reserve(). * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. * Return: 0 if the reservation succeeded or -ENOMEM if it failed. */ static inline __must_check int xa_reserve_irq(struct xarray *xa, unsigned long index, gfp_t gfp) { return xa_err(xa_cmpxchg_irq(xa, index, NULL, XA_ZERO_ENTRY, gfp)); } /** * xa_release() - Release a reserved entry. * @xa: XArray. * @index: Index of entry. * * After calling xa_reserve(), you can call this function to release the * reservation. If the entry at @index has been stored to, this function * will do nothing. */ static inline void xa_release(struct xarray *xa, unsigned long index) { xa_cmpxchg(xa, index, XA_ZERO_ENTRY, NULL, 0); } /* Everything below here is the Advanced API. Proceed with caution. */ /* * The xarray is constructed out of a set of 'chunks' of pointers. Choosing * the best chunk size requires some tradeoffs. A power of two recommends * itself so that we can walk the tree based purely on shifts and masks. * Generally, the larger the better; as the number of slots per level of the * tree increases, the less tall the tree needs to be. But that needs to be * balanced against the memory consumption of each node. On a 64-bit system, * xa_node is currently 576 bytes, and we get 7 of them per 4kB page. If we * doubled the number of slots per node, we'd get only 3 nodes per 4kB page. */ #ifndef XA_CHUNK_SHIFT #define XA_CHUNK_SHIFT (IS_ENABLED(CONFIG_BASE_SMALL) ? 4 : 6) #endif #define XA_CHUNK_SIZE (1UL << XA_CHUNK_SHIFT) #define XA_CHUNK_MASK (XA_CHUNK_SIZE - 1) #define XA_MAX_MARKS 3 #define XA_MARK_LONGS BITS_TO_LONGS(XA_CHUNK_SIZE) /* * @count is the count of every non-NULL element in the ->slots array * whether that is a value entry, a retry entry, a user pointer, * a sibling entry or a pointer to the next level of the tree. * @nr_values is the count of every element in ->slots which is * either a value entry or a sibling of a value entry. 
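 *
 * As a worked example of the chunk-size trade-off described above: with
 * XA_CHUNK_SHIFT of 6 each node has 64 slots, so a tree of height h can
 * cover 64^h indices and the full 64-bit index space needs at most
 * ceil(64 / 6) = 11 levels; with CONFIG_BASE_SMALL (16 slots per node)
 * the worst case is 16 levels.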
*/ struct xa_node { unsigned char shift; /* Bits remaining in each slot */ unsigned char offset; /* Slot offset in parent */ unsigned char count; /* Total entry count */ unsigned char nr_values; /* Value entry count */ struct xa_node __rcu *parent; /* NULL at top of tree */ struct xarray *array; /* The array we belong to */ union { struct list_head private_list; /* For tree user */ struct rcu_head rcu_head; /* Used when freeing node */ }; void __rcu *slots[XA_CHUNK_SIZE]; union { unsigned long tags[XA_MAX_MARKS][XA_MARK_LONGS]; unsigned long marks[XA_MAX_MARKS][XA_MARK_LONGS]; }; }; void xa_dump(const struct xarray *); void xa_dump_node(const struct xa_node *); #ifdef XA_DEBUG #define XA_BUG_ON(xa, x) do { \ if (x) { \ xa_dump(xa); \ BUG(); \ } \ } while (0) #define XA_NODE_BUG_ON(node, x) do { \ if (x) { \ if (node) xa_dump_node(node); \ BUG(); \ } \ } while (0) #else #define XA_BUG_ON(xa, x) do { } while (0) #define XA_NODE_BUG_ON(node, x) do { } while (0) #endif /* Private */ static inline void *xa_head(const struct xarray *xa) { return rcu_dereference_check(xa->xa_head, lockdep_is_held(&xa->xa_lock)); } /* Private */ static inline void *xa_head_locked(const struct xarray *xa) { return rcu_dereference_protected(xa->xa_head, lockdep_is_held(&xa->xa_lock)); } /* Private */ static inline void *xa_entry(const struct xarray *xa, const struct xa_node *node, unsigned int offset) { XA_NODE_BUG_ON(node, offset >= XA_CHUNK_SIZE); return rcu_dereference_check(node->slots[offset], lockdep_is_held(&xa->xa_lock)); } /* Private */ static inline void *xa_entry_locked(const struct xarray *xa, const struct xa_node *node, unsigned int offset) { XA_NODE_BUG_ON(node, offset >= XA_CHUNK_SIZE); return rcu_dereference_protected(node->slots[offset], lockdep_is_held(&xa->xa_lock)); } /* Private */ static inline struct xa_node *xa_parent(const struct xarray *xa, const struct xa_node *node) { return rcu_dereference_check(node->parent, lockdep_is_held(&xa->xa_lock)); } /* Private */ static inline struct xa_node *xa_parent_locked(const struct xarray *xa, const struct xa_node *node) { return rcu_dereference_protected(node->parent, lockdep_is_held(&xa->xa_lock)); } /* Private */ static inline void *xa_mk_node(const struct xa_node *node) { return (void *)((unsigned long)node | 2); } /* Private */ static inline struct xa_node *xa_to_node(const void *entry) { return (struct xa_node *)((unsigned long)entry - 2); } /* Private */ static inline bool xa_is_node(const void *entry) { return xa_is_internal(entry) && (unsigned long)entry > 4096; } /* Private */ static inline void *xa_mk_sibling(unsigned int offset) { return xa_mk_internal(offset); } /* Private */ static inline unsigned long xa_to_sibling(const void *entry) { return xa_to_internal(entry); } /** * xa_is_sibling() - Is the entry a sibling entry? * @entry: Entry retrieved from the XArray * * Return: %true if the entry is a sibling entry. */ static inline bool xa_is_sibling(const void *entry) { return IS_ENABLED(CONFIG_XARRAY_MULTI) && xa_is_internal(entry) && (entry < xa_mk_sibling(XA_CHUNK_SIZE - 1)); } #define XA_RETRY_ENTRY xa_mk_internal(256) /** * xa_is_retry() - Is the entry a retry entry? * @entry: Entry retrieved from the XArray * * Return: %true if the entry is a retry entry. */ static inline bool xa_is_retry(const void *entry) { return unlikely(entry == XA_RETRY_ENTRY); } /** * xa_is_advanced() - Is the entry only permitted for the advanced API? * @entry: Entry to be stored in the XArray. * * Return: %true if the entry cannot be stored by the normal API. 
*/ static inline bool xa_is_advanced(const void *entry) { return xa_is_internal(entry) && (entry <= XA_RETRY_ENTRY); } /** * typedef xa_update_node_t - A callback function from the XArray. * @node: The node which is being processed * * This function is called every time the XArray updates the count of * present and value entries in a node. It allows advanced users to * maintain the private_list in the node. * * Context: The xa_lock is held and interrupts may be disabled. * Implementations should not drop the xa_lock, nor re-enable * interrupts. */ typedef void (*xa_update_node_t)(struct xa_node *node); void xa_delete_node(struct xa_node *, xa_update_node_t); /* * The xa_state is opaque to its users. It contains various different pieces * of state involved in the current operation on the XArray. It should be * declared on the stack and passed between the various internal routines. * The various elements in it should not be accessed directly, but only * through the provided accessor functions. The below documentation is for * the benefit of those working on the code, not for users of the XArray. * * @xa_node usually points to the xa_node containing the slot we're operating * on (and @xa_offset is the offset in the slots array). If there is a * single entry in the array at index 0, there are no allocated xa_nodes to * point to, and so we store %NULL in @xa_node. @xa_node is set to * the value %XAS_RESTART if the xa_state is not walked to the correct * position in the tree of nodes for this operation. If an error occurs * during an operation, it is set to an %XAS_ERROR value. If we run off the * end of the allocated nodes, it is set to %XAS_BOUNDS. */ struct xa_state { struct xarray *xa; unsigned long xa_index; unsigned char xa_shift; unsigned char xa_sibs; unsigned char xa_offset; unsigned char xa_pad; /* Helps gcc generate better code */ struct xa_node *xa_node; struct xa_node *xa_alloc; xa_update_node_t xa_update; struct list_lru *xa_lru; }; /* * We encode errnos in the xas->xa_node. If an error has happened, we need to * drop the lock to fix it, and once we've done so the xa_state is invalid. */ #define XA_ERROR(errno) ((struct xa_node *)(((unsigned long)errno << 2) | 2UL)) #define XAS_BOUNDS ((struct xa_node *)1UL) #define XAS_RESTART ((struct xa_node *)3UL) #define __XA_STATE(array, index, shift, sibs) { \ .xa = array, \ .xa_index = index, \ .xa_shift = shift, \ .xa_sibs = sibs, \ .xa_offset = 0, \ .xa_pad = 0, \ .xa_node = XAS_RESTART, \ .xa_alloc = NULL, \ .xa_update = NULL, \ .xa_lru = NULL, \ } /** * XA_STATE() - Declare an XArray operation state. * @name: Name of this operation state (usually xas). * @array: Array to operate on. * @index: Initial index of interest. * * Declare and initialise an xa_state on the stack. */ #define XA_STATE(name, array, index) \ struct xa_state name = __XA_STATE(array, index, 0, 0) /** * XA_STATE_ORDER() - Declare an XArray operation state. * @name: Name of this operation state (usually xas). * @array: Array to operate on. * @index: Initial index of interest. * @order: Order of entry. * * Declare and initialise an xa_state on the stack. 
This variant of * XA_STATE() allows you to specify the 'order' of the element you * want to operate on.` */ #define XA_STATE_ORDER(name, array, index, order) \ struct xa_state name = __XA_STATE(array, \ (index >> order) << order, \ order - (order % XA_CHUNK_SHIFT), \ (1U << (order % XA_CHUNK_SHIFT)) - 1) #define xas_marked(xas, mark) xa_marked((xas)->xa, (mark)) #define xas_trylock(xas) xa_trylock((xas)->xa) #define xas_lock(xas) xa_lock((xas)->xa) #define xas_unlock(xas) xa_unlock((xas)->xa) #define xas_lock_bh(xas) xa_lock_bh((xas)->xa) #define xas_unlock_bh(xas) xa_unlock_bh((xas)->xa) #define xas_lock_irq(xas) xa_lock_irq((xas)->xa) #define xas_unlock_irq(xas) xa_unlock_irq((xas)->xa) #define xas_lock_irqsave(xas, flags) \ xa_lock_irqsave((xas)->xa, flags) #define xas_unlock_irqrestore(xas, flags) \ xa_unlock_irqrestore((xas)->xa, flags) /** * xas_error() - Return an errno stored in the xa_state. * @xas: XArray operation state. * * Return: 0 if no error has been noted. A negative errno if one has. */ static inline int xas_error(const struct xa_state *xas) { return xa_err(xas->xa_node); } /** * xas_set_err() - Note an error in the xa_state. * @xas: XArray operation state. * @err: Negative error number. * * Only call this function with a negative @err; zero or positive errors * will probably not behave the way you think they should. If you want * to clear the error from an xa_state, use xas_reset(). */ static inline void xas_set_err(struct xa_state *xas, long err) { xas->xa_node = XA_ERROR(err); } /** * xas_invalid() - Is the xas in a retry or error state? * @xas: XArray operation state. * * Return: %true if the xas cannot be used for operations. */ static inline bool xas_invalid(const struct xa_state *xas) { return (unsigned long)xas->xa_node & 3; } /** * xas_valid() - Is the xas a valid cursor into the array? * @xas: XArray operation state. * * Return: %true if the xas can be used for operations. */ static inline bool xas_valid(const struct xa_state *xas) { return !xas_invalid(xas); } /** * xas_is_node() - Does the xas point to a node? * @xas: XArray operation state. * * Return: %true if the xas currently references a node. */ static inline bool xas_is_node(const struct xa_state *xas) { return xas_valid(xas) && xas->xa_node; } /* True if the pointer is something other than a node */ static inline bool xas_not_node(struct xa_node *node) { return ((unsigned long)node & 3) || !node; } /* True if the node represents RESTART or an error */ static inline bool xas_frozen(struct xa_node *node) { return (unsigned long)node & 2; } /* True if the node represents head-of-tree, RESTART or BOUNDS */ static inline bool xas_top(struct xa_node *node) { return node <= XAS_RESTART; } /** * xas_reset() - Reset an XArray operation state. * @xas: XArray operation state. * * Resets the error or walk state of the @xas so future walks of the * array will start from the root. Use this if you have dropped the * xarray lock and want to reuse the xa_state. * * Context: Any context. */ static inline void xas_reset(struct xa_state *xas) { xas->xa_node = XAS_RESTART; } /** * xas_retry() - Retry the operation if appropriate. * @xas: XArray operation state. * @entry: Entry from xarray. * * The advanced functions may sometimes return an internal entry, such as * a retry entry or a zero entry. This function sets up the @xas to restart * the walk from the head of the array if needed. * * Context: Any context. * Return: true if the operation needs to be retried. 
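 *
 * A typical lookup loop (a sketch; my_xa is hypothetical) handles retry
 * entries like this:
 *
 *	XA_STATE(xas, &my_xa, 0);
 *	void *entry;
 *
 *	rcu_read_lock();
 *	xas_for_each(&xas, entry, ULONG_MAX) {
 *		if (xas_retry(&xas, entry))
 *			continue;
 *		... use entry ...
 *	}
 *	rcu_read_unlock();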
*/ static inline bool xas_retry(struct xa_state *xas, const void *entry) { if (xa_is_zero(entry)) return true; if (!xa_is_retry(entry)) return false; xas_reset(xas); return true; } void *xas_load(struct xa_state *); void *xas_store(struct xa_state *, void *entry); void *xas_find(struct xa_state *, unsigned long max); void *xas_find_conflict(struct xa_state *); bool xas_get_mark(const struct xa_state *, xa_mark_t); void xas_set_mark(const struct xa_state *, xa_mark_t); void xas_clear_mark(const struct xa_state *, xa_mark_t); void *xas_find_marked(struct xa_state *, unsigned long max, xa_mark_t); void xas_init_marks(const struct xa_state *); bool xas_nomem(struct xa_state *, gfp_t); void xas_destroy(struct xa_state *); void xas_pause(struct xa_state *); void xas_create_range(struct xa_state *); #ifdef CONFIG_XARRAY_MULTI int xa_get_order(struct xarray *, unsigned long index); int xas_get_order(struct xa_state *xas); void xas_split(struct xa_state *, void *entry, unsigned int order); void xas_split_alloc(struct xa_state *, void *entry, unsigned int order, gfp_t); #else static inline int xa_get_order(struct xarray *xa, unsigned long index) { return 0; } static inline int xas_get_order(struct xa_state *xas) { return 0; } static inline void xas_split(struct xa_state *xas, void *entry, unsigned int order) { xas_store(xas, entry); } static inline void xas_split_alloc(struct xa_state *xas, void *entry, unsigned int order, gfp_t gfp) { } #endif /** * xas_reload() - Refetch an entry from the xarray. * @xas: XArray operation state. * * Use this function to check that a previously loaded entry still has * the same value. This is useful for the lockless pagecache lookup where * we walk the array with only the RCU lock to protect us, lock the page, * then check that the page hasn't moved since we looked it up. * * The caller guarantees that @xas is still valid. If it may be in an * error or restart state, call xas_load() instead. * * Return: The entry at this location in the xarray. */ static inline void *xas_reload(struct xa_state *xas) { struct xa_node *node = xas->xa_node; void *entry; char offset; if (!node) return xa_head(xas->xa); if (IS_ENABLED(CONFIG_XARRAY_MULTI)) { offset = (xas->xa_index >> node->shift) & XA_CHUNK_MASK; entry = xa_entry(xas->xa, node, offset); if (!xa_is_sibling(entry)) return entry; offset = xa_to_sibling(entry); } else { offset = xas->xa_offset; } return xa_entry(xas->xa, node, offset); } /** * xas_set() - Set up XArray operation state for a different index. * @xas: XArray operation state. * @index: New index into the XArray. * * Move the operation state to refer to a different index. This will * have the effect of starting a walk from the top; see xas_next() * to move to an adjacent index. */ static inline void xas_set(struct xa_state *xas, unsigned long index) { xas->xa_index = index; xas->xa_node = XAS_RESTART; } /** * xas_advance() - Skip over sibling entries. * @xas: XArray operation state. * @index: Index of last sibling entry. * * Move the operation state to refer to the last sibling entry. * This is useful for loops that normally want to see sibling * entries but sometimes want to skip them. Use xas_set() if you * want to move to an index which is not part of this entry. */ static inline void xas_advance(struct xa_state *xas, unsigned long index) { unsigned char shift = xas_is_node(xas) ? xas->xa_node->shift : 0; xas->xa_index = index; xas->xa_offset = (index >> shift) & XA_CHUNK_MASK; } /** * xas_set_order() - Set up XArray operation state for a multislot entry. 
* @xas: XArray operation state. * @index: Target of the operation. * @order: Entry occupies 2^@order indices. */ static inline void xas_set_order(struct xa_state *xas, unsigned long index, unsigned int order) { #ifdef CONFIG_XARRAY_MULTI xas->xa_index = order < BITS_PER_LONG ? (index >> order) << order : 0; xas->xa_shift = order - (order % XA_CHUNK_SHIFT); xas->xa_sibs = (1 << (order % XA_CHUNK_SHIFT)) - 1; xas->xa_node = XAS_RESTART; #else BUG_ON(order > 0); xas_set(xas, index); #endif } /** * xas_set_update() - Set up XArray operation state for a callback. * @xas: XArray operation state. * @update: Function to call when updating a node. * * The XArray can notify a caller after it has updated an xa_node. * This is advanced functionality and is only needed by the page * cache and swap cache. */ static inline void xas_set_update(struct xa_state *xas, xa_update_node_t update) { xas->xa_update = update; } static inline void xas_set_lru(struct xa_state *xas, struct list_lru *lru) { xas->xa_lru = lru; } /** * xas_next_entry() - Advance iterator to next present entry. * @xas: XArray operation state. * @max: Highest index to return. * * xas_next_entry() is an inline function to optimise xarray traversal for * speed. It is equivalent to calling xas_find(), and will call xas_find() * for all the hard cases. * * Return: The next present entry after the one currently referred to by @xas. */ static inline void *xas_next_entry(struct xa_state *xas, unsigned long max) { struct xa_node *node = xas->xa_node; void *entry; if (unlikely(xas_not_node(node) || node->shift || xas->xa_offset != (xas->xa_index & XA_CHUNK_MASK))) return xas_find(xas, max); do { if (unlikely(xas->xa_index >= max)) return xas_find(xas, max); if (unlikely(xas->xa_offset == XA_CHUNK_MASK)) return xas_find(xas, max); entry = xa_entry(xas->xa, node, xas->xa_offset + 1); if (unlikely(xa_is_internal(entry))) return xas_find(xas, max); xas->xa_offset++; xas->xa_index++; } while (!entry); return entry; } /* Private */ static inline unsigned int xas_find_chunk(struct xa_state *xas, bool advance, xa_mark_t mark) { unsigned long *addr = xas->xa_node->marks[(__force unsigned)mark]; unsigned int offset = xas->xa_offset; if (advance) offset++; if (XA_CHUNK_SIZE == BITS_PER_LONG) { if (offset < XA_CHUNK_SIZE) { unsigned long data = *addr & (~0UL << offset); if (data) return __ffs(data); } return XA_CHUNK_SIZE; } return find_next_bit(addr, XA_CHUNK_SIZE, offset); } /** * xas_next_marked() - Advance iterator to next marked entry. * @xas: XArray operation state. * @max: Highest index to return. * @mark: Mark to search for. * * xas_next_marked() is an inline function to optimise xarray traversal for * speed. It is equivalent to calling xas_find_marked(), and will call * xas_find_marked() for all the hard cases. * * Return: The next marked entry after the one currently referred to by @xas. 
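 *
 * Most callers reach this through xas_for_each_marked(), e.g. (a sketch;
 * my_xa is hypothetical):
 *
 *	XA_STATE(xas, &my_xa, 0);
 *	void *entry;
 *
 *	rcu_read_lock();
 *	xas_for_each_marked(&xas, entry, ULONG_MAX, XA_MARK_0)
 *		... entry has XA_MARK_0 set ...
 *	rcu_read_unlock();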
*/ static inline void *xas_next_marked(struct xa_state *xas, unsigned long max, xa_mark_t mark) { struct xa_node *node = xas->xa_node; void *entry; unsigned int offset; if (unlikely(xas_not_node(node) || node->shift)) return xas_find_marked(xas, max, mark); offset = xas_find_chunk(xas, true, mark); xas->xa_offset = offset; xas->xa_index = (xas->xa_index & ~XA_CHUNK_MASK) + offset; if (xas->xa_index > max) return NULL; if (offset == XA_CHUNK_SIZE) return xas_find_marked(xas, max, mark); entry = xa_entry(xas->xa, node, offset); if (!entry) return xas_find_marked(xas, max, mark); return entry; } /* * If iterating while holding a lock, drop the lock and reschedule * every %XA_CHECK_SCHED loops. */ enum { XA_CHECK_SCHED = 4096, }; /** * xas_for_each() - Iterate over a range of an XArray. * @xas: XArray operation state. * @entry: Entry retrieved from the array. * @max: Maximum index to retrieve from array. * * The loop body will be executed for each entry present in the xarray * between the current xas position and @max. @entry will be set to * the entry retrieved from the xarray. It is safe to delete entries * from the array in the loop body. You should hold either the RCU lock * or the xa_lock while iterating. If you need to drop the lock, call * xas_pause() first. */ #define xas_for_each(xas, entry, max) \ for (entry = xas_find(xas, max); entry; \ entry = xas_next_entry(xas, max)) /** * xas_for_each_marked() - Iterate over a range of an XArray. * @xas: XArray operation state. * @entry: Entry retrieved from the array. * @max: Maximum index to retrieve from array. * @mark: Mark to search for. * * The loop body will be executed for each marked entry in the xarray * between the current xas position and @max. @entry will be set to * the entry retrieved from the xarray. It is safe to delete entries * from the array in the loop body. You should hold either the RCU lock * or the xa_lock while iterating. If you need to drop the lock, call * xas_pause() first. */ #define xas_for_each_marked(xas, entry, max, mark) \ for (entry = xas_find_marked(xas, max, mark); entry; \ entry = xas_next_marked(xas, max, mark)) /** * xas_for_each_conflict() - Iterate over a range of an XArray. * @xas: XArray operation state. * @entry: Entry retrieved from the array. * * The loop body will be executed for each entry in the XArray that * lies within the range specified by @xas. If the loop terminates * normally, @entry will be %NULL. The user may break out of the loop, * which will leave @entry set to the conflicting entry. The caller * may also call xa_set_err() to exit the loop while setting an error * to record the reason. */ #define xas_for_each_conflict(xas, entry) \ while ((entry = xas_find_conflict(xas))) void *__xas_next(struct xa_state *); void *__xas_prev(struct xa_state *); /** * xas_prev() - Move iterator to previous index. * @xas: XArray operation state. * * If the @xas was in an error state, it will remain in an error state * and this function will return %NULL. If the @xas has never been walked, * it will have the effect of calling xas_load(). Otherwise one will be * subtracted from the index and the state will be walked to the correct * location in the array for the next operation. * * If the iterator was referencing index 0, this function wraps * around to %ULONG_MAX. * * Return: The entry at the new index. This may be %NULL or an internal * entry. 
*/ static inline void *xas_prev(struct xa_state *xas) { struct xa_node *node = xas->xa_node; if (unlikely(xas_not_node(node) || node->shift || xas->xa_offset == 0)) return __xas_prev(xas); xas->xa_index--; xas->xa_offset--; return xa_entry(xas->xa, node, xas->xa_offset); } /** * xas_next() - Move state to next index. * @xas: XArray operation state. * * If the @xas was in an error state, it will remain in an error state * and this function will return %NULL. If the @xas has never been walked, * it will have the effect of calling xas_load(). Otherwise one will be * added to the index and the state will be walked to the correct * location in the array for the next operation. * * If the iterator was referencing index %ULONG_MAX, this function wraps * around to 0. * * Return: The entry at the new index. This may be %NULL or an internal * entry. */ static inline void *xas_next(struct xa_state *xas) { struct xa_node *node = xas->xa_node; if (unlikely(xas_not_node(node) || node->shift || xas->xa_offset == XA_CHUNK_MASK)) return __xas_next(xas); xas->xa_index++; xas->xa_offset++; return xa_entry(xas->xa, node, xas->xa_offset); } #endif /* _LINUX_XARRAY_H */
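As a cross-reference for the advanced API above, here is a minimal sketch of how the xa_state pieces fit together, assuming a file-scope DEFINE_XARRAY(my_xa). It mirrors what the normal xa_store() call does on behalf of its callers, but is not a copy of that implementation.

#include <linux/xarray.h>

static DEFINE_XARRAY(my_xa);

/* Store @item at @index with the advanced API: take the xa_lock around
 * the store itself and, if a node allocation is needed, drop the lock,
 * allocate via xas_nomem() and retry.
 */
static int my_store(unsigned long index, void *item)
{
	XA_STATE(xas, &my_xa, index);

	do {
		xas_lock(&xas);
		xas_store(&xas, item);
		xas_unlock(&xas);
	} while (xas_nomem(&xas, GFP_KERNEL));

	return xas_error(&xas);
}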
#ifndef LLC_H #define LLC_H /* * Copyright (c) 1997 by Procom Technology, Inc. * 2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br> * * This program can be redistributed or modified under the terms of the * GNU General Public License as published by the Free Software Foundation. * This program is distributed without any warranty or implied warranty * of merchantability or fitness for a particular purpose. * * See the GNU General Public License for more details. */ #include <linux/if.h> #include <linux/if_ether.h> #include <linux/list.h> #include <linux/spinlock.h> #include <linux/rculist_nulls.h> #include <linux/hash.h> #include <linux/jhash.h> #include <linux/atomic.h> struct net_device; struct packet_type; struct sk_buff; struct llc_addr { unsigned char lsap; unsigned char mac[IFHWADDRLEN]; }; #define LLC_SAP_STATE_INACTIVE 1 #define LLC_SAP_STATE_ACTIVE 2 #define LLC_SK_DEV_HASH_BITS 6 #define LLC_SK_DEV_HASH_ENTRIES (1<<LLC_SK_DEV_HASH_BITS) #define LLC_SK_LADDR_HASH_BITS 6 #define LLC_SK_LADDR_HASH_ENTRIES (1<<LLC_SK_LADDR_HASH_BITS) /** * struct llc_sap - Defines the SAP component * * @station - station this sap belongs to * @state - sap state * @p_bit - only lowest-order bit used * @f_bit - only lowest-order bit used * @laddr - SAP value in this 'lsap' * @node - entry in station sap_list * @sk_list - LLC sockets this one manages */ struct llc_sap { unsigned char state; unsigned char p_bit; unsigned char f_bit; refcount_t refcnt; int (*rcv_func)(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev); struct llc_addr laddr; struct list_head node; spinlock_t sk_lock; int sk_count; struct hlist_nulls_head sk_laddr_hash[LLC_SK_LADDR_HASH_ENTRIES]; struct hlist_head sk_dev_hash[LLC_SK_DEV_HASH_ENTRIES]; struct rcu_head rcu; }; static inline struct hlist_head *llc_sk_dev_hash(struct llc_sap *sap, int ifindex) { u32 bucket = hash_32(ifindex, LLC_SK_DEV_HASH_BITS); return &sap->sk_dev_hash[bucket]; } static inline u32 llc_sk_laddr_hashfn(struct llc_sap *sap, const struct llc_addr *laddr) { return hash_32(jhash(laddr->mac, sizeof(laddr->mac), 0), LLC_SK_LADDR_HASH_BITS); } static inline struct hlist_nulls_head *llc_sk_laddr_hash(struct llc_sap *sap, const struct llc_addr *laddr) { return &sap->sk_laddr_hash[llc_sk_laddr_hashfn(sap, laddr)]; } #define LLC_DEST_INVALID 0 /* Invalid LLC PDU type */ #define LLC_DEST_SAP 1 /* Type 1 goes here */ #define LLC_DEST_CONN 2 /* Type 2 goes here */ extern struct list_head llc_sap_list; int llc_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev); int llc_mac_hdr_init(struct sk_buff *skb, const unsigned char *sa, const unsigned char *da); void llc_add_pack(int type, void (*handler)(struct llc_sap *sap, struct sk_buff *skb)); void llc_remove_pack(int type); void llc_set_station_handler(void (*handler)(struct sk_buff *skb)); struct llc_sap *llc_sap_open(unsigned char lsap, int (*rcv)(struct sk_buff *skb,
struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)); static inline void llc_sap_hold(struct llc_sap *sap) { refcount_inc(&sap->refcnt); } static inline bool llc_sap_hold_safe(struct llc_sap *sap) { return refcount_inc_not_zero(&sap->refcnt); } void llc_sap_close(struct llc_sap *sap); static inline void llc_sap_put(struct llc_sap *sap) { if (refcount_dec_and_test(&sap->refcnt)) llc_sap_close(sap); } struct llc_sap *llc_sap_find(unsigned char sap_value); int llc_build_and_send_ui_pkt(struct llc_sap *sap, struct sk_buff *skb, const unsigned char *dmac, unsigned char dsap); void llc_sap_handler(struct llc_sap *sap, struct sk_buff *skb); void llc_conn_handler(struct llc_sap *sap, struct sk_buff *skb); void llc_station_init(void); void llc_station_exit(void); #ifdef CONFIG_PROC_FS int llc_proc_init(void); void llc_proc_exit(void); #else #define llc_proc_init() (0) #define llc_proc_exit() do { } while(0) #endif /* CONFIG_PROC_FS */ #ifdef CONFIG_SYSCTL int llc_sysctl_init(void); void llc_sysctl_exit(void); extern int sysctl_llc2_ack_timeout; extern int sysctl_llc2_busy_timeout; extern int sysctl_llc2_p_timeout; extern int sysctl_llc2_rej_timeout; #else #define llc_sysctl_init() (0) #define llc_sysctl_exit() do { } while(0) #endif /* CONFIG_SYSCTL */ #endif /* LLC_H */
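A minimal sketch of the SAP registration lifecycle declared above; the LSAP value, handler name and skb handling are hypothetical, and a real client would parse the PDU rather than drop it (the exact skb ownership rules are not spelled out in this header).

#include <linux/module.h>
#include <linux/skbuff.h>
#include <net/llc.h>

/* Hypothetical receive handler matching the llc_sap_open() callback type. */
static int my_llc_rcv(struct sk_buff *skb, struct net_device *dev,
		      struct packet_type *pt, struct net_device *orig_dev)
{
	kfree_skb(skb);		/* placeholder: drop the PDU */
	return 0;
}

static struct llc_sap *my_sap;

static int __init my_llc_client_init(void)
{
	my_sap = llc_sap_open(0x42, my_llc_rcv);	/* hypothetical LSAP */
	return my_sap ? 0 : -EBUSY;
}

static void __exit my_llc_client_exit(void)
{
	llc_sap_put(my_sap);	/* drop the reference; the SAP closes at zero */
}

module_init(my_llc_client_init);
module_exit(my_llc_client_exit);
MODULE_LICENSE("GPL");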
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __NET_GENERIC_NETLINK_H #define __NET_GENERIC_NETLINK_H #include <linux/net.h> #include <net/netlink.h> #include <net/net_namespace.h> #include <uapi/linux/genetlink.h> #define GENLMSG_DEFAULT_SIZE (NLMSG_DEFAULT_SIZE - GENL_HDRLEN) /* Non-parallel generic netlink requests are serialized by a global lock.
*/ void genl_lock(void); void genl_unlock(void); #define MODULE_ALIAS_GENL_FAMILY(family) \ MODULE_ALIAS_NET_PF_PROTO_NAME(PF_NETLINK, NETLINK_GENERIC, "-family-" family) /* Binding to multicast group requires %CAP_NET_ADMIN */ #define GENL_MCAST_CAP_NET_ADMIN BIT(0) /* Binding to multicast group requires %CAP_SYS_ADMIN */ #define GENL_MCAST_CAP_SYS_ADMIN BIT(1) /** * struct genl_multicast_group - generic netlink multicast group * @name: name of the multicast group, names are per-family * @flags: GENL_MCAST_* flags */ struct genl_multicast_group { char name[GENL_NAMSIZ]; u8 flags; }; struct genl_split_ops; struct genl_info; /** * struct genl_family - generic netlink family * @hdrsize: length of user specific header in bytes * @name: name of family * @version: protocol version * @maxattr: maximum number of attributes supported * @policy: netlink policy * @netnsok: set to true if the family can handle network * namespaces and should be presented in all of them * @parallel_ops: operations can be called in parallel and aren't * synchronized by the core genetlink code * @pre_doit: called before an operation's doit callback, it may * do additional, common, filtering and return an error * @post_doit: called after an operation's doit callback, it may * undo operations done by pre_doit, for example release locks * @bind: called when family multicast group is added to a netlink socket * @unbind: called when family multicast group is removed from a netlink socket * @module: pointer to the owning module (set to THIS_MODULE) * @mcgrps: multicast groups used by this family * @n_mcgrps: number of multicast groups * @resv_start_op: first operation for which reserved fields of the header * can be validated and policies are required (see below); * new families should leave this field at zero * @ops: the operations supported by this family * @n_ops: number of operations supported by this family * @small_ops: the small-struct operations supported by this family * @n_small_ops: number of small-struct operations supported by this family * @split_ops: the split do/dump form of operation definition * @n_split_ops: number of entries in @split_ops, not that with split do/dump * ops the number of entries is not the same as number of commands * @sock_priv_size: the size of per-socket private memory * @sock_priv_init: the per-socket private memory initializer * @sock_priv_destroy: the per-socket private memory destructor * * Attribute policies (the combination of @policy and @maxattr fields) * can be attached at the family level or at the operation level. * If both are present the per-operation policy takes precedence. * For operations before @resv_start_op lack of policy means that the core * will perform no attribute parsing or validation. For newer operations * if policy is not provided core will reject all TLV attributes. 
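 *
 * A minimal family definition might look like this (a sketch; all my_*
 * names, commands and attributes are hypothetical):
 *
 *	static const struct nla_policy my_policy[MY_ATTR_MAX + 1] = {
 *		[MY_ATTR_NAME]	= { .type = NLA_NUL_STRING },
 *	};
 *
 *	static const struct genl_ops my_ops[] = {
 *		{
 *			.cmd	= MY_CMD_GET,
 *			.doit	= my_cmd_get_doit,
 *		},
 *	};
 *
 *	static struct genl_family my_family __ro_after_init = {
 *		.name		= "my_family",
 *		.version	= 1,
 *		.maxattr	= MY_ATTR_MAX,
 *		.policy		= my_policy,
 *		.module		= THIS_MODULE,
 *		.ops		= my_ops,
 *		.n_ops		= ARRAY_SIZE(my_ops),
 *		.resv_start_op	= MY_CMD_GET + 1,
 *	};
 *
 * registered once with genl_register_family(&my_family).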
*/ struct genl_family { unsigned int hdrsize; char name[GENL_NAMSIZ]; unsigned int version; unsigned int maxattr; u8 netnsok:1; u8 parallel_ops:1; u8 n_ops; u8 n_small_ops; u8 n_split_ops; u8 n_mcgrps; u8 resv_start_op; const struct nla_policy *policy; int (*pre_doit)(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info); void (*post_doit)(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info); int (*bind)(int mcgrp); void (*unbind)(int mcgrp); const struct genl_ops * ops; const struct genl_small_ops *small_ops; const struct genl_split_ops *split_ops; const struct genl_multicast_group *mcgrps; struct module *module; size_t sock_priv_size; void (*sock_priv_init)(void *priv); void (*sock_priv_destroy)(void *priv); /* private: internal use only */ /* protocol family identifier */ int id; /* starting number of multicast group IDs in this family */ unsigned int mcgrp_offset; /* list of per-socket privs */ struct xarray *sock_privs; }; /** * struct genl_info - receiving information * @snd_seq: sending sequence number * @snd_portid: netlink portid of sender * @family: generic netlink family * @nlhdr: netlink message header * @genlhdr: generic netlink message header * @attrs: netlink attributes * @_net: network namespace * @ctx: storage space for the use by the family * @user_ptr: user pointers (deprecated, use ctx instead) * @extack: extended ACK report struct */ struct genl_info { u32 snd_seq; u32 snd_portid; const struct genl_family *family; const struct nlmsghdr * nlhdr; struct genlmsghdr * genlhdr; struct nlattr ** attrs; possible_net_t _net; union { u8 ctx[NETLINK_CTX_SIZE]; void * user_ptr[2]; }; struct netlink_ext_ack *extack; }; static inline struct net *genl_info_net(const struct genl_info *info) { return read_pnet(&info->_net); } static inline void genl_info_net_set(struct genl_info *info, struct net *net) { write_pnet(&info->_net, net); } static inline void *genl_info_userhdr(const struct genl_info *info) { return (u8 *)info->genlhdr + GENL_HDRLEN; } #define GENL_SET_ERR_MSG(info, msg) NL_SET_ERR_MSG((info)->extack, msg) #define GENL_SET_ERR_MSG_FMT(info, msg, args...) \ NL_SET_ERR_MSG_FMT((info)->extack, msg, ##args) /* Report that a root attribute is missing */ #define GENL_REQ_ATTR_CHECK(info, attr) ({ \ const struct genl_info *__info = (info); \ \ NL_REQ_ATTR_CHECK(__info->extack, NULL, __info->attrs, (attr)); \ }) enum genl_validate_flags { GENL_DONT_VALIDATE_STRICT = BIT(0), GENL_DONT_VALIDATE_DUMP = BIT(1), GENL_DONT_VALIDATE_DUMP_STRICT = BIT(2), }; /** * struct genl_small_ops - generic netlink operations (small version) * @cmd: command identifier * @internal_flags: flags used by the family * @flags: GENL_* flags (%GENL_ADMIN_PERM or %GENL_UNS_ADMIN_PERM) * @validate: validation flags from enum genl_validate_flags * @doit: standard command callback * @dumpit: callback for dumpers * * This is a cut-down version of struct genl_ops for users who don't need * most of the ancillary infra and want to save space. 
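 *
 * For example (a sketch, reusing the hypothetical names from the
 * struct genl_family example above):
 *
 *	static const struct genl_small_ops my_small_ops[] = {
 *		{
 *			.cmd	= MY_CMD_GET,
 *			.doit	= my_cmd_get_doit,
 *		},
 *	};
 *
 * wired up in the family with .small_ops = my_small_ops and
 * .n_small_ops = ARRAY_SIZE(my_small_ops).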
*/ struct genl_small_ops { int (*doit)(struct sk_buff *skb, struct genl_info *info); int (*dumpit)(struct sk_buff *skb, struct netlink_callback *cb); u8 cmd; u8 internal_flags; u8 flags; u8 validate; }; /** * struct genl_ops - generic netlink operations * @cmd: command identifier * @internal_flags: flags used by the family * @flags: GENL_* flags (%GENL_ADMIN_PERM or %GENL_UNS_ADMIN_PERM) * @maxattr: maximum number of attributes supported * @policy: netlink policy (takes precedence over family policy) * @validate: validation flags from enum genl_validate_flags * @doit: standard command callback * @start: start callback for dumps * @dumpit: callback for dumpers * @done: completion callback for dumps */ struct genl_ops { int (*doit)(struct sk_buff *skb, struct genl_info *info); int (*start)(struct netlink_callback *cb); int (*dumpit)(struct sk_buff *skb, struct netlink_callback *cb); int (*done)(struct netlink_callback *cb); const struct nla_policy *policy; unsigned int maxattr; u8 cmd; u8 internal_flags; u8 flags; u8 validate; }; /** * struct genl_split_ops - generic netlink operations (do/dump split version) * @cmd: command identifier * @internal_flags: flags used by the family * @flags: GENL_* flags (%GENL_ADMIN_PERM or %GENL_UNS_ADMIN_PERM) * @validate: validation flags from enum genl_validate_flags * @policy: netlink policy (takes precedence over family policy) * @maxattr: maximum number of attributes supported * * Do callbacks: * @pre_doit: called before an operation's @doit callback, it may * do additional, common, filtering and return an error * @doit: standard command callback * @post_doit: called after an operation's @doit callback, it may * undo operations done by pre_doit, for example release locks * * Dump callbacks: * @start: start callback for dumps * @dumpit: callback for dumpers * @done: completion callback for dumps * * Do callbacks can be used if %GENL_CMD_CAP_DO is set in @flags. * Dump callbacks can be used if %GENL_CMD_CAP_DUMP is set in @flags. * Exactly one of those flags must be set. */ struct genl_split_ops { union { struct { int (*pre_doit)(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info); int (*doit)(struct sk_buff *skb, struct genl_info *info); void (*post_doit)(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info); }; struct { int (*start)(struct netlink_callback *cb); int (*dumpit)(struct sk_buff *skb, struct netlink_callback *cb); int (*done)(struct netlink_callback *cb); }; }; const struct nla_policy *policy; unsigned int maxattr; u8 cmd; u8 internal_flags; u8 flags; u8 validate; }; /** * struct genl_dumpit_info - info that is available during dumpit op call * @op: generic netlink ops - for internal genl code usage * @attrs: netlink attributes * @info: struct genl_info describing the request */ struct genl_dumpit_info { struct genl_split_ops op; struct genl_info info; }; static inline const struct genl_dumpit_info * genl_dumpit_info(struct netlink_callback *cb) { return cb->data; } static inline const struct genl_info * genl_info_dump(struct netlink_callback *cb) { return &genl_dumpit_info(cb)->info; } /** * genl_info_init_ntf() - initialize genl_info for notifications * @info: genl_info struct to set up * @family: pointer to the genetlink family * @cmd: command to be used in the notification * * Initialize a locally declared struct genl_info to pass to various APIs. * Intended to be used when creating notifications. 
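 *
 * A typical notification path might look like this (a sketch; my_family,
 * MY_CMD_NOTIFY, the attributes and the multicast group offset are
 * hypothetical, and error handling is trimmed):
 *
 *	struct genl_info info;
 *	struct sk_buff *ntf;
 *	void *hdr;
 *
 *	genl_info_init_ntf(&info, &my_family, MY_CMD_NOTIFY);
 *	ntf = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
 *	if (!ntf)
 *		return;
 *	hdr = genlmsg_iput(ntf, &info);
 *	... nla_put() the payload ...
 *	genlmsg_end(ntf, hdr);
 *	genlmsg_multicast(&my_family, ntf, 0, MY_MCGRP_OFFSET, GFP_KERNEL);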
*/ static inline void genl_info_init_ntf(struct genl_info *info, const struct genl_family *family, u8 cmd) { struct genlmsghdr *hdr = (void *) &info->user_ptr[0]; memset(info, 0, sizeof(*info)); info->family = family; info->genlhdr = hdr; hdr->cmd = cmd; } static inline bool genl_info_is_ntf(const struct genl_info *info) { return !info->nlhdr; } void *__genl_sk_priv_get(struct genl_family *family, struct sock *sk); void *genl_sk_priv_get(struct genl_family *family, struct sock *sk); int genl_register_family(struct genl_family *family); int genl_unregister_family(const struct genl_family *family); void genl_notify(const struct genl_family *family, struct sk_buff *skb, struct genl_info *info, u32 group, gfp_t flags); void *genlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, const struct genl_family *family, int flags, u8 cmd); static inline void * __genlmsg_iput(struct sk_buff *skb, const struct genl_info *info, int flags) { return genlmsg_put(skb, info->snd_portid, info->snd_seq, info->family, flags, info->genlhdr->cmd); } /** * genlmsg_iput - start genetlink message based on genl_info * @skb: skb in which message header will be placed * @info: genl_info as provided to do/dump handlers * * Convenience wrapper which starts a genetlink message based on * information in user request. @info should be either the struct passed * by genetlink core to do/dump handlers (when constructing replies to * such requests) or a struct initialized by genl_info_init_ntf() * when constructing notifications. * * Returns pointer to new genetlink header. */ static inline void * genlmsg_iput(struct sk_buff *skb, const struct genl_info *info) { return __genlmsg_iput(skb, info, 0); } /** * genlmsg_nlhdr - Obtain netlink header from user specified header * @user_hdr: user header as returned from genlmsg_put() * * Returns pointer to netlink header. */ static inline struct nlmsghdr *genlmsg_nlhdr(void *user_hdr) { return (struct nlmsghdr *)((char *)user_hdr - GENL_HDRLEN - NLMSG_HDRLEN); } /** * genlmsg_parse_deprecated - parse attributes of a genetlink message * @nlh: netlink message header * @family: genetlink message family * @tb: destination array with maxtype+1 elements * @maxtype: maximum attribute type to be expected * @policy: validation policy * @extack: extended ACK report struct */ static inline int genlmsg_parse_deprecated(const struct nlmsghdr *nlh, const struct genl_family *family, struct nlattr *tb[], int maxtype, const struct nla_policy *policy, struct netlink_ext_ack *extack) { return __nlmsg_parse(nlh, family->hdrsize + GENL_HDRLEN, tb, maxtype, policy, NL_VALIDATE_LIBERAL, extack); } /** * genlmsg_parse - parse attributes of a genetlink message * @nlh: netlink message header * @family: genetlink message family * @tb: destination array with maxtype+1 elements * @maxtype: maximum attribute type to be expected * @policy: validation policy * @extack: extended ACK report struct */ static inline int genlmsg_parse(const struct nlmsghdr *nlh, const struct genl_family *family, struct nlattr *tb[], int maxtype, const struct nla_policy *policy, struct netlink_ext_ack *extack) { return __nlmsg_parse(nlh, family->hdrsize + GENL_HDRLEN, tb, maxtype, policy, NL_VALIDATE_STRICT, extack); } /** * genl_dump_check_consistent - check if sequence is consistent and advertise if not * @cb: netlink callback structure that stores the sequence number * @user_hdr: user header as returned from genlmsg_put() * * Cf. nl_dump_check_consistent(), this just provides a wrapper to make it * simpler to use with generic netlink. 
*/ static inline void genl_dump_check_consistent(struct netlink_callback *cb, void *user_hdr) { nl_dump_check_consistent(cb, genlmsg_nlhdr(user_hdr)); } /** * genlmsg_put_reply - Add generic netlink header to a reply message * @skb: socket buffer holding the message * @info: receiver info * @family: generic netlink family * @flags: netlink message flags * @cmd: generic netlink command * * Returns pointer to user specific header */ static inline void *genlmsg_put_reply(struct sk_buff *skb, struct genl_info *info, const struct genl_family *family, int flags, u8 cmd) { return genlmsg_put(skb, info->snd_portid, info->snd_seq, family, flags, cmd); } /** * genlmsg_end - Finalize a generic netlink message * @skb: socket buffer the message is stored in * @hdr: user specific header */ static inline void genlmsg_end(struct sk_buff *skb, void *hdr) { nlmsg_end(skb, hdr - GENL_HDRLEN - NLMSG_HDRLEN); } /** * genlmsg_cancel - Cancel construction of a generic netlink message * @skb: socket buffer the message is stored in * @hdr: generic netlink message header */ static inline void genlmsg_cancel(struct sk_buff *skb, void *hdr) { if (hdr) nlmsg_cancel(skb, hdr - GENL_HDRLEN - NLMSG_HDRLEN); } /** * genlmsg_multicast_netns_filtered - multicast a netlink message * to a specific netns with filter * function * @family: the generic netlink family * @net: the net namespace * @skb: netlink message as socket buffer * @portid: own netlink portid to avoid sending to yourself * @group: offset of multicast group in groups array * @flags: allocation flags * @filter: filter function * @filter_data: filter function private data * * Return: 0 on success, negative error code for failure. */ static inline int genlmsg_multicast_netns_filtered(const struct genl_family *family, struct net *net, struct sk_buff *skb, u32 portid, unsigned int group, gfp_t flags, netlink_filter_fn filter, void *filter_data) { if (WARN_ON_ONCE(group >= family->n_mcgrps)) return -EINVAL; group = family->mcgrp_offset + group; return nlmsg_multicast_filtered(net->genl_sock, skb, portid, group, flags, filter, filter_data); } /** * genlmsg_multicast_netns - multicast a netlink message to a specific netns * @family: the generic netlink family * @net: the net namespace * @skb: netlink message as socket buffer * @portid: own netlink portid to avoid sending to yourself * @group: offset of multicast group in groups array * @flags: allocation flags */ static inline int genlmsg_multicast_netns(const struct genl_family *family, struct net *net, struct sk_buff *skb, u32 portid, unsigned int group, gfp_t flags) { return genlmsg_multicast_netns_filtered(family, net, skb, portid, group, flags, NULL, NULL); } /** * genlmsg_multicast - multicast a netlink message to the default netns * @family: the generic netlink family * @skb: netlink message as socket buffer * @portid: own netlink portid to avoid sending to yourself * @group: offset of multicast group in groups array * @flags: allocation flags */ static inline int genlmsg_multicast(const struct genl_family *family, struct sk_buff *skb, u32 portid, unsigned int group, gfp_t flags) { return genlmsg_multicast_netns(family, &init_net, skb, portid, group, flags); } /** * genlmsg_multicast_allns - multicast a netlink message to all net namespaces * @family: the generic netlink family * @skb: netlink message as socket buffer * @portid: own netlink portid to avoid sending to yourself * @group: offset of multicast group in groups array * * This function must hold the RTNL or rcu_read_lock(). 
*/ int genlmsg_multicast_allns(const struct genl_family *family, struct sk_buff *skb, u32 portid, unsigned int group); /** * genlmsg_unicast - unicast a netlink message * @net: network namespace to look up @portid in * @skb: netlink message as socket buffer * @portid: netlink portid of the destination socket */ static inline int genlmsg_unicast(struct net *net, struct sk_buff *skb, u32 portid) { return nlmsg_unicast(net->genl_sock, skb, portid); } /** * genlmsg_reply - reply to a request * @skb: netlink message to be sent back * @info: receiver information */ static inline int genlmsg_reply(struct sk_buff *skb, struct genl_info *info) { return genlmsg_unicast(genl_info_net(info), skb, info->snd_portid); } /** * genlmsg_data - head of message payload * @gnlh: genetlink message header */ static inline void *genlmsg_data(const struct genlmsghdr *gnlh) { return ((unsigned char *) gnlh + GENL_HDRLEN); } /** * genlmsg_len - length of message payload * @gnlh: genetlink message header */ static inline int genlmsg_len(const struct genlmsghdr *gnlh) { struct nlmsghdr *nlh = (struct nlmsghdr *)((unsigned char *)gnlh - NLMSG_HDRLEN); return (nlh->nlmsg_len - GENL_HDRLEN - NLMSG_HDRLEN); } /** * genlmsg_msg_size - length of genetlink message not including padding * @payload: length of message payload */ static inline int genlmsg_msg_size(int payload) { return GENL_HDRLEN + payload; } /** * genlmsg_total_size - length of genetlink message including padding * @payload: length of message payload */ static inline int genlmsg_total_size(int payload) { return NLMSG_ALIGN(genlmsg_msg_size(payload)); } /** * genlmsg_new - Allocate a new generic netlink message * @payload: size of the message payload * @flags: the type of memory to allocate. */ static inline struct sk_buff *genlmsg_new(size_t payload, gfp_t flags) { return nlmsg_new(genlmsg_total_size(payload), flags); } /** * genl_set_err - report error to genetlink broadcast listeners * @family: the generic netlink family * @net: the network namespace to report the error to * @portid: the PORTID of a process that we want to skip (if any) * @group: the broadcast group that will notice the error * (this is the offset of the multicast group in the groups array) * @code: error code, must be negative (as usual in kernelspace) * * This function returns the number of broadcast listeners that have set the * NETLINK_RECV_NO_ENOBUFS socket option. */ static inline int genl_set_err(const struct genl_family *family, struct net *net, u32 portid, u32 group, int code) { if (WARN_ON_ONCE(group >= family->n_mcgrps)) return -EINVAL; group = family->mcgrp_offset + group; return netlink_set_err(net->genl_sock, portid, group, code); } static inline int genl_has_listeners(const struct genl_family *family, struct net *net, unsigned int group) { if (WARN_ON_ONCE(group >= family->n_mcgrps)) return -EINVAL; group = family->mcgrp_offset + group; return netlink_has_listeners(net->genl_sock, group); } #endif /* __NET_GENERIC_NETLINK_H */
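The genetlink header above documents the family registration and message-construction API. Below is a minimal usage sketch, not taken from any kernel source: a hypothetical "demo" family with one command that echoes a u32 attribute back to the requester. All DEMO_* names, the attribute layout and command numbering are invented for illustration; only interfaces declared in the header above (genl_register_family(), genlmsg_new(), genlmsg_iput(), genlmsg_end(), genlmsg_reply(), GENL_REQ_ATTR_CHECK()) are assumed.

#include <linux/module.h>
#include <net/genetlink.h>

/* Hypothetical attribute/command space, for illustration only. */
enum { DEMO_ATTR_UNSPEC, DEMO_ATTR_VALUE, __DEMO_ATTR_MAX };
#define DEMO_ATTR_MAX (__DEMO_ATTR_MAX - 1)
enum { DEMO_CMD_UNSPEC, DEMO_CMD_ECHO, __DEMO_CMD_MAX };

static const struct nla_policy demo_policy[DEMO_ATTR_MAX + 1] = {
	[DEMO_ATTR_VALUE] = { .type = NLA_U32 },
};

static int demo_echo_doit(struct sk_buff *skb, struct genl_info *info)
{
	struct sk_buff *msg;
	void *hdr;

	/* Reports the missing attribute via extack, as documented above. */
	if (GENL_REQ_ATTR_CHECK(info, DEMO_ATTR_VALUE))
		return -EINVAL;

	msg = genlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
	if (!msg)
		return -ENOMEM;

	/* Start the reply using the request's portid/seq/family. */
	hdr = genlmsg_iput(msg, info);
	if (!hdr)
		goto err_free;
	if (nla_put_u32(msg, DEMO_ATTR_VALUE,
			nla_get_u32(info->attrs[DEMO_ATTR_VALUE])))
		goto err_cancel;
	genlmsg_end(msg, hdr);

	return genlmsg_reply(msg, info);

err_cancel:
	genlmsg_cancel(msg, hdr);
err_free:
	nlmsg_free(msg);
	return -EMSGSIZE;
}

static const struct genl_ops demo_ops[] = {
	{ .cmd = DEMO_CMD_ECHO, .doit = demo_echo_doit, },
};

static struct genl_family demo_family = {
	.name		= "demo",
	.version	= 1,
	.maxattr	= DEMO_ATTR_MAX,
	.policy		= demo_policy,
	.ops		= demo_ops,
	.n_ops		= ARRAY_SIZE(demo_ops),
	.resv_start_op	= __DEMO_CMD_MAX,
	.module		= THIS_MODULE,
};

/* Typically called from the owning module's init/exit. */
static int __init demo_genl_init(void)
{
	return genl_register_family(&demo_family);
}

static void __exit demo_genl_exit(void)
{
	genl_unregister_family(&demo_family);
}
module_init(demo_genl_init);
module_exit(demo_genl_exit);
MODULE_LICENSE("GPL");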
// SPDX-License-Identifier: GPL-2.0-only #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> #include <linux/netfilter.h> #include <linux/rhashtable.h> #include <linux/netdevice.h> #include <net/ip.h> #include <net/ip6_route.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_flow_table.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_tuple.h> static DEFINE_MUTEX(flowtable_lock); static LIST_HEAD(flowtables); static void flow_offload_fill_dir(struct flow_offload *flow, enum flow_offload_tuple_dir dir) { struct flow_offload_tuple *ft = &flow->tuplehash[dir].tuple; struct nf_conntrack_tuple *ctt = &flow->ct->tuplehash[dir].tuple; ft->dir = dir; switch (ctt->src.l3num)
{ case NFPROTO_IPV4: ft->src_v4 = ctt->src.u3.in; ft->dst_v4 = ctt->dst.u3.in; break; case NFPROTO_IPV6: ft->src_v6 = ctt->src.u3.in6; ft->dst_v6 = ctt->dst.u3.in6; break; } ft->l3proto = ctt->src.l3num; ft->l4proto = ctt->dst.protonum; switch (ctt->dst.protonum) { case IPPROTO_TCP: case IPPROTO_UDP: ft->src_port = ctt->src.u.tcp.port; ft->dst_port = ctt->dst.u.tcp.port; break; } } struct flow_offload *flow_offload_alloc(struct nf_conn *ct) { struct flow_offload *flow; if (unlikely(nf_ct_is_dying(ct))) return NULL; flow = kzalloc(sizeof(*flow), GFP_ATOMIC); if (!flow) return NULL; refcount_inc(&ct->ct_general.use); flow->ct = ct; flow_offload_fill_dir(flow, FLOW_OFFLOAD_DIR_ORIGINAL); flow_offload_fill_dir(flow, FLOW_OFFLOAD_DIR_REPLY); if (ct->status & IPS_SRC_NAT) __set_bit(NF_FLOW_SNAT, &flow->flags); if (ct->status & IPS_DST_NAT) __set_bit(NF_FLOW_DNAT, &flow->flags); return flow; } EXPORT_SYMBOL_GPL(flow_offload_alloc); static u32 flow_offload_dst_cookie(struct flow_offload_tuple *flow_tuple) { if (flow_tuple->l3proto == NFPROTO_IPV6) return rt6_get_cookie(dst_rt6_info(flow_tuple->dst_cache)); return 0; } static struct dst_entry *nft_route_dst_fetch(struct nf_flow_route *route, enum flow_offload_tuple_dir dir) { struct dst_entry *dst = route->tuple[dir].dst; route->tuple[dir].dst = NULL; return dst; } static int flow_offload_fill_route(struct flow_offload *flow, struct nf_flow_route *route, enum flow_offload_tuple_dir dir) { struct flow_offload_tuple *flow_tuple = &flow->tuplehash[dir].tuple; struct dst_entry *dst = nft_route_dst_fetch(route, dir); int i, j = 0; switch (flow_tuple->l3proto) { case NFPROTO_IPV4: flow_tuple->mtu = ip_dst_mtu_maybe_forward(dst, true); break; case NFPROTO_IPV6: flow_tuple->mtu = ip6_dst_mtu_maybe_forward(dst, true); break; } flow_tuple->iifidx = route->tuple[dir].in.ifindex; for (i = route->tuple[dir].in.num_encaps - 1; i >= 0; i--) { flow_tuple->encap[j].id = route->tuple[dir].in.encap[i].id; flow_tuple->encap[j].proto = route->tuple[dir].in.encap[i].proto; if (route->tuple[dir].in.ingress_vlans & BIT(i)) flow_tuple->in_vlan_ingress |= BIT(j); j++; } flow_tuple->encap_num = route->tuple[dir].in.num_encaps; switch (route->tuple[dir].xmit_type) { case FLOW_OFFLOAD_XMIT_DIRECT: memcpy(flow_tuple->out.h_dest, route->tuple[dir].out.h_dest, ETH_ALEN); memcpy(flow_tuple->out.h_source, route->tuple[dir].out.h_source, ETH_ALEN); flow_tuple->out.ifidx = route->tuple[dir].out.ifindex; flow_tuple->out.hw_ifidx = route->tuple[dir].out.hw_ifindex; dst_release(dst); break; case FLOW_OFFLOAD_XMIT_XFRM: case FLOW_OFFLOAD_XMIT_NEIGH: flow_tuple->dst_cache = dst; flow_tuple->dst_cookie = flow_offload_dst_cookie(flow_tuple); break; default: WARN_ON_ONCE(1); break; } flow_tuple->xmit_type = route->tuple[dir].xmit_type; return 0; } static void nft_flow_dst_release(struct flow_offload *flow, enum flow_offload_tuple_dir dir) { if (flow->tuplehash[dir].tuple.xmit_type == FLOW_OFFLOAD_XMIT_NEIGH || flow->tuplehash[dir].tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM) dst_release(flow->tuplehash[dir].tuple.dst_cache); } void flow_offload_route_init(struct flow_offload *flow, struct nf_flow_route *route) { flow_offload_fill_route(flow, route, FLOW_OFFLOAD_DIR_ORIGINAL); flow_offload_fill_route(flow, route, FLOW_OFFLOAD_DIR_REPLY); flow->type = NF_FLOW_OFFLOAD_ROUTE; } EXPORT_SYMBOL_GPL(flow_offload_route_init); static void flow_offload_fixup_tcp(struct ip_ct_tcp *tcp) { tcp->seen[0].td_maxwin = 0; tcp->seen[1].td_maxwin = 0; } static void flow_offload_fixup_ct(struct nf_conn *ct) { 
struct net *net = nf_ct_net(ct); int l4num = nf_ct_protonum(ct); s32 timeout; if (l4num == IPPROTO_TCP) { struct nf_tcp_net *tn = nf_tcp_pernet(net); flow_offload_fixup_tcp(&ct->proto.tcp); timeout = tn->timeouts[ct->proto.tcp.state]; timeout -= tn->offload_timeout; } else if (l4num == IPPROTO_UDP) { struct nf_udp_net *tn = nf_udp_pernet(net); enum udp_conntrack state = test_bit(IPS_SEEN_REPLY_BIT, &ct->status) ? UDP_CT_REPLIED : UDP_CT_UNREPLIED; timeout = tn->timeouts[state]; timeout -= tn->offload_timeout; } else { return; } if (timeout < 0) timeout = 0; if (nf_flow_timeout_delta(READ_ONCE(ct->timeout)) > (__s32)timeout) WRITE_ONCE(ct->timeout, nfct_time_stamp + timeout); } static void flow_offload_route_release(struct flow_offload *flow) { nft_flow_dst_release(flow, FLOW_OFFLOAD_DIR_ORIGINAL); nft_flow_dst_release(flow, FLOW_OFFLOAD_DIR_REPLY); } void flow_offload_free(struct flow_offload *flow) { switch (flow->type) { case NF_FLOW_OFFLOAD_ROUTE: flow_offload_route_release(flow); break; default: break; } nf_ct_put(flow->ct); kfree_rcu(flow, rcu_head); } EXPORT_SYMBOL_GPL(flow_offload_free); static u32 flow_offload_hash(const void *data, u32 len, u32 seed) { const struct flow_offload_tuple *tuple = data; return jhash(tuple, offsetof(struct flow_offload_tuple, __hash), seed); } static u32 flow_offload_hash_obj(const void *data, u32 len, u32 seed) { const struct flow_offload_tuple_rhash *tuplehash = data; return jhash(&tuplehash->tuple, offsetof(struct flow_offload_tuple, __hash), seed); } static int flow_offload_hash_cmp(struct rhashtable_compare_arg *arg, const void *ptr) { const struct flow_offload_tuple *tuple = arg->key; const struct flow_offload_tuple_rhash *x = ptr; if (memcmp(&x->tuple, tuple, offsetof(struct flow_offload_tuple, __hash))) return 1; return 0; } static const struct rhashtable_params nf_flow_offload_rhash_params = { .head_offset = offsetof(struct flow_offload_tuple_rhash, node), .hashfn = flow_offload_hash, .obj_hashfn = flow_offload_hash_obj, .obj_cmpfn = flow_offload_hash_cmp, .automatic_shrinking = true, }; unsigned long flow_offload_get_timeout(struct flow_offload *flow) { unsigned long timeout = NF_FLOW_TIMEOUT; struct net *net = nf_ct_net(flow->ct); int l4num = nf_ct_protonum(flow->ct); if (l4num == IPPROTO_TCP) { struct nf_tcp_net *tn = nf_tcp_pernet(net); timeout = tn->offload_timeout; } else if (l4num == IPPROTO_UDP) { struct nf_udp_net *tn = nf_udp_pernet(net); timeout = tn->offload_timeout; } return timeout; } int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow) { int err; flow->timeout = nf_flowtable_time_stamp + flow_offload_get_timeout(flow); err = rhashtable_insert_fast(&flow_table->rhashtable, &flow->tuplehash[0].node, nf_flow_offload_rhash_params); if (err < 0) return err; err = rhashtable_insert_fast(&flow_table->rhashtable, &flow->tuplehash[1].node, nf_flow_offload_rhash_params); if (err < 0) { rhashtable_remove_fast(&flow_table->rhashtable, &flow->tuplehash[0].node, nf_flow_offload_rhash_params); return err; } nf_ct_offload_timeout(flow->ct); if (nf_flowtable_hw_offload(flow_table)) { __set_bit(NF_FLOW_HW, &flow->flags); nf_flow_offload_add(flow_table, flow); } return 0; } EXPORT_SYMBOL_GPL(flow_offload_add); void flow_offload_refresh(struct nf_flowtable *flow_table, struct flow_offload *flow, bool force) { u32 timeout; timeout = nf_flowtable_time_stamp + flow_offload_get_timeout(flow); if (force || timeout - READ_ONCE(flow->timeout) > HZ) WRITE_ONCE(flow->timeout, timeout); else return; if 
(likely(!nf_flowtable_hw_offload(flow_table))) return; nf_flow_offload_add(flow_table, flow); } EXPORT_SYMBOL_GPL(flow_offload_refresh); static inline bool nf_flow_has_expired(const struct flow_offload *flow) { return nf_flow_timeout_delta(flow->timeout) <= 0; } static void flow_offload_del(struct nf_flowtable *flow_table, struct flow_offload *flow) { rhashtable_remove_fast(&flow_table->rhashtable, &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node, nf_flow_offload_rhash_params); rhashtable_remove_fast(&flow_table->rhashtable, &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node, nf_flow_offload_rhash_params); flow_offload_free(flow); } void flow_offload_teardown(struct flow_offload *flow) { clear_bit(IPS_OFFLOAD_BIT, &flow->ct->status); set_bit(NF_FLOW_TEARDOWN, &flow->flags); flow_offload_fixup_ct(flow->ct); } EXPORT_SYMBOL_GPL(flow_offload_teardown); struct flow_offload_tuple_rhash * flow_offload_lookup(struct nf_flowtable *flow_table, struct flow_offload_tuple *tuple) { struct flow_offload_tuple_rhash *tuplehash; struct flow_offload *flow; int dir; tuplehash = rhashtable_lookup(&flow_table->rhashtable, tuple, nf_flow_offload_rhash_params); if (!tuplehash) return NULL; dir = tuplehash->tuple.dir; flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); if (test_bit(NF_FLOW_TEARDOWN, &flow->flags)) return NULL; if (unlikely(nf_ct_is_dying(flow->ct))) return NULL; return tuplehash; } EXPORT_SYMBOL_GPL(flow_offload_lookup); static int nf_flow_table_iterate(struct nf_flowtable *flow_table, void (*iter)(struct nf_flowtable *flowtable, struct flow_offload *flow, void *data), void *data) { struct flow_offload_tuple_rhash *tuplehash; struct rhashtable_iter hti; struct flow_offload *flow; int err = 0; rhashtable_walk_enter(&flow_table->rhashtable, &hti); rhashtable_walk_start(&hti); while ((tuplehash = rhashtable_walk_next(&hti))) { if (IS_ERR(tuplehash)) { if (PTR_ERR(tuplehash) != -EAGAIN) { err = PTR_ERR(tuplehash); break; } continue; } if (tuplehash->tuple.dir) continue; flow = container_of(tuplehash, struct flow_offload, tuplehash[0]); iter(flow_table, flow, data); } rhashtable_walk_stop(&hti); rhashtable_walk_exit(&hti); return err; } static bool nf_flow_custom_gc(struct nf_flowtable *flow_table, const struct flow_offload *flow) { return flow_table->type->gc && flow_table->type->gc(flow); } static void nf_flow_offload_gc_step(struct nf_flowtable *flow_table, struct flow_offload *flow, void *data) { if (nf_flow_has_expired(flow) || nf_ct_is_dying(flow->ct) || nf_flow_custom_gc(flow_table, flow)) flow_offload_teardown(flow); if (test_bit(NF_FLOW_TEARDOWN, &flow->flags)) { if (test_bit(NF_FLOW_HW, &flow->flags)) { if (!test_bit(NF_FLOW_HW_DYING, &flow->flags)) nf_flow_offload_del(flow_table, flow); else if (test_bit(NF_FLOW_HW_DEAD, &flow->flags)) flow_offload_del(flow_table, flow); } else { flow_offload_del(flow_table, flow); } } else if (test_bit(NF_FLOW_HW, &flow->flags)) { nf_flow_offload_stats(flow_table, flow); } } void nf_flow_table_gc_run(struct nf_flowtable *flow_table) { nf_flow_table_iterate(flow_table, nf_flow_offload_gc_step, NULL); } static void nf_flow_offload_work_gc(struct work_struct *work) { struct nf_flowtable *flow_table; flow_table = container_of(work, struct nf_flowtable, gc_work.work); nf_flow_table_gc_run(flow_table); queue_delayed_work(system_power_efficient_wq, &flow_table->gc_work, HZ); } static void nf_flow_nat_port_tcp(struct sk_buff *skb, unsigned int thoff, __be16 port, __be16 new_port) { struct tcphdr *tcph; tcph = (void *)(skb_network_header(skb) + thoff); 
inet_proto_csum_replace2(&tcph->check, skb, port, new_port, false); } static void nf_flow_nat_port_udp(struct sk_buff *skb, unsigned int thoff, __be16 port, __be16 new_port) { struct udphdr *udph; udph = (void *)(skb_network_header(skb) + thoff); if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) { inet_proto_csum_replace2(&udph->check, skb, port, new_port, false); if (!udph->check) udph->check = CSUM_MANGLED_0; } } static void nf_flow_nat_port(struct sk_buff *skb, unsigned int thoff, u8 protocol, __be16 port, __be16 new_port) { switch (protocol) { case IPPROTO_TCP: nf_flow_nat_port_tcp(skb, thoff, port, new_port); break; case IPPROTO_UDP: nf_flow_nat_port_udp(skb, thoff, port, new_port); break; } } void nf_flow_snat_port(const struct flow_offload *flow, struct sk_buff *skb, unsigned int thoff, u8 protocol, enum flow_offload_tuple_dir dir) { struct flow_ports *hdr; __be16 port, new_port; hdr = (void *)(skb_network_header(skb) + thoff); switch (dir) { case FLOW_OFFLOAD_DIR_ORIGINAL: port = hdr->source; new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_port; hdr->source = new_port; break; case FLOW_OFFLOAD_DIR_REPLY: port = hdr->dest; new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_port; hdr->dest = new_port; break; } nf_flow_nat_port(skb, thoff, protocol, port, new_port); } EXPORT_SYMBOL_GPL(nf_flow_snat_port); void nf_flow_dnat_port(const struct flow_offload *flow, struct sk_buff *skb, unsigned int thoff, u8 protocol, enum flow_offload_tuple_dir dir) { struct flow_ports *hdr; __be16 port, new_port; hdr = (void *)(skb_network_header(skb) + thoff); switch (dir) { case FLOW_OFFLOAD_DIR_ORIGINAL: port = hdr->dest; new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_port; hdr->dest = new_port; break; case FLOW_OFFLOAD_DIR_REPLY: port = hdr->source; new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_port; hdr->source = new_port; break; } nf_flow_nat_port(skb, thoff, protocol, port, new_port); } EXPORT_SYMBOL_GPL(nf_flow_dnat_port); int nf_flow_table_init(struct nf_flowtable *flowtable) { int err; INIT_DELAYED_WORK(&flowtable->gc_work, nf_flow_offload_work_gc); flow_block_init(&flowtable->flow_block); init_rwsem(&flowtable->flow_block_lock); err = rhashtable_init(&flowtable->rhashtable, &nf_flow_offload_rhash_params); if (err < 0) return err; queue_delayed_work(system_power_efficient_wq, &flowtable->gc_work, HZ); mutex_lock(&flowtable_lock); list_add(&flowtable->list, &flowtables); mutex_unlock(&flowtable_lock); return 0; } EXPORT_SYMBOL_GPL(nf_flow_table_init); static void nf_flow_table_do_cleanup(struct nf_flowtable *flow_table, struct flow_offload *flow, void *data) { struct net_device *dev = data; if (!dev) { flow_offload_teardown(flow); return; } if (net_eq(nf_ct_net(flow->ct), dev_net(dev)) && (flow->tuplehash[0].tuple.iifidx == dev->ifindex || flow->tuplehash[1].tuple.iifidx == dev->ifindex)) flow_offload_teardown(flow); } void nf_flow_table_gc_cleanup(struct nf_flowtable *flowtable, struct net_device *dev) { nf_flow_table_iterate(flowtable, nf_flow_table_do_cleanup, dev); flush_delayed_work(&flowtable->gc_work); nf_flow_table_offload_flush(flowtable); } void nf_flow_table_cleanup(struct net_device *dev) { struct nf_flowtable *flowtable; mutex_lock(&flowtable_lock); list_for_each_entry(flowtable, &flowtables, list) nf_flow_table_gc_cleanup(flowtable, dev); mutex_unlock(&flowtable_lock); } EXPORT_SYMBOL_GPL(nf_flow_table_cleanup); void nf_flow_table_free(struct nf_flowtable *flow_table) { mutex_lock(&flowtable_lock); 
list_del(&flow_table->list); mutex_unlock(&flowtable_lock); cancel_delayed_work_sync(&flow_table->gc_work); nf_flow_table_offload_flush(flow_table); /* ... no more pending work after this stage ... */ nf_flow_table_iterate(flow_table, nf_flow_table_do_cleanup, NULL); nf_flow_table_gc_run(flow_table); nf_flow_table_offload_flush_cleanup(flow_table); rhashtable_destroy(&flow_table->rhashtable); } EXPORT_SYMBOL_GPL(nf_flow_table_free); static int nf_flow_table_init_net(struct net *net) { net->ft.stat = alloc_percpu(struct nf_flow_table_stat); return net->ft.stat ? 0 : -ENOMEM; } static void nf_flow_table_fini_net(struct net *net) { free_percpu(net->ft.stat); } static int nf_flow_table_pernet_init(struct net *net) { int ret; ret = nf_flow_table_init_net(net); if (ret < 0) return ret; ret = nf_flow_table_init_proc(net); if (ret < 0) goto out_proc; return 0; out_proc: nf_flow_table_fini_net(net); return ret; } static void nf_flow_table_pernet_exit(struct list_head *net_exit_list) { struct net *net; list_for_each_entry(net, net_exit_list, exit_list) { nf_flow_table_fini_proc(net); nf_flow_table_fini_net(net); } } static struct pernet_operations nf_flow_table_net_ops = { .init = nf_flow_table_pernet_init, .exit_batch = nf_flow_table_pernet_exit, }; static int __init nf_flow_table_module_init(void) { int ret; ret = register_pernet_subsys(&nf_flow_table_net_ops); if (ret < 0) return ret; ret = nf_flow_table_offload_init(); if (ret) goto out_offload; ret = nf_flow_register_bpf(); if (ret) goto out_bpf; return 0; out_bpf: nf_flow_table_offload_exit(); out_offload: unregister_pernet_subsys(&nf_flow_table_net_ops); return ret; } static void __exit nf_flow_table_module_exit(void) { nf_flow_table_offload_exit(); unregister_pernet_subsys(&nf_flow_table_net_ops); } module_init(nf_flow_table_module_init); module_exit(nf_flow_table_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); MODULE_DESCRIPTION("Netfilter flow table module");
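flow_offload_alloc(), flow_offload_route_init() and flow_offload_add() above form the control-plane path for placing an established connection into a flowtable. The following is a hedged caller sketch, assuming a confirmed conntrack entry and a struct nf_flow_route already populated for both directions (which is what the nftables flow_offload expression does before calling into this file); example_offload_ct() is an invented name.

#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_flow_table.h>

static int example_offload_ct(struct nf_flowtable *flowtable,
			      struct nf_conn *ct,
			      struct nf_flow_route *route)
{
	struct flow_offload *flow;
	int err;

	/* Grabs a reference on @ct; returns NULL if @ct is dying. */
	flow = flow_offload_alloc(ct);
	if (!flow)
		return -ENOMEM;

	/* Consumes the dst references stashed in @route for both dirs. */
	flow_offload_route_init(flow, route);

	/* Inserts both tuple directions and arms the GC timeout. */
	err = flow_offload_add(flowtable, flow);
	if (err < 0)
		flow_offload_free(flow);

	return err;
}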
// SPDX-License-Identifier: GPL-2.0-only /* Kernel module to match running CPU */ /* * Might be used to distribute connections on several daemons, if * RPS (Receive Packet Steering) is enabled or NIC is multiqueue capable, * each RX queue IRQ affined to one CPU (1:1 mapping) */ /* (C) 2010 Eric Dumazet */ #include <linux/module.h> #include <linux/skbuff.h> #include <linux/netfilter/xt_cpu.h> #include <linux/netfilter/x_tables.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Eric Dumazet <eric.dumazet@gmail.com>"); MODULE_DESCRIPTION("Xtables: CPU match"); MODULE_ALIAS("ipt_cpu"); MODULE_ALIAS("ip6t_cpu"); static int cpu_mt_check(const struct xt_mtchk_param *par) { const struct xt_cpu_info *info = par->matchinfo; if (info->invert & ~1) return -EINVAL; return 0; } static bool cpu_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_cpu_info *info = par->matchinfo; return (info->cpu == smp_processor_id()) ^ info->invert; } static struct xt_match cpu_mt_reg __read_mostly = { .name = "cpu", .revision = 0, .family = NFPROTO_UNSPEC, .checkentry = cpu_mt_check, .match = cpu_mt, .matchsize = sizeof(struct xt_cpu_info), .me = THIS_MODULE, }; static int __init cpu_mt_init(void) { return xt_register_match(&cpu_mt_reg); } static void __exit cpu_mt_exit(void) { xt_unregister_match(&cpu_mt_reg); } module_init(cpu_mt_init); module_exit(cpu_mt_exit);
// SPDX-License-Identifier: GPL-2.0-only /* Expectation handling for nf_conntrack.
*/ /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org> * (c) 2005-2012 Patrick McHardy <kaber@trash.net> */ #include <linux/types.h> #include <linux/netfilter.h> #include <linux/skbuff.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/stddef.h> #include <linux/slab.h> #include <linux/err.h> #include <linux/percpu.h> #include <linux/kernel.h> #include <linux/siphash.h> #include <linux/moduleparam.h> #include <linux/export.h> #include <net/net_namespace.h> #include <net/netns/hash.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_expect.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_tuple.h> #include <net/netfilter/nf_conntrack_zones.h> unsigned int nf_ct_expect_hsize __read_mostly; EXPORT_SYMBOL_GPL(nf_ct_expect_hsize); struct hlist_head *nf_ct_expect_hash __read_mostly; EXPORT_SYMBOL_GPL(nf_ct_expect_hash); unsigned int nf_ct_expect_max __read_mostly; static struct kmem_cache *nf_ct_expect_cachep __read_mostly; static siphash_aligned_key_t nf_ct_expect_hashrnd; /* nf_conntrack_expect helper functions */ void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp, u32 portid, int report) { struct nf_conn_help *master_help = nfct_help(exp->master); struct net *net = nf_ct_exp_net(exp); struct nf_conntrack_net *cnet; WARN_ON(!master_help); WARN_ON(timer_pending(&exp->timeout)); hlist_del_rcu(&exp->hnode); cnet = nf_ct_pernet(net); cnet->expect_count--; hlist_del_rcu(&exp->lnode); master_help->expecting[exp->class]--; nf_ct_expect_event_report(IPEXP_DESTROY, exp, portid, report); nf_ct_expect_put(exp); NF_CT_STAT_INC(net, expect_delete); } EXPORT_SYMBOL_GPL(nf_ct_unlink_expect_report); static void nf_ct_expectation_timed_out(struct timer_list *t) { struct nf_conntrack_expect *exp = from_timer(exp, t, timeout); spin_lock_bh(&nf_conntrack_expect_lock); nf_ct_unlink_expect(exp); spin_unlock_bh(&nf_conntrack_expect_lock); nf_ct_expect_put(exp); } static unsigned int nf_ct_expect_dst_hash(const struct net *n, const struct nf_conntrack_tuple *tuple) { struct { union nf_inet_addr dst_addr; u32 net_mix; u16 dport; u8 l3num; u8 protonum; } __aligned(SIPHASH_ALIGNMENT) combined; u32 hash; get_random_once(&nf_ct_expect_hashrnd, sizeof(nf_ct_expect_hashrnd)); memset(&combined, 0, sizeof(combined)); combined.dst_addr = tuple->dst.u3; combined.net_mix = net_hash_mix(n); combined.dport = (__force __u16)tuple->dst.u.all; combined.l3num = tuple->src.l3num; combined.protonum = tuple->dst.protonum; hash = siphash(&combined, sizeof(combined), &nf_ct_expect_hashrnd); return reciprocal_scale(hash, nf_ct_expect_hsize); } static bool nf_ct_exp_equal(const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_expect *i, const struct nf_conntrack_zone *zone, const struct net *net) { return nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) && net_eq(net, nf_ct_net(i->master)) && nf_ct_zone_equal_any(i->master, zone); } bool nf_ct_remove_expect(struct nf_conntrack_expect *exp) { if (del_timer(&exp->timeout)) { nf_ct_unlink_expect(exp); nf_ct_expect_put(exp); return true; } return false; } EXPORT_SYMBOL_GPL(nf_ct_remove_expect); struct nf_conntrack_expect * __nf_ct_expect_find(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple) { 
struct nf_conntrack_net *cnet = nf_ct_pernet(net); struct nf_conntrack_expect *i; unsigned int h; if (!cnet->expect_count) return NULL; h = nf_ct_expect_dst_hash(net, tuple); hlist_for_each_entry_rcu(i, &nf_ct_expect_hash[h], hnode) { if (nf_ct_exp_equal(tuple, i, zone, net)) return i; } return NULL; } EXPORT_SYMBOL_GPL(__nf_ct_expect_find); /* Just find a expectation corresponding to a tuple. */ struct nf_conntrack_expect * nf_ct_expect_find_get(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple) { struct nf_conntrack_expect *i; rcu_read_lock(); i = __nf_ct_expect_find(net, zone, tuple); if (i && !refcount_inc_not_zero(&i->use)) i = NULL; rcu_read_unlock(); return i; } EXPORT_SYMBOL_GPL(nf_ct_expect_find_get); /* If an expectation for this connection is found, it gets delete from * global list then returned. */ struct nf_conntrack_expect * nf_ct_find_expectation(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple, bool unlink) { struct nf_conntrack_net *cnet = nf_ct_pernet(net); struct nf_conntrack_expect *i, *exp = NULL; unsigned int h; if (!cnet->expect_count) return NULL; h = nf_ct_expect_dst_hash(net, tuple); hlist_for_each_entry(i, &nf_ct_expect_hash[h], hnode) { if (!(i->flags & NF_CT_EXPECT_INACTIVE) && nf_ct_exp_equal(tuple, i, zone, net)) { exp = i; break; } } if (!exp) return NULL; /* If master is not in hash table yet (ie. packet hasn't left this machine yet), how can other end know about expected? Hence these are not the droids you are looking for (if master ct never got confirmed, we'd hold a reference to it and weird things would happen to future packets). */ if (!nf_ct_is_confirmed(exp->master)) return NULL; /* Avoid race with other CPUs, that for exp->master ct, is * about to invoke ->destroy(), or nf_ct_delete() via timeout * or early_drop(). * * The refcount_inc_not_zero() check tells: If that fails, we * know that the ct is being destroyed. If it succeeds, we * can be sure the ct cannot disappear underneath. */ if (unlikely(nf_ct_is_dying(exp->master) || !refcount_inc_not_zero(&exp->master->ct_general.use))) return NULL; if (exp->flags & NF_CT_EXPECT_PERMANENT || !unlink) { refcount_inc(&exp->use); return exp; } else if (del_timer(&exp->timeout)) { nf_ct_unlink_expect(exp); return exp; } /* Undo exp->master refcnt increase, if del_timer() failed */ nf_ct_put(exp->master); return NULL; } /* delete all expectations for this conntrack */ void nf_ct_remove_expectations(struct nf_conn *ct) { struct nf_conn_help *help = nfct_help(ct); struct nf_conntrack_expect *exp; struct hlist_node *next; /* Optimization: most connection never expect any others. */ if (!help) return; spin_lock_bh(&nf_conntrack_expect_lock); hlist_for_each_entry_safe(exp, next, &help->expectations, lnode) { nf_ct_remove_expect(exp); } spin_unlock_bh(&nf_conntrack_expect_lock); } EXPORT_SYMBOL_GPL(nf_ct_remove_expectations); /* Would two expected things clash? 
*/ static inline int expect_clash(const struct nf_conntrack_expect *a, const struct nf_conntrack_expect *b) { /* Part covered by intersection of masks must be unequal, otherwise they clash */ struct nf_conntrack_tuple_mask intersect_mask; int count; intersect_mask.src.u.all = a->mask.src.u.all & b->mask.src.u.all; for (count = 0; count < NF_CT_TUPLE_L3SIZE; count++){ intersect_mask.src.u3.all[count] = a->mask.src.u3.all[count] & b->mask.src.u3.all[count]; } return nf_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask) && net_eq(nf_ct_net(a->master), nf_ct_net(b->master)) && nf_ct_zone_equal_any(a->master, nf_ct_zone(b->master)); } static inline int expect_matches(const struct nf_conntrack_expect *a, const struct nf_conntrack_expect *b) { return nf_ct_tuple_equal(&a->tuple, &b->tuple) && nf_ct_tuple_mask_equal(&a->mask, &b->mask) && net_eq(nf_ct_net(a->master), nf_ct_net(b->master)) && nf_ct_zone_equal_any(a->master, nf_ct_zone(b->master)); } static bool master_matches(const struct nf_conntrack_expect *a, const struct nf_conntrack_expect *b, unsigned int flags) { if (flags & NF_CT_EXP_F_SKIP_MASTER) return true; return a->master == b->master; } /* Generally a bad idea to call this: could have matched already. */ void nf_ct_unexpect_related(struct nf_conntrack_expect *exp) { spin_lock_bh(&nf_conntrack_expect_lock); nf_ct_remove_expect(exp); spin_unlock_bh(&nf_conntrack_expect_lock); } EXPORT_SYMBOL_GPL(nf_ct_unexpect_related); /* We don't increase the master conntrack refcount for non-fulfilled * conntracks. During the conntrack destruction, the expectations are * always killed before the conntrack itself */ struct nf_conntrack_expect *nf_ct_expect_alloc(struct nf_conn *me) { struct nf_conntrack_expect *new; new = kmem_cache_alloc(nf_ct_expect_cachep, GFP_ATOMIC); if (!new) return NULL; new->master = me; refcount_set(&new->use, 1); return new; } EXPORT_SYMBOL_GPL(nf_ct_expect_alloc); void nf_ct_expect_init(struct nf_conntrack_expect *exp, unsigned int class, u_int8_t family, const union nf_inet_addr *saddr, const union nf_inet_addr *daddr, u_int8_t proto, const __be16 *src, const __be16 *dst) { int len; if (family == AF_INET) len = 4; else len = 16; exp->flags = 0; exp->class = class; exp->expectfn = NULL; exp->helper = NULL; exp->tuple.src.l3num = family; exp->tuple.dst.protonum = proto; if (saddr) { memcpy(&exp->tuple.src.u3, saddr, len); if (sizeof(exp->tuple.src.u3) > len) /* address needs to be cleared for nf_ct_tuple_equal */ memset((void *)&exp->tuple.src.u3 + len, 0x00, sizeof(exp->tuple.src.u3) - len); memset(&exp->mask.src.u3, 0xFF, len); if (sizeof(exp->mask.src.u3) > len) memset((void *)&exp->mask.src.u3 + len, 0x00, sizeof(exp->mask.src.u3) - len); } else { memset(&exp->tuple.src.u3, 0x00, sizeof(exp->tuple.src.u3)); memset(&exp->mask.src.u3, 0x00, sizeof(exp->mask.src.u3)); } if (src) { exp->tuple.src.u.all = *src; exp->mask.src.u.all = htons(0xFFFF); } else { exp->tuple.src.u.all = 0; exp->mask.src.u.all = 0; } memcpy(&exp->tuple.dst.u3, daddr, len); if (sizeof(exp->tuple.dst.u3) > len) /* address needs to be cleared for nf_ct_tuple_equal */ memset((void *)&exp->tuple.dst.u3 + len, 0x00, sizeof(exp->tuple.dst.u3) - len); exp->tuple.dst.u.all = *dst; #if IS_ENABLED(CONFIG_NF_NAT) memset(&exp->saved_addr, 0, sizeof(exp->saved_addr)); memset(&exp->saved_proto, 0, sizeof(exp->saved_proto)); #endif } EXPORT_SYMBOL_GPL(nf_ct_expect_init); static void nf_ct_expect_free_rcu(struct rcu_head *head) { struct nf_conntrack_expect *exp; exp = container_of(head, struct 
nf_conntrack_expect, rcu); kmem_cache_free(nf_ct_expect_cachep, exp); } void nf_ct_expect_put(struct nf_conntrack_expect *exp) { if (refcount_dec_and_test(&exp->use)) call_rcu(&exp->rcu, nf_ct_expect_free_rcu); } EXPORT_SYMBOL_GPL(nf_ct_expect_put); static void nf_ct_expect_insert(struct nf_conntrack_expect *exp) { struct nf_conntrack_net *cnet; struct nf_conn_help *master_help = nfct_help(exp->master); struct nf_conntrack_helper *helper; struct net *net = nf_ct_exp_net(exp); unsigned int h = nf_ct_expect_dst_hash(net, &exp->tuple); /* two references : one for hash insert, one for the timer */ refcount_add(2, &exp->use); timer_setup(&exp->timeout, nf_ct_expectation_timed_out, 0); helper = rcu_dereference_protected(master_help->helper, lockdep_is_held(&nf_conntrack_expect_lock)); if (helper) { exp->timeout.expires = jiffies + helper->expect_policy[exp->class].timeout * HZ; } add_timer(&exp->timeout); hlist_add_head_rcu(&exp->lnode, &master_help->expectations); master_help->expecting[exp->class]++; hlist_add_head_rcu(&exp->hnode, &nf_ct_expect_hash[h]); cnet = nf_ct_pernet(net); cnet->expect_count++; NF_CT_STAT_INC(net, expect_create); } /* Race with expectations being used means we could have none to find; OK. */ static void evict_oldest_expect(struct nf_conn *master, struct nf_conntrack_expect *new) { struct nf_conn_help *master_help = nfct_help(master); struct nf_conntrack_expect *exp, *last = NULL; hlist_for_each_entry(exp, &master_help->expectations, lnode) { if (exp->class == new->class) last = exp; } if (last) nf_ct_remove_expect(last); } static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect, unsigned int flags) { const struct nf_conntrack_expect_policy *p; struct nf_conntrack_expect *i; struct nf_conntrack_net *cnet; struct nf_conn *master = expect->master; struct nf_conn_help *master_help = nfct_help(master); struct nf_conntrack_helper *helper; struct net *net = nf_ct_exp_net(expect); struct hlist_node *next; unsigned int h; int ret = 0; if (!master_help) { ret = -ESHUTDOWN; goto out; } h = nf_ct_expect_dst_hash(net, &expect->tuple); hlist_for_each_entry_safe(i, next, &nf_ct_expect_hash[h], hnode) { if (master_matches(i, expect, flags) && expect_matches(i, expect)) { if (i->class != expect->class || i->master != expect->master) return -EALREADY; if (nf_ct_remove_expect(i)) break; } else if (expect_clash(i, expect)) { ret = -EBUSY; goto out; } } /* Will be over limit? 
*/ helper = rcu_dereference_protected(master_help->helper, lockdep_is_held(&nf_conntrack_expect_lock)); if (helper) { p = &helper->expect_policy[expect->class]; if (p->max_expected && master_help->expecting[expect->class] >= p->max_expected) { evict_oldest_expect(master, expect); if (master_help->expecting[expect->class] >= p->max_expected) { ret = -EMFILE; goto out; } } } cnet = nf_ct_pernet(net); if (cnet->expect_count >= nf_ct_expect_max) { net_warn_ratelimited("nf_conntrack: expectation table full\n"); ret = -EMFILE; } out: return ret; } int nf_ct_expect_related_report(struct nf_conntrack_expect *expect, u32 portid, int report, unsigned int flags) { int ret; spin_lock_bh(&nf_conntrack_expect_lock); ret = __nf_ct_expect_check(expect, flags); if (ret < 0) goto out; nf_ct_expect_insert(expect); spin_unlock_bh(&nf_conntrack_expect_lock); nf_ct_expect_event_report(IPEXP_NEW, expect, portid, report); return 0; out: spin_unlock_bh(&nf_conntrack_expect_lock); return ret; } EXPORT_SYMBOL_GPL(nf_ct_expect_related_report); void nf_ct_expect_iterate_destroy(bool (*iter)(struct nf_conntrack_expect *e, void *data), void *data) { struct nf_conntrack_expect *exp; const struct hlist_node *next; unsigned int i; spin_lock_bh(&nf_conntrack_expect_lock); for (i = 0; i < nf_ct_expect_hsize; i++) { hlist_for_each_entry_safe(exp, next, &nf_ct_expect_hash[i], hnode) { if (iter(exp, data) && del_timer(&exp->timeout)) { nf_ct_unlink_expect(exp); nf_ct_expect_put(exp); } } } spin_unlock_bh(&nf_conntrack_expect_lock); } EXPORT_SYMBOL_GPL(nf_ct_expect_iterate_destroy); void nf_ct_expect_iterate_net(struct net *net, bool (*iter)(struct nf_conntrack_expect *e, void *data), void *data, u32 portid, int report) { struct nf_conntrack_expect *exp; const struct hlist_node *next; unsigned int i; spin_lock_bh(&nf_conntrack_expect_lock); for (i = 0; i < nf_ct_expect_hsize; i++) { hlist_for_each_entry_safe(exp, next, &nf_ct_expect_hash[i], hnode) { if (!net_eq(nf_ct_exp_net(exp), net)) continue; if (iter(exp, data) && del_timer(&exp->timeout)) { nf_ct_unlink_expect_report(exp, portid, report); nf_ct_expect_put(exp); } } } spin_unlock_bh(&nf_conntrack_expect_lock); } EXPORT_SYMBOL_GPL(nf_ct_expect_iterate_net); #ifdef CONFIG_NF_CONNTRACK_PROCFS struct ct_expect_iter_state { struct seq_net_private p; unsigned int bucket; }; static struct hlist_node *ct_expect_get_first(struct seq_file *seq) { struct ct_expect_iter_state *st = seq->private; struct hlist_node *n; for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) { n = rcu_dereference(hlist_first_rcu(&nf_ct_expect_hash[st->bucket])); if (n) return n; } return NULL; } static struct hlist_node *ct_expect_get_next(struct seq_file *seq, struct hlist_node *head) { struct ct_expect_iter_state *st = seq->private; head = rcu_dereference(hlist_next_rcu(head)); while (head == NULL) { if (++st->bucket >= nf_ct_expect_hsize) return NULL; head = rcu_dereference(hlist_first_rcu(&nf_ct_expect_hash[st->bucket])); } return head; } static struct hlist_node *ct_expect_get_idx(struct seq_file *seq, loff_t pos) { struct hlist_node *head = ct_expect_get_first(seq); if (head) while (pos && (head = ct_expect_get_next(seq, head))) pos--; return pos ? 
NULL : head; } static void *exp_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { rcu_read_lock(); return ct_expect_get_idx(seq, *pos); } static void *exp_seq_next(struct seq_file *seq, void *v, loff_t *pos) { (*pos)++; return ct_expect_get_next(seq, v); } static void exp_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { rcu_read_unlock(); } static int exp_seq_show(struct seq_file *s, void *v) { struct nf_conntrack_expect *expect; struct nf_conntrack_helper *helper; struct hlist_node *n = v; char *delim = ""; expect = hlist_entry(n, struct nf_conntrack_expect, hnode); if (expect->timeout.function) seq_printf(s, "%ld ", timer_pending(&expect->timeout) ? (long)(expect->timeout.expires - jiffies)/HZ : 0); else seq_puts(s, "- "); seq_printf(s, "l3proto = %u proto=%u ", expect->tuple.src.l3num, expect->tuple.dst.protonum); print_tuple(s, &expect->tuple, nf_ct_l4proto_find(expect->tuple.dst.protonum)); if (expect->flags & NF_CT_EXPECT_PERMANENT) { seq_puts(s, "PERMANENT"); delim = ","; } if (expect->flags & NF_CT_EXPECT_INACTIVE) { seq_printf(s, "%sINACTIVE", delim); delim = ","; } if (expect->flags & NF_CT_EXPECT_USERSPACE) seq_printf(s, "%sUSERSPACE", delim); helper = rcu_dereference(nfct_help(expect->master)->helper); if (helper) { seq_printf(s, "%s%s", expect->flags ? " " : "", helper->name); if (helper->expect_policy[expect->class].name[0]) seq_printf(s, "/%s", helper->expect_policy[expect->class].name); } seq_putc(s, '\n'); return 0; } static const struct seq_operations exp_seq_ops = { .start = exp_seq_start, .next = exp_seq_next, .stop = exp_seq_stop, .show = exp_seq_show }; #endif /* CONFIG_NF_CONNTRACK_PROCFS */ static int exp_proc_init(struct net *net) { #ifdef CONFIG_NF_CONNTRACK_PROCFS struct proc_dir_entry *proc; kuid_t root_uid; kgid_t root_gid; proc = proc_create_net("nf_conntrack_expect", 0440, net->proc_net, &exp_seq_ops, sizeof(struct ct_expect_iter_state)); if (!proc) return -ENOMEM; root_uid = make_kuid(net->user_ns, 0); root_gid = make_kgid(net->user_ns, 0); if (uid_valid(root_uid) && gid_valid(root_gid)) proc_set_user(proc, root_uid, root_gid); #endif /* CONFIG_NF_CONNTRACK_PROCFS */ return 0; } static void exp_proc_remove(struct net *net) { #ifdef CONFIG_NF_CONNTRACK_PROCFS remove_proc_entry("nf_conntrack_expect", net->proc_net); #endif /* CONFIG_NF_CONNTRACK_PROCFS */ } module_param_named(expect_hashsize, nf_ct_expect_hsize, uint, 0400); int nf_conntrack_expect_pernet_init(struct net *net) { return exp_proc_init(net); } void nf_conntrack_expect_pernet_fini(struct net *net) { exp_proc_remove(net); } int nf_conntrack_expect_init(void) { if (!nf_ct_expect_hsize) { nf_ct_expect_hsize = nf_conntrack_htable_size / 256; if (!nf_ct_expect_hsize) nf_ct_expect_hsize = 1; } nf_ct_expect_max = nf_ct_expect_hsize * 4; nf_ct_expect_cachep = KMEM_CACHE(nf_conntrack_expect, 0); if (!nf_ct_expect_cachep) return -ENOMEM; nf_ct_expect_hash = nf_ct_alloc_hashtable(&nf_ct_expect_hsize, 0); if (!nf_ct_expect_hash) { kmem_cache_destroy(nf_ct_expect_cachep); return -ENOMEM; } return 0; } void nf_conntrack_expect_fini(void) { rcu_barrier(); /* Wait for call_rcu() before destroy */ kmem_cache_destroy(nf_ct_expect_cachep); kvfree(nf_ct_expect_hash); }
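The expectation API above is normally driven from a conntrack helper: allocate with nf_ct_expect_alloc(), describe the expected flow with nf_ct_expect_init(), publish it with nf_ct_expect_related_report(), then drop the local reference with nf_ct_expect_put(). Below is a hedged sketch along those lines; example_expect_data_conn() and its parameters are invented, and the default expectation class is assumed.

#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_expect.h>

static int example_expect_data_conn(struct nf_conn *ct,
				    const union nf_inet_addr *daddr,
				    __be16 port)
{
	struct nf_conntrack_expect *exp;
	int ret;

	exp = nf_ct_expect_alloc(ct);
	if (!exp)
		return -ENOMEM;

	/* Any source port from the original direction's source address,
	 * towards @daddr:@port, TCP, same address family as @ct. */
	nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct),
			  &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3,
			  daddr, IPPROTO_TCP, NULL, &port);

	/* Hashes the expectation and starts its timeout. */
	ret = nf_ct_expect_related_report(exp, 0, 0, 0);

	/* Drop the allocation reference; the table holds its own. */
	nf_ct_expect_put(exp);
	return ret;
}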
// SPDX-License-Identifier: GPL-2.0 /* * SHA1 routine optimized to do word accesses rather than byte accesses, * and to avoid unnecessary copies into the context array. * * This was based on the git SHA1 implementation. */ #include <linux/kernel.h> #include <linux/export.h> #include <linux/module.h> #include <linux/bitops.h> #include <linux/string.h> #include <crypto/sha1.h> #include <linux/unaligned.h> /* * If you have 32 registers or more, the compiler can (and should) * try to change the array[] accesses into registers. However, on * machines with less than ~25 registers, that won't really work, * and at least gcc will make an unholy mess of it. * * So to avoid that mess which just slows things down, we force * the stores to memory to actually happen (we might be better off * with a 'W(t)=(val);asm("":"+m" (W(t))' there instead, as * suggested by Artur Skawina - that will also make gcc unable to * try to do the silly "optimize away loads" part because it won't * see what the value will be). * * Ben Herrenschmidt reports that on PPC, the C version comes close * to the optimized asm with this (ie on PPC you don't want that * 'volatile', since there are lots of registers). * * On ARM we get the best code generation by forcing a full memory barrier * between each SHA_ROUND, otherwise gcc happily gets wild with spilling and * the stack frame size simply explodes and performance goes down the drain. */ #ifdef CONFIG_X86 #define setW(x, val) (*(volatile __u32 *)&W(x) = (val)) #elif defined(CONFIG_ARM) #define setW(x, val) do { W(x) = (val); __asm__("":::"memory"); } while (0) #else #define setW(x, val) (W(x) = (val)) #endif /* This "rolls" over the 512-bit array */ #define W(x) (array[(x)&15]) /* * Where do we get the source from? The first 16 iterations get it from * the input data, the next mix it from the 512-bit array. */ #define SHA_SRC(t) get_unaligned_be32((__u32 *)data + t) #define SHA_MIX(t) rol32(W(t+13) ^ W(t+8) ^ W(t+2) ^ W(t), 1) #define SHA_ROUND(t, input, fn, constant, A, B, C, D, E) do { \ __u32 TEMP = input(t); setW(t, TEMP); \ E += TEMP + rol32(A,5) + (fn) + (constant); \ B = ror32(B, 2); \ TEMP = E; E = D; D = C; C = B; B = A; A = TEMP; } while (0) #define T_0_15(t, A, B, C, D, E) SHA_ROUND(t, SHA_SRC, (((C^D)&B)^D) , 0x5a827999, A, B, C, D, E ) #define T_16_19(t, A, B, C, D, E) SHA_ROUND(t, SHA_MIX, (((C^D)&B)^D) , 0x5a827999, A, B, C, D, E ) #define T_20_39(t, A, B, C, D, E) SHA_ROUND(t, SHA_MIX, (B^C^D) , 0x6ed9eba1, A, B, C, D, E ) #define T_40_59(t, A, B, C, D, E) SHA_ROUND(t, SHA_MIX, ((B&C)+(D&(B^C))) , 0x8f1bbcdc, A, B, C, D, E ) #define T_60_79(t, A, B, C, D, E) SHA_ROUND(t, SHA_MIX, (B^C^D) , 0xca62c1d6, A, B, C, D, E ) /** * sha1_transform - single block SHA1 transform (deprecated) * * @digest: 160 bit digest to update * @data: 512 bits of data to hash * @array: 16 words of workspace (see note) * * This function executes SHA-1's internal compression function. It updates the * 160-bit internal state (@digest) with a single 512-bit data block (@data). * * Don't use this function.
SHA-1 is no longer considered secure. And even if * you do have to use SHA-1, this isn't the correct way to hash something with * SHA-1 as this doesn't handle padding and finalization. * * Note: If the hash is security sensitive, the caller should be sure * to clear the workspace. This is left to the caller to avoid * unnecessary clears between chained hashing operations. */ void sha1_transform(__u32 *digest, const char *data, __u32 *array) { __u32 A, B, C, D, E; unsigned int i = 0; A = digest[0]; B = digest[1]; C = digest[2]; D = digest[3]; E = digest[4]; /* Round 1 - iterations 0-16 take their input from 'data' */ for (; i < 16; ++i) T_0_15(i, A, B, C, D, E); /* Round 1 - tail. Input from 512-bit mixing array */ for (; i < 20; ++i) T_16_19(i, A, B, C, D, E); /* Round 2 */ for (; i < 40; ++i) T_20_39(i, A, B, C, D, E); /* Round 3 */ for (; i < 60; ++i) T_40_59(i, A, B, C, D, E); /* Round 4 */ for (; i < 80; ++i) T_60_79(i, A, B, C, D, E); digest[0] += A; digest[1] += B; digest[2] += C; digest[3] += D; digest[4] += E; } EXPORT_SYMBOL(sha1_transform); /** * sha1_init - initialize the vectors for a SHA1 digest * @buf: vector to initialize */ void sha1_init(__u32 *buf) { buf[0] = 0x67452301; buf[1] = 0xefcdab89; buf[2] = 0x98badcfe; buf[3] = 0x10325476; buf[4] = 0xc3d2e1f0; } EXPORT_SYMBOL(sha1_init); MODULE_DESCRIPTION("SHA-1 Algorithm"); MODULE_LICENSE("GPL");
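As the comment above stresses, sha1_transform() is only the compression function; padding and length encoding are the caller's job. The following is a minimal calling-convention sketch under that assumption: it feeds the pre-padded block of the empty message, so the invented example_sha1_empty() should produce the well-known digest of "" (da39a3ee...).

#include <linux/string.h>
#include <linux/types.h>
#include <crypto/sha1.h>

static void example_sha1_empty(__u32 *digest /* SHA1_DIGEST_SIZE / 4 words */)
{
	/* Padding of the empty message: 0x80, zeroes, 64-bit length 0. */
	static const u8 block[SHA1_BLOCK_SIZE] = { 0x80 };
	__u32 workspace[SHA1_WORKSPACE_WORDS];

	sha1_init(digest);
	sha1_transform(digest, (const char *)block, workspace);

	/* The workspace may hold message-derived data; clear it. */
	memzero_explicit(workspace, sizeof(workspace));
}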
// SPDX-License-Identifier: GPL-2.0-only
/*
 *	STP SAP demux
 *
 *	Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
 */
#include <linux/mutex.h>
#include <linux/skbuff.h>
#include <linux/etherdevice.h>
#include <linux/llc.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <net/llc.h>
#include <net/llc_pdu.h>
#include <net/stp.h>

/* 01:80:c2:00:00:20 - 01:80:c2:00:00:2F */
#define GARP_ADDR_MIN	0x20
#define GARP_ADDR_MAX	0x2F
#define GARP_ADDR_RANGE	(GARP_ADDR_MAX - GARP_ADDR_MIN)

static const struct stp_proto __rcu *garp_protos[GARP_ADDR_RANGE + 1] __read_mostly;
static const struct stp_proto __rcu *stp_proto __read_mostly;

static struct llc_sap *sap __read_mostly;
static unsigned int sap_registered;
static DEFINE_MUTEX(stp_proto_mutex);

/* Called under rcu_read_lock from LLC */
static int stp_pdu_rcv(struct sk_buff *skb, struct net_device *dev,
		       struct packet_type *pt, struct net_device *orig_dev)
{
	const struct ethhdr *eh = eth_hdr(skb);
	const struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb);
	const struct stp_proto *proto;

	if (pdu->ssap != LLC_SAP_BSPAN ||
	    pdu->dsap != LLC_SAP_BSPAN ||
	    pdu->ctrl_1 != LLC_PDU_TYPE_U)
		goto err;

	if (eh->h_dest[5] >= GARP_ADDR_MIN && eh->h_dest[5] <= GARP_ADDR_MAX) {
		proto = rcu_dereference(garp_protos[eh->h_dest[5] -
						    GARP_ADDR_MIN]);
		if (proto &&
		    !ether_addr_equal(eh->h_dest, proto->group_address))
			goto err;
	} else
		proto = rcu_dereference(stp_proto);

	if (!proto)
		goto err;

	proto->rcv(proto, skb, dev);
	return 0;

err:
	kfree_skb(skb);
	return 0;
}

int stp_proto_register(const struct stp_proto *proto)
{
	int err = 0;

	mutex_lock(&stp_proto_mutex);
	if (sap_registered++ == 0) {
		sap = llc_sap_open(LLC_SAP_BSPAN, stp_pdu_rcv);
		if (!sap) {
			err = -ENOMEM;
			goto out;
		}
	}
	if (is_zero_ether_addr(proto->group_address))
		rcu_assign_pointer(stp_proto, proto);
	else
		rcu_assign_pointer(garp_protos[proto->group_address[5] -
					       GARP_ADDR_MIN], proto);
out:
	mutex_unlock(&stp_proto_mutex);
	return err;
}
EXPORT_SYMBOL_GPL(stp_proto_register);

void stp_proto_unregister(const struct stp_proto *proto)
{
	mutex_lock(&stp_proto_mutex);
	if (is_zero_ether_addr(proto->group_address))
		RCU_INIT_POINTER(stp_proto, NULL);
	else
		RCU_INIT_POINTER(garp_protos[proto->group_address[5] -
					     GARP_ADDR_MIN], NULL);
	synchronize_rcu();

	if (--sap_registered == 0)
		llc_sap_put(sap);
	mutex_unlock(&stp_proto_mutex);
}
EXPORT_SYMBOL_GPL(stp_proto_unregister);

MODULE_DESCRIPTION("SAP demux for IEEE 802.1D Spanning Tree Protocol (STP)");
MODULE_LICENSE("GPL");
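A sketch of how a consumer would register with this demux, under the assumption that an all-zero group_address selects the generic STP slot (as the register function above implies) and that a GARP application would instead fill in its own group MAC. The module and function names here are made up for illustration.

#include <net/stp.h>
#include <linux/skbuff.h>
#include <linux/module.h>

static void my_stp_rcv(const struct stp_proto *proto, struct sk_buff *skb,
		       struct net_device *dev)
{
	/* consume the BPDU; the demux already validated the LLC header */
	kfree_skb(skb);
}

static const struct stp_proto my_stp_proto = {
	.rcv	= my_stp_rcv,
	/* .group_address left all-zero => generic STP, not a GARP range */
};

static int __init my_stp_init(void)
{
	return stp_proto_register(&my_stp_proto);
}

static void __exit my_stp_exit(void)
{
	stp_proto_unregister(&my_stp_proto);
}

module_init(my_stp_init);
module_exit(my_stp_exit);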
4 4 4 3 3 1 1 4 4 4 3 1 1 6 15 5 5 4 1 4 4 1 2 3 3 7 7 1 3 3 5 1 4 1 1 1 1 2 3 1 8 8 2 6 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 /* * Copyright (C) 2017 Netronome Systems, Inc. * * This software is licensed under the GNU General License Version 2, * June 1991 as shown in the file COPYING in the top-level directory of this * source tree. * * THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" * WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE * OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME * THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. */ #include <linux/bpf.h> #include <linux/bpf_verifier.h> #include <linux/debugfs.h> #include <linux/kernel.h> #include <linux/mutex.h> #include <linux/rtnetlink.h> #include <net/pkt_cls.h> #include "netdevsim.h" #define pr_vlog(env, fmt, ...) 
\ bpf_verifier_log_write(env, "[netdevsim] " fmt, ##__VA_ARGS__) struct nsim_bpf_bound_prog { struct nsim_dev *nsim_dev; struct bpf_prog *prog; struct dentry *ddir; const char *state; bool is_loaded; struct list_head l; }; #define NSIM_BPF_MAX_KEYS 2 struct nsim_bpf_bound_map { struct netdevsim *ns; struct bpf_offloaded_map *map; struct mutex mutex; struct nsim_map_entry { void *key; void *value; } entry[NSIM_BPF_MAX_KEYS]; struct list_head l; }; static int nsim_bpf_string_show(struct seq_file *file, void *data) { const char **str = file->private; if (*str) seq_printf(file, "%s\n", *str); return 0; } DEFINE_SHOW_ATTRIBUTE(nsim_bpf_string); static int nsim_bpf_verify_insn(struct bpf_verifier_env *env, int insn_idx, int prev_insn) { struct nsim_bpf_bound_prog *state; int ret = 0; state = env->prog->aux->offload->dev_priv; if (state->nsim_dev->bpf_bind_verifier_delay && !insn_idx) msleep(state->nsim_dev->bpf_bind_verifier_delay); if (insn_idx == env->prog->len - 1) { pr_vlog(env, "Hello from netdevsim!\n"); if (!state->nsim_dev->bpf_bind_verifier_accept) ret = -EOPNOTSUPP; } return ret; } static int nsim_bpf_finalize(struct bpf_verifier_env *env) { return 0; } static bool nsim_xdp_offload_active(struct netdevsim *ns) { return ns->xdp_hw.prog; } static void nsim_prog_set_loaded(struct bpf_prog *prog, bool loaded) { struct nsim_bpf_bound_prog *state; if (!prog || !bpf_prog_is_offloaded(prog->aux)) return; state = prog->aux->offload->dev_priv; state->is_loaded = loaded; } static int nsim_bpf_offload(struct netdevsim *ns, struct bpf_prog *prog, bool oldprog) { nsim_prog_set_loaded(ns->bpf_offloaded, false); WARN(!!ns->bpf_offloaded != oldprog, "bad offload state, expected offload %sto be active", oldprog ? "" : "not "); ns->bpf_offloaded = prog; ns->bpf_offloaded_id = prog ? 
prog->aux->id : 0; nsim_prog_set_loaded(prog, true); return 0; } int nsim_bpf_setup_tc_block_cb(enum tc_setup_type type, void *type_data, void *cb_priv) { struct tc_cls_bpf_offload *cls_bpf = type_data; struct bpf_prog *prog = cls_bpf->prog; struct netdevsim *ns = cb_priv; struct bpf_prog *oldprog; if (type != TC_SETUP_CLSBPF) { NSIM_EA(cls_bpf->common.extack, "only offload of BPF classifiers supported"); return -EOPNOTSUPP; } if (!tc_cls_can_offload_and_chain0(ns->netdev, &cls_bpf->common)) return -EOPNOTSUPP; if (cls_bpf->common.protocol != htons(ETH_P_ALL)) { NSIM_EA(cls_bpf->common.extack, "only ETH_P_ALL supported as filter protocol"); return -EOPNOTSUPP; } if (!ns->bpf_tc_accept) { NSIM_EA(cls_bpf->common.extack, "netdevsim configured to reject BPF TC offload"); return -EOPNOTSUPP; } /* Note: progs without skip_sw will probably not be dev bound */ if (prog && !prog->aux->offload && !ns->bpf_tc_non_bound_accept) { NSIM_EA(cls_bpf->common.extack, "netdevsim configured to reject unbound programs"); return -EOPNOTSUPP; } if (cls_bpf->command != TC_CLSBPF_OFFLOAD) return -EOPNOTSUPP; oldprog = cls_bpf->oldprog; /* Don't remove if oldprog doesn't match driver's state */ if (ns->bpf_offloaded != oldprog) { oldprog = NULL; if (!cls_bpf->prog) return 0; if (ns->bpf_offloaded) { NSIM_EA(cls_bpf->common.extack, "driver and netdev offload states mismatch"); return -EBUSY; } } return nsim_bpf_offload(ns, cls_bpf->prog, oldprog); } int nsim_bpf_disable_tc(struct netdevsim *ns) { if (ns->bpf_offloaded && !nsim_xdp_offload_active(ns)) return -EBUSY; return 0; } static int nsim_xdp_offload_prog(struct netdevsim *ns, struct netdev_bpf *bpf) { if (!nsim_xdp_offload_active(ns) && !bpf->prog) return 0; if (!nsim_xdp_offload_active(ns) && bpf->prog && ns->bpf_offloaded) { NSIM_EA(bpf->extack, "TC program is already loaded"); return -EBUSY; } return nsim_bpf_offload(ns, bpf->prog, nsim_xdp_offload_active(ns)); } static int nsim_xdp_set_prog(struct netdevsim *ns, struct netdev_bpf *bpf, struct xdp_attachment_info *xdp) { int err; if (bpf->command == XDP_SETUP_PROG && !ns->bpf_xdpdrv_accept) { NSIM_EA(bpf->extack, "driver XDP disabled in DebugFS"); return -EOPNOTSUPP; } if (bpf->command == XDP_SETUP_PROG_HW && !ns->bpf_xdpoffload_accept) { NSIM_EA(bpf->extack, "XDP offload disabled in DebugFS"); return -EOPNOTSUPP; } if (bpf->command == XDP_SETUP_PROG_HW) { err = nsim_xdp_offload_prog(ns, bpf); if (err) return err; } xdp_attachment_setup(xdp, bpf); return 0; } static int nsim_bpf_create_prog(struct nsim_dev *nsim_dev, struct bpf_prog *prog) { struct nsim_bpf_bound_prog *state; char name[16]; int ret; state = kzalloc(sizeof(*state), GFP_KERNEL); if (!state) return -ENOMEM; state->nsim_dev = nsim_dev; state->prog = prog; state->state = "verify"; /* Program id is not populated yet when we create the state. 
*/ sprintf(name, "%u", nsim_dev->prog_id_gen++); state->ddir = debugfs_create_dir(name, nsim_dev->ddir_bpf_bound_progs); if (IS_ERR(state->ddir)) { ret = PTR_ERR(state->ddir); kfree(state); return ret; } debugfs_create_u32("id", 0400, state->ddir, &prog->aux->id); debugfs_create_file("state", 0400, state->ddir, &state->state, &nsim_bpf_string_fops); debugfs_create_bool("loaded", 0400, state->ddir, &state->is_loaded); list_add_tail(&state->l, &nsim_dev->bpf_bound_progs); prog->aux->offload->dev_priv = state; return 0; } static int nsim_bpf_verifier_prep(struct bpf_prog *prog) { struct nsim_dev *nsim_dev = bpf_offload_dev_priv(prog->aux->offload->offdev); if (!nsim_dev->bpf_bind_accept) return -EOPNOTSUPP; return nsim_bpf_create_prog(nsim_dev, prog); } static int nsim_bpf_translate(struct bpf_prog *prog) { struct nsim_bpf_bound_prog *state = prog->aux->offload->dev_priv; state->state = "xlated"; return 0; } static void nsim_bpf_destroy_prog(struct bpf_prog *prog) { struct nsim_bpf_bound_prog *state; state = prog->aux->offload->dev_priv; WARN(state->is_loaded, "offload state destroyed while program still bound"); debugfs_remove_recursive(state->ddir); list_del(&state->l); kfree(state); } static const struct bpf_prog_offload_ops nsim_bpf_dev_ops = { .insn_hook = nsim_bpf_verify_insn, .finalize = nsim_bpf_finalize, .prepare = nsim_bpf_verifier_prep, .translate = nsim_bpf_translate, .destroy = nsim_bpf_destroy_prog, }; static int nsim_setup_prog_checks(struct netdevsim *ns, struct netdev_bpf *bpf) { if (bpf->prog && bpf->prog->aux->offload) { NSIM_EA(bpf->extack, "attempt to load offloaded prog to drv"); return -EINVAL; } if (ns->netdev->mtu > NSIM_XDP_MAX_MTU) { NSIM_EA(bpf->extack, "MTU too large w/ XDP enabled"); return -EINVAL; } return 0; } static int nsim_setup_prog_hw_checks(struct netdevsim *ns, struct netdev_bpf *bpf) { struct nsim_bpf_bound_prog *state; if (!bpf->prog) return 0; if (!bpf_prog_is_offloaded(bpf->prog->aux)) { NSIM_EA(bpf->extack, "xdpoffload of non-bound program"); return -EINVAL; } state = bpf->prog->aux->offload->dev_priv; if (WARN_ON(strcmp(state->state, "xlated"))) { NSIM_EA(bpf->extack, "offloading program in bad state"); return -EINVAL; } return 0; } static bool nsim_map_key_match(struct bpf_map *map, struct nsim_map_entry *e, void *key) { return e->key && !memcmp(key, e->key, map->key_size); } static int nsim_map_key_find(struct bpf_offloaded_map *offmap, void *key) { struct nsim_bpf_bound_map *nmap = offmap->dev_priv; unsigned int i; for (i = 0; i < ARRAY_SIZE(nmap->entry); i++) if (nsim_map_key_match(&offmap->map, &nmap->entry[i], key)) return i; return -ENOENT; } static int nsim_map_alloc_elem(struct bpf_offloaded_map *offmap, unsigned int idx) { struct nsim_bpf_bound_map *nmap = offmap->dev_priv; nmap->entry[idx].key = kmalloc(offmap->map.key_size, GFP_KERNEL_ACCOUNT | __GFP_NOWARN); if (!nmap->entry[idx].key) return -ENOMEM; nmap->entry[idx].value = kmalloc(offmap->map.value_size, GFP_KERNEL_ACCOUNT | __GFP_NOWARN); if (!nmap->entry[idx].value) { kfree(nmap->entry[idx].key); nmap->entry[idx].key = NULL; return -ENOMEM; } return 0; } static int nsim_map_get_next_key(struct bpf_offloaded_map *offmap, void *key, void *next_key) { struct nsim_bpf_bound_map *nmap = offmap->dev_priv; int idx = -ENOENT; mutex_lock(&nmap->mutex); if (key) idx = nsim_map_key_find(offmap, key); if (idx == -ENOENT) idx = 0; else idx++; for (; idx < ARRAY_SIZE(nmap->entry); idx++) { if (nmap->entry[idx].key) { memcpy(next_key, nmap->entry[idx].key, offmap->map.key_size); break; } } 
mutex_unlock(&nmap->mutex); if (idx == ARRAY_SIZE(nmap->entry)) return -ENOENT; return 0; } static int nsim_map_lookup_elem(struct bpf_offloaded_map *offmap, void *key, void *value) { struct nsim_bpf_bound_map *nmap = offmap->dev_priv; int idx; mutex_lock(&nmap->mutex); idx = nsim_map_key_find(offmap, key); if (idx >= 0) memcpy(value, nmap->entry[idx].value, offmap->map.value_size); mutex_unlock(&nmap->mutex); return idx < 0 ? idx : 0; } static int nsim_map_update_elem(struct bpf_offloaded_map *offmap, void *key, void *value, u64 flags) { struct nsim_bpf_bound_map *nmap = offmap->dev_priv; int idx, err = 0; mutex_lock(&nmap->mutex); idx = nsim_map_key_find(offmap, key); if (idx < 0 && flags == BPF_EXIST) { err = idx; goto exit_unlock; } if (idx >= 0 && flags == BPF_NOEXIST) { err = -EEXIST; goto exit_unlock; } if (idx < 0) { for (idx = 0; idx < ARRAY_SIZE(nmap->entry); idx++) if (!nmap->entry[idx].key) break; if (idx == ARRAY_SIZE(nmap->entry)) { err = -E2BIG; goto exit_unlock; } err = nsim_map_alloc_elem(offmap, idx); if (err) goto exit_unlock; } memcpy(nmap->entry[idx].key, key, offmap->map.key_size); memcpy(nmap->entry[idx].value, value, offmap->map.value_size); exit_unlock: mutex_unlock(&nmap->mutex); return err; } static int nsim_map_delete_elem(struct bpf_offloaded_map *offmap, void *key) { struct nsim_bpf_bound_map *nmap = offmap->dev_priv; int idx; if (offmap->map.map_type == BPF_MAP_TYPE_ARRAY) return -EINVAL; mutex_lock(&nmap->mutex); idx = nsim_map_key_find(offmap, key); if (idx >= 0) { kfree(nmap->entry[idx].key); kfree(nmap->entry[idx].value); memset(&nmap->entry[idx], 0, sizeof(nmap->entry[idx])); } mutex_unlock(&nmap->mutex); return idx < 0 ? idx : 0; } static const struct bpf_map_dev_ops nsim_bpf_map_ops = { .map_get_next_key = nsim_map_get_next_key, .map_lookup_elem = nsim_map_lookup_elem, .map_update_elem = nsim_map_update_elem, .map_delete_elem = nsim_map_delete_elem, }; static int nsim_bpf_map_alloc(struct netdevsim *ns, struct bpf_offloaded_map *offmap) { struct nsim_bpf_bound_map *nmap; int i, err; if (WARN_ON(offmap->map.map_type != BPF_MAP_TYPE_ARRAY && offmap->map.map_type != BPF_MAP_TYPE_HASH)) return -EINVAL; if (offmap->map.max_entries > NSIM_BPF_MAX_KEYS) return -ENOMEM; if (offmap->map.map_flags) return -EINVAL; nmap = kzalloc(sizeof(*nmap), GFP_KERNEL_ACCOUNT); if (!nmap) return -ENOMEM; offmap->dev_priv = nmap; nmap->ns = ns; nmap->map = offmap; mutex_init(&nmap->mutex); if (offmap->map.map_type == BPF_MAP_TYPE_ARRAY) { for (i = 0; i < ARRAY_SIZE(nmap->entry); i++) { u32 *key; err = nsim_map_alloc_elem(offmap, i); if (err) goto err_free; key = nmap->entry[i].key; *key = i; memset(nmap->entry[i].value, 0, offmap->map.value_size); } } offmap->dev_ops = &nsim_bpf_map_ops; list_add_tail(&nmap->l, &ns->nsim_dev->bpf_bound_maps); return 0; err_free: while (--i >= 0) { kfree(nmap->entry[i].key); kfree(nmap->entry[i].value); } kfree(nmap); return err; } static void nsim_bpf_map_free(struct bpf_offloaded_map *offmap) { struct nsim_bpf_bound_map *nmap = offmap->dev_priv; unsigned int i; for (i = 0; i < ARRAY_SIZE(nmap->entry); i++) { kfree(nmap->entry[i].key); kfree(nmap->entry[i].value); } list_del_init(&nmap->l); mutex_destroy(&nmap->mutex); kfree(nmap); } int nsim_bpf(struct net_device *dev, struct netdev_bpf *bpf) { struct netdevsim *ns = netdev_priv(dev); int err; ASSERT_RTNL(); switch (bpf->command) { case XDP_SETUP_PROG: err = nsim_setup_prog_checks(ns, bpf); if (err) return err; return nsim_xdp_set_prog(ns, bpf, &ns->xdp); case XDP_SETUP_PROG_HW: err = 
nsim_setup_prog_hw_checks(ns, bpf); if (err) return err; return nsim_xdp_set_prog(ns, bpf, &ns->xdp_hw); case BPF_OFFLOAD_MAP_ALLOC: if (!ns->bpf_map_accept) return -EOPNOTSUPP; return nsim_bpf_map_alloc(ns, bpf->offmap); case BPF_OFFLOAD_MAP_FREE: nsim_bpf_map_free(bpf->offmap); return 0; default: return -EINVAL; } } int nsim_bpf_dev_init(struct nsim_dev *nsim_dev) { int err; INIT_LIST_HEAD(&nsim_dev->bpf_bound_progs); INIT_LIST_HEAD(&nsim_dev->bpf_bound_maps); nsim_dev->ddir_bpf_bound_progs = debugfs_create_dir("bpf_bound_progs", nsim_dev->ddir); if (IS_ERR(nsim_dev->ddir_bpf_bound_progs)) return PTR_ERR(nsim_dev->ddir_bpf_bound_progs); nsim_dev->bpf_dev = bpf_offload_dev_create(&nsim_bpf_dev_ops, nsim_dev); err = PTR_ERR_OR_ZERO(nsim_dev->bpf_dev); if (err) return err; nsim_dev->bpf_bind_accept = true; debugfs_create_bool("bpf_bind_accept", 0600, nsim_dev->ddir, &nsim_dev->bpf_bind_accept); debugfs_create_u32("bpf_bind_verifier_delay", 0600, nsim_dev->ddir, &nsim_dev->bpf_bind_verifier_delay); nsim_dev->bpf_bind_verifier_accept = true; debugfs_create_bool("bpf_bind_verifier_accept", 0600, nsim_dev->ddir, &nsim_dev->bpf_bind_verifier_accept); return 0; } void nsim_bpf_dev_exit(struct nsim_dev *nsim_dev) { WARN_ON(!list_empty(&nsim_dev->bpf_bound_progs)); WARN_ON(!list_empty(&nsim_dev->bpf_bound_maps)); bpf_offload_dev_destroy(nsim_dev->bpf_dev); } int nsim_bpf_init(struct netdevsim *ns) { struct dentry *ddir = ns->nsim_dev_port->ddir; int err; err = bpf_offload_dev_netdev_register(ns->nsim_dev->bpf_dev, ns->netdev); if (err) return err; debugfs_create_u32("bpf_offloaded_id", 0400, ddir, &ns->bpf_offloaded_id); ns->bpf_tc_accept = true; debugfs_create_bool("bpf_tc_accept", 0600, ddir, &ns->bpf_tc_accept); debugfs_create_bool("bpf_tc_non_bound_accept", 0600, ddir, &ns->bpf_tc_non_bound_accept); ns->bpf_xdpdrv_accept = true; debugfs_create_bool("bpf_xdpdrv_accept", 0600, ddir, &ns->bpf_xdpdrv_accept); ns->bpf_xdpoffload_accept = true; debugfs_create_bool("bpf_xdpoffload_accept", 0600, ddir, &ns->bpf_xdpoffload_accept); ns->bpf_map_accept = true; debugfs_create_bool("bpf_map_accept", 0600, ddir, &ns->bpf_map_accept); return 0; } void nsim_bpf_uninit(struct netdevsim *ns) { WARN_ON(ns->xdp.prog); WARN_ON(ns->xdp_hw.prog); WARN_ON(ns->bpf_offloaded); bpf_offload_dev_netdev_unregister(ns->nsim_dev->bpf_dev, ns->netdev); }
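For context, a brief sketch of how nsim_bpf() is reached: the driver exposes it as the netdev's .ndo_bpf callback, so the XDP_SETUP_* and BPF_OFFLOAD_MAP_* commands handled above arrive from the core networking stack. The ops table name below is illustrative, not the driver's actual symbol.

#include <linux/netdevice.h>
#include "netdevsim.h"

static const struct net_device_ops example_nsim_netdev_ops = {
	/* ... other ndo_* callbacks elided ... */
	.ndo_bpf	= nsim_bpf,	/* entry point for the code above */
};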
1 85 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __IPC_NAMESPACE_H__ #define __IPC_NAMESPACE_H__ #include <linux/err.h> #include <linux/idr.h> #include <linux/rwsem.h> #include <linux/notifier.h> #include <linux/nsproxy.h> #include <linux/ns_common.h> #include <linux/refcount.h> #include <linux/rhashtable-types.h> #include <linux/sysctl.h> #include <linux/percpu_counter.h> struct user_namespace; struct ipc_ids { int in_use; unsigned short seq; struct rw_semaphore rwsem; struct idr ipcs_idr; int max_idx; int last_idx; /* For wrap around detection */ #ifdef CONFIG_CHECKPOINT_RESTORE int next_id; #endif struct rhashtable key_ht; }; struct ipc_namespace { struct ipc_ids ids[3]; int sem_ctls[4]; int used_sems; unsigned int msg_ctlmax; unsigned int msg_ctlmnb; unsigned int msg_ctlmni; struct percpu_counter percpu_msg_bytes; struct percpu_counter percpu_msg_hdrs; size_t shm_ctlmax; size_t shm_ctlall; unsigned long shm_tot; int shm_ctlmni; /* * Defines whether IPC_RMID is forced for _all_ shm segments regardless * of shmctl() */ int shm_rmid_forced; struct notifier_block ipcns_nb; /* The kern_mount of the mqueuefs sb. We take a ref on it */ struct vfsmount *mq_mnt; /* # queues in this ns, protected by mq_lock */ unsigned int mq_queues_count; /* next fields are set through sysctl */ unsigned int mq_queues_max; /* initialized to DFLT_QUEUESMAX */ unsigned int mq_msg_max; /* initialized to DFLT_MSGMAX */ unsigned int mq_msgsize_max; /* initialized to DFLT_MSGSIZEMAX */ unsigned int mq_msg_default; unsigned int mq_msgsize_default; struct ctl_table_set mq_set; struct ctl_table_header *mq_sysctls; struct ctl_table_set ipc_set; struct ctl_table_header *ipc_sysctls; /* user_ns which owns the ipc ns */ struct user_namespace *user_ns; struct ucounts *ucounts; struct llist_node mnt_llist; struct ns_common ns; } __randomize_layout; extern struct ipc_namespace init_ipc_ns; extern spinlock_t mq_lock; #ifdef CONFIG_SYSVIPC extern void shm_destroy_orphaned(struct ipc_namespace *ns); #else /* CONFIG_SYSVIPC */ static inline void shm_destroy_orphaned(struct ipc_namespace *ns) {} #endif /* CONFIG_SYSVIPC */ #ifdef CONFIG_POSIX_MQUEUE extern int mq_init_ns(struct ipc_namespace *ns); /* * POSIX Message Queue default values: * * MIN_*: Lowest value an admin can set the maximum unprivileged limit to * DFLT_*MAX: Default values for the maximum unprivileged limits * DFLT_{MSG,MSGSIZE}: Default values used when the user doesn't supply * an attribute to the open call and the queue must be created * HARD_*: Highest value the maximums can be set to. These are enforced * on CAP_SYS_RESOURCE apps as well making them inviolate (so make them * suitably high) * * POSIX Requirements: * Per app minimum openable message queues - 8. 
This does not map well * to the fact that we limit the number of queues on a per namespace * basis instead of a per app basis. So, make the default high enough * that no given app should have a hard time opening 8 queues. * Minimum maximum for HARD_MSGMAX - 32767. I bumped this to 65536. * Minimum maximum for HARD_MSGSIZEMAX - POSIX is silent on this. However, * we have run into a situation where running applications in the wild * require this to be at least 5MB, and preferably 10MB, so I set the * value to 16MB in hopes that this user is the worst of the bunch and * the new maximum will handle anyone else. I may have to revisit this * in the future. */ #define DFLT_QUEUESMAX 256 #define MIN_MSGMAX 1 #define DFLT_MSG 10U #define DFLT_MSGMAX 10 #define HARD_MSGMAX 65536 #define MIN_MSGSIZEMAX 128 #define DFLT_MSGSIZE 8192U #define DFLT_MSGSIZEMAX 8192 #define HARD_MSGSIZEMAX (16*1024*1024) #else static inline int mq_init_ns(struct ipc_namespace *ns) { return 0; } #endif #if defined(CONFIG_IPC_NS) extern struct ipc_namespace *copy_ipcs(unsigned long flags, struct user_namespace *user_ns, struct ipc_namespace *ns); static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns) { if (ns) refcount_inc(&ns->ns.count); return ns; } static inline struct ipc_namespace *get_ipc_ns_not_zero(struct ipc_namespace *ns) { if (ns) { if (refcount_inc_not_zero(&ns->ns.count)) return ns; } return NULL; } extern void put_ipc_ns(struct ipc_namespace *ns); #else static inline struct ipc_namespace *copy_ipcs(unsigned long flags, struct user_namespace *user_ns, struct ipc_namespace *ns) { if (flags & CLONE_NEWIPC) return ERR_PTR(-EINVAL); return ns; } static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns) { return ns; } static inline struct ipc_namespace *get_ipc_ns_not_zero(struct ipc_namespace *ns) { return ns; } static inline void put_ipc_ns(struct ipc_namespace *ns) { } #endif #ifdef CONFIG_POSIX_MQUEUE_SYSCTL void retire_mq_sysctls(struct ipc_namespace *ns); bool setup_mq_sysctls(struct ipc_namespace *ns); #else /* CONFIG_POSIX_MQUEUE_SYSCTL */ static inline void retire_mq_sysctls(struct ipc_namespace *ns) { } static inline bool setup_mq_sysctls(struct ipc_namespace *ns) { return true; } #endif /* CONFIG_POSIX_MQUEUE_SYSCTL */ #ifdef CONFIG_SYSVIPC_SYSCTL bool setup_ipc_sysctls(struct ipc_namespace *ns); void retire_ipc_sysctls(struct ipc_namespace *ns); #else /* CONFIG_SYSVIPC_SYSCTL */ static inline void retire_ipc_sysctls(struct ipc_namespace *ns) { } static inline bool setup_ipc_sysctls(struct ipc_namespace *ns) { return true; } #endif /* CONFIG_SYSVIPC_SYSCTL */ #endif
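A small usage sketch of the reference-counting helpers declared above: pin the current task's IPC namespace, read one of its limits, then drop the reference. The helper name is hypothetical and only serves to show the get/put pairing.

#include <linux/ipc_namespace.h>
#include <linux/nsproxy.h>
#include <linux/sched.h>

static unsigned int peek_msg_ctlmax(void)
{
	/* current->nsproxy is stable for the current task */
	struct ipc_namespace *ns = get_ipc_ns(current->nsproxy->ipc_ns);
	unsigned int max = ns->msg_ctlmax;

	put_ipc_ns(ns);		/* drop the reference taken above */
	return max;
}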
/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef LINUX_RESUME_USER_MODE_H
#define LINUX_RESUME_USER_MODE_H

#include <linux/sched.h>
#include <linux/task_work.h>
#include <linux/memcontrol.h>
#include <linux/rseq.h>
#include <linux/blk-cgroup.h>

/**
 * set_notify_resume - cause resume_user_mode_work() to be called
 * @task:		task that will call resume_user_mode_work()
 *
 * Calling this arranges that @task will call resume_user_mode_work()
 * before returning to user mode.  If it's already running in user mode,
 * it will enter the kernel and call resume_user_mode_work() soon.
 * If it's blocked, it will not be woken.
 */
static inline void set_notify_resume(struct task_struct *task)
{
	if (!test_and_set_tsk_thread_flag(task, TIF_NOTIFY_RESUME))
		kick_process(task);
}

/**
 * resume_user_mode_work - Perform work before returning to user mode
 * @regs:		user-mode registers of @current task
 *
 * This is called when %TIF_NOTIFY_RESUME has been set.  Now we are
 * about to return to user mode, and the user state in @regs can be
 * inspected or adjusted.  The caller in arch code has cleared
 * %TIF_NOTIFY_RESUME before the call.  If the flag gets set again
 * asynchronously, this will be called again before we return to
 * user mode.
 *
 * Called without locks.
 */
static inline void resume_user_mode_work(struct pt_regs *regs)
{
	clear_thread_flag(TIF_NOTIFY_RESUME);
	/*
	 * This barrier pairs with task_work_add()->set_notify_resume() after
	 * hlist_add_head(task->task_works);
	 */
	smp_mb__after_atomic();
	if (unlikely(task_work_pending(current)))
		task_work_run();

#ifdef CONFIG_KEYS_REQUEST_CACHE
	if (unlikely(current->cached_requested_key)) {
		key_put(current->cached_requested_key);
		current->cached_requested_key = NULL;
	}
#endif

	mem_cgroup_handle_over_high(GFP_KERNEL);
	blkcg_maybe_throttle_current();

	rseq_handle_notify_resume(NULL, regs);
}

#endif /* LINUX_RESUME_USER_MODE_H */
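An illustrative sketch of how an architecture's exit-to-user path would drive resume_user_mode_work(). Real architectures do this from their generic-entry or assembly return code; the function name below is invented for the sketch and the loop structure is a simplifying assumption.

#include <linux/resume_user_mode.h>
#include <linux/thread_info.h>

static void arch_exit_to_user_sketch(struct pt_regs *regs)
{
	/*
	 * Re-check in a loop: the work function may cause
	 * TIF_NOTIFY_RESUME to be set again (e.g. via task_work_add())
	 * while we are still in the kernel.
	 */
	while (test_thread_flag(TIF_NOTIFY_RESUME))
		resume_user_mode_work(regs);
}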
896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 // SPDX-License-Identifier: GPL-2.0-or-later /* * CCM: Counter with CBC-MAC * * (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com> */ #include <crypto/internal/aead.h> #include <crypto/internal/cipher.h> #include <crypto/internal/hash.h> #include <crypto/internal/skcipher.h> #include <crypto/scatterwalk.h> #include <linux/err.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/slab.h> struct ccm_instance_ctx { struct crypto_skcipher_spawn ctr; struct crypto_ahash_spawn mac; }; struct crypto_ccm_ctx { struct crypto_ahash *mac; struct crypto_skcipher *ctr; }; struct crypto_rfc4309_ctx { struct crypto_aead *child; u8 nonce[3]; }; struct crypto_rfc4309_req_ctx { struct scatterlist src[3]; struct scatterlist dst[3]; struct aead_request subreq; }; struct crypto_ccm_req_priv_ctx { u8 odata[16]; u8 idata[16]; u8 auth_tag[16]; u32 flags; struct scatterlist src[3]; struct scatterlist dst[3]; union { struct ahash_request ahreq; struct skcipher_request skreq; }; }; struct cbcmac_tfm_ctx { struct crypto_cipher *child; }; struct cbcmac_desc_ctx { unsigned int len; u8 dg[]; }; static inline struct crypto_ccm_req_priv_ctx *crypto_ccm_reqctx( struct aead_request *req) { unsigned long align = crypto_aead_alignmask(crypto_aead_reqtfm(req)); return (void *)PTR_ALIGN((u8 *)aead_request_ctx(req), align + 1); } static int set_msg_len(u8 *block, unsigned int msglen, int csize) { __be32 data; memset(block, 0, csize); block += csize; if (csize >= 4) csize = 4; else if (msglen > (1 << (8 * csize))) return -EOVERFLOW; data = cpu_to_be32(msglen); memcpy(block - csize, (u8 *)&data + 4 - csize, csize); return 0; } static int crypto_ccm_setkey(struct crypto_aead *aead, const u8 *key, unsigned int keylen) { struct crypto_ccm_ctx *ctx = crypto_aead_ctx(aead); struct crypto_skcipher *ctr = ctx->ctr; struct crypto_ahash *mac = ctx->mac; int err; crypto_skcipher_clear_flags(ctr, CRYPTO_TFM_REQ_MASK); crypto_skcipher_set_flags(ctr, crypto_aead_get_flags(aead) & CRYPTO_TFM_REQ_MASK); err = crypto_skcipher_setkey(ctr, key, keylen); if (err) return err; crypto_ahash_clear_flags(mac, CRYPTO_TFM_REQ_MASK); crypto_ahash_set_flags(mac, crypto_aead_get_flags(aead) & CRYPTO_TFM_REQ_MASK); return crypto_ahash_setkey(mac, key, keylen); } static int crypto_ccm_setauthsize(struct crypto_aead *tfm, unsigned int authsize) { switch (authsize) { case 4: case 6: case 8: case 10: case 12: case 14: case 16: break; default: return -EINVAL; } return 0; } static int format_input(u8 *info, struct aead_request *req, unsigned int cryptlen) { struct crypto_aead *aead = crypto_aead_reqtfm(req); unsigned int lp = req->iv[0]; unsigned int l = lp + 1; unsigned int m; m = crypto_aead_authsize(aead); memcpy(info, req->iv, 16); /* format control info per RFC 3610 and * NIST Special Publication 800-38C */ *info |= (8 * ((m - 2) / 2)); if (req->assoclen) *info |= 64; return set_msg_len(info + 16 - l, cryptlen, l); } static int format_adata(u8 *adata, unsigned int a) { int len = 0; /* add control info for associated data * RFC 3610 and NIST Special Publication 800-38C */ if (a < 65280) { *(__be16 *)adata = cpu_to_be16(a); len = 2; } else { *(__be16 *)adata = cpu_to_be16(0xfffe); *(__be32 *)&adata[2] = cpu_to_be32(a); len = 6; } return len; } static int crypto_ccm_auth(struct 
aead_request *req, struct scatterlist *plain, unsigned int cryptlen) { struct crypto_ccm_req_priv_ctx *pctx = crypto_ccm_reqctx(req); struct crypto_aead *aead = crypto_aead_reqtfm(req); struct crypto_ccm_ctx *ctx = crypto_aead_ctx(aead); struct ahash_request *ahreq = &pctx->ahreq; unsigned int assoclen = req->assoclen; struct scatterlist sg[3]; u8 *odata = pctx->odata; u8 *idata = pctx->idata; int ilen, err; /* format control data for input */ err = format_input(odata, req, cryptlen); if (err) goto out; sg_init_table(sg, 3); sg_set_buf(&sg[0], odata, 16); /* format associated data and compute into mac */ if (assoclen) { ilen = format_adata(idata, assoclen); sg_set_buf(&sg[1], idata, ilen); sg_chain(sg, 3, req->src); } else { ilen = 0; sg_chain(sg, 2, req->src); } ahash_request_set_tfm(ahreq, ctx->mac); ahash_request_set_callback(ahreq, pctx->flags, NULL, NULL); ahash_request_set_crypt(ahreq, sg, NULL, assoclen + ilen + 16); err = crypto_ahash_init(ahreq); if (err) goto out; err = crypto_ahash_update(ahreq); if (err) goto out; /* we need to pad the MAC input to a round multiple of the block size */ ilen = 16 - (assoclen + ilen) % 16; if (ilen < 16) { memset(idata, 0, ilen); sg_init_table(sg, 2); sg_set_buf(&sg[0], idata, ilen); if (plain) sg_chain(sg, 2, plain); plain = sg; cryptlen += ilen; } ahash_request_set_crypt(ahreq, plain, odata, cryptlen); err = crypto_ahash_finup(ahreq); out: return err; } static void crypto_ccm_encrypt_done(void *data, int err) { struct aead_request *req = data; struct crypto_aead *aead = crypto_aead_reqtfm(req); struct crypto_ccm_req_priv_ctx *pctx = crypto_ccm_reqctx(req); u8 *odata = pctx->odata; if (!err) scatterwalk_map_and_copy(odata, req->dst, req->assoclen + req->cryptlen, crypto_aead_authsize(aead), 1); aead_request_complete(req, err); } static inline int crypto_ccm_check_iv(const u8 *iv) { /* 2 <= L <= 8, so 1 <= L' <= 7. */ if (1 > iv[0] || iv[0] > 7) return -EINVAL; return 0; } static int crypto_ccm_init_crypt(struct aead_request *req, u8 *tag) { struct crypto_ccm_req_priv_ctx *pctx = crypto_ccm_reqctx(req); struct scatterlist *sg; u8 *iv = req->iv; int err; err = crypto_ccm_check_iv(iv); if (err) return err; pctx->flags = aead_request_flags(req); /* Note: rfc 3610 and NIST 800-38C require counter of * zero to encrypt auth tag. 
*/ memset(iv + 15 - iv[0], 0, iv[0] + 1); sg_init_table(pctx->src, 3); sg_set_buf(pctx->src, tag, 16); sg = scatterwalk_ffwd(pctx->src + 1, req->src, req->assoclen); if (sg != pctx->src + 1) sg_chain(pctx->src, 2, sg); if (req->src != req->dst) { sg_init_table(pctx->dst, 3); sg_set_buf(pctx->dst, tag, 16); sg = scatterwalk_ffwd(pctx->dst + 1, req->dst, req->assoclen); if (sg != pctx->dst + 1) sg_chain(pctx->dst, 2, sg); } return 0; } static int crypto_ccm_encrypt(struct aead_request *req) { struct crypto_aead *aead = crypto_aead_reqtfm(req); struct crypto_ccm_ctx *ctx = crypto_aead_ctx(aead); struct crypto_ccm_req_priv_ctx *pctx = crypto_ccm_reqctx(req); struct skcipher_request *skreq = &pctx->skreq; struct scatterlist *dst; unsigned int cryptlen = req->cryptlen; u8 *odata = pctx->odata; u8 *iv = req->iv; int err; err = crypto_ccm_init_crypt(req, odata); if (err) return err; err = crypto_ccm_auth(req, sg_next(pctx->src), cryptlen); if (err) return err; dst = pctx->src; if (req->src != req->dst) dst = pctx->dst; skcipher_request_set_tfm(skreq, ctx->ctr); skcipher_request_set_callback(skreq, pctx->flags, crypto_ccm_encrypt_done, req); skcipher_request_set_crypt(skreq, pctx->src, dst, cryptlen + 16, iv); err = crypto_skcipher_encrypt(skreq); if (err) return err; /* copy authtag to end of dst */ scatterwalk_map_and_copy(odata, sg_next(dst), cryptlen, crypto_aead_authsize(aead), 1); return err; } static void crypto_ccm_decrypt_done(void *data, int err) { struct aead_request *req = data; struct crypto_ccm_req_priv_ctx *pctx = crypto_ccm_reqctx(req); struct crypto_aead *aead = crypto_aead_reqtfm(req); unsigned int authsize = crypto_aead_authsize(aead); unsigned int cryptlen = req->cryptlen - authsize; struct scatterlist *dst; pctx->flags = 0; dst = sg_next(req->src == req->dst ? 
pctx->src : pctx->dst); if (!err) { err = crypto_ccm_auth(req, dst, cryptlen); if (!err && crypto_memneq(pctx->auth_tag, pctx->odata, authsize)) err = -EBADMSG; } aead_request_complete(req, err); } static int crypto_ccm_decrypt(struct aead_request *req) { struct crypto_aead *aead = crypto_aead_reqtfm(req); struct crypto_ccm_ctx *ctx = crypto_aead_ctx(aead); struct crypto_ccm_req_priv_ctx *pctx = crypto_ccm_reqctx(req); struct skcipher_request *skreq = &pctx->skreq; struct scatterlist *dst; unsigned int authsize = crypto_aead_authsize(aead); unsigned int cryptlen = req->cryptlen; u8 *authtag = pctx->auth_tag; u8 *odata = pctx->odata; u8 *iv = pctx->idata; int err; cryptlen -= authsize; err = crypto_ccm_init_crypt(req, authtag); if (err) return err; scatterwalk_map_and_copy(authtag, sg_next(pctx->src), cryptlen, authsize, 0); dst = pctx->src; if (req->src != req->dst) dst = pctx->dst; memcpy(iv, req->iv, 16); skcipher_request_set_tfm(skreq, ctx->ctr); skcipher_request_set_callback(skreq, pctx->flags, crypto_ccm_decrypt_done, req); skcipher_request_set_crypt(skreq, pctx->src, dst, cryptlen + 16, iv); err = crypto_skcipher_decrypt(skreq); if (err) return err; err = crypto_ccm_auth(req, sg_next(dst), cryptlen); if (err) return err; /* verify */ if (crypto_memneq(authtag, odata, authsize)) return -EBADMSG; return err; } static int crypto_ccm_init_tfm(struct crypto_aead *tfm) { struct aead_instance *inst = aead_alg_instance(tfm); struct ccm_instance_ctx *ictx = aead_instance_ctx(inst); struct crypto_ccm_ctx *ctx = crypto_aead_ctx(tfm); struct crypto_ahash *mac; struct crypto_skcipher *ctr; unsigned long align; int err; mac = crypto_spawn_ahash(&ictx->mac); if (IS_ERR(mac)) return PTR_ERR(mac); ctr = crypto_spawn_skcipher(&ictx->ctr); err = PTR_ERR(ctr); if (IS_ERR(ctr)) goto err_free_mac; ctx->mac = mac; ctx->ctr = ctr; align = crypto_aead_alignmask(tfm); align &= ~(crypto_tfm_ctx_alignment() - 1); crypto_aead_set_reqsize( tfm, align + sizeof(struct crypto_ccm_req_priv_ctx) + max(crypto_ahash_reqsize(mac), crypto_skcipher_reqsize(ctr))); return 0; err_free_mac: crypto_free_ahash(mac); return err; } static void crypto_ccm_exit_tfm(struct crypto_aead *tfm) { struct crypto_ccm_ctx *ctx = crypto_aead_ctx(tfm); crypto_free_ahash(ctx->mac); crypto_free_skcipher(ctx->ctr); } static void crypto_ccm_free(struct aead_instance *inst) { struct ccm_instance_ctx *ctx = aead_instance_ctx(inst); crypto_drop_ahash(&ctx->mac); crypto_drop_skcipher(&ctx->ctr); kfree(inst); } static int crypto_ccm_create_common(struct crypto_template *tmpl, struct rtattr **tb, const char *ctr_name, const char *mac_name) { struct skcipher_alg_common *ctr; u32 mask; struct aead_instance *inst; struct ccm_instance_ctx *ictx; struct hash_alg_common *mac; int err; err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_AEAD, &mask); if (err) return err; inst = kzalloc(sizeof(*inst) + sizeof(*ictx), GFP_KERNEL); if (!inst) return -ENOMEM; ictx = aead_instance_ctx(inst); err = crypto_grab_ahash(&ictx->mac, aead_crypto_instance(inst), mac_name, 0, mask | CRYPTO_ALG_ASYNC); if (err) goto err_free_inst; mac = crypto_spawn_ahash_alg(&ictx->mac); err = -EINVAL; if (strncmp(mac->base.cra_name, "cbcmac(", 7) != 0 || mac->digestsize != 16) goto err_free_inst; err = crypto_grab_skcipher(&ictx->ctr, aead_crypto_instance(inst), ctr_name, 0, mask); if (err) goto err_free_inst; ctr = crypto_spawn_skcipher_alg_common(&ictx->ctr); /* The skcipher algorithm must be CTR mode, using 16-byte blocks. 
*/ err = -EINVAL; if (strncmp(ctr->base.cra_name, "ctr(", 4) != 0 || ctr->ivsize != 16 || ctr->base.cra_blocksize != 1) goto err_free_inst; /* ctr and cbcmac must use the same underlying block cipher. */ if (strcmp(ctr->base.cra_name + 4, mac->base.cra_name + 7) != 0) goto err_free_inst; err = -ENAMETOOLONG; if (snprintf(inst->alg.base.cra_name, CRYPTO_MAX_ALG_NAME, "ccm(%s", ctr->base.cra_name + 4) >= CRYPTO_MAX_ALG_NAME) goto err_free_inst; if (snprintf(inst->alg.base.cra_driver_name, CRYPTO_MAX_ALG_NAME, "ccm_base(%s,%s)", ctr->base.cra_driver_name, mac->base.cra_driver_name) >= CRYPTO_MAX_ALG_NAME) goto err_free_inst; inst->alg.base.cra_priority = (mac->base.cra_priority + ctr->base.cra_priority) / 2; inst->alg.base.cra_blocksize = 1; inst->alg.base.cra_alignmask = ctr->base.cra_alignmask; inst->alg.ivsize = 16; inst->alg.chunksize = ctr->chunksize; inst->alg.maxauthsize = 16; inst->alg.base.cra_ctxsize = sizeof(struct crypto_ccm_ctx); inst->alg.init = crypto_ccm_init_tfm; inst->alg.exit = crypto_ccm_exit_tfm; inst->alg.setkey = crypto_ccm_setkey; inst->alg.setauthsize = crypto_ccm_setauthsize; inst->alg.encrypt = crypto_ccm_encrypt; inst->alg.decrypt = crypto_ccm_decrypt; inst->free = crypto_ccm_free; err = aead_register_instance(tmpl, inst); if (err) { err_free_inst: crypto_ccm_free(inst); } return err; } static int crypto_ccm_create(struct crypto_template *tmpl, struct rtattr **tb) { const char *cipher_name; char ctr_name[CRYPTO_MAX_ALG_NAME]; char mac_name[CRYPTO_MAX_ALG_NAME]; cipher_name = crypto_attr_alg_name(tb[1]); if (IS_ERR(cipher_name)) return PTR_ERR(cipher_name); if (snprintf(ctr_name, CRYPTO_MAX_ALG_NAME, "ctr(%s)", cipher_name) >= CRYPTO_MAX_ALG_NAME) return -ENAMETOOLONG; if (snprintf(mac_name, CRYPTO_MAX_ALG_NAME, "cbcmac(%s)", cipher_name) >= CRYPTO_MAX_ALG_NAME) return -ENAMETOOLONG; return crypto_ccm_create_common(tmpl, tb, ctr_name, mac_name); } static int crypto_ccm_base_create(struct crypto_template *tmpl, struct rtattr **tb) { const char *ctr_name; const char *mac_name; ctr_name = crypto_attr_alg_name(tb[1]); if (IS_ERR(ctr_name)) return PTR_ERR(ctr_name); mac_name = crypto_attr_alg_name(tb[2]); if (IS_ERR(mac_name)) return PTR_ERR(mac_name); return crypto_ccm_create_common(tmpl, tb, ctr_name, mac_name); } static int crypto_rfc4309_setkey(struct crypto_aead *parent, const u8 *key, unsigned int keylen) { struct crypto_rfc4309_ctx *ctx = crypto_aead_ctx(parent); struct crypto_aead *child = ctx->child; if (keylen < 3) return -EINVAL; keylen -= 3; memcpy(ctx->nonce, key + keylen, 3); crypto_aead_clear_flags(child, CRYPTO_TFM_REQ_MASK); crypto_aead_set_flags(child, crypto_aead_get_flags(parent) & CRYPTO_TFM_REQ_MASK); return crypto_aead_setkey(child, key, keylen); } static int crypto_rfc4309_setauthsize(struct crypto_aead *parent, unsigned int authsize) { struct crypto_rfc4309_ctx *ctx = crypto_aead_ctx(parent); switch (authsize) { case 8: case 12: case 16: break; default: return -EINVAL; } return crypto_aead_setauthsize(ctx->child, authsize); } static struct aead_request *crypto_rfc4309_crypt(struct aead_request *req) { struct crypto_rfc4309_req_ctx *rctx = aead_request_ctx(req); struct aead_request *subreq = &rctx->subreq; struct crypto_aead *aead = crypto_aead_reqtfm(req); struct crypto_rfc4309_ctx *ctx = crypto_aead_ctx(aead); struct crypto_aead *child = ctx->child; struct scatterlist *sg; u8 *iv = PTR_ALIGN((u8 *)(subreq + 1) + crypto_aead_reqsize(child), crypto_aead_alignmask(child) + 1); /* L' */ iv[0] = 3; memcpy(iv + 1, ctx->nonce, 3); memcpy(iv + 4, 
req->iv, 8); scatterwalk_map_and_copy(iv + 16, req->src, 0, req->assoclen - 8, 0); sg_init_table(rctx->src, 3); sg_set_buf(rctx->src, iv + 16, req->assoclen - 8); sg = scatterwalk_ffwd(rctx->src + 1, req->src, req->assoclen); if (sg != rctx->src + 1) sg_chain(rctx->src, 2, sg); if (req->src != req->dst) { sg_init_table(rctx->dst, 3); sg_set_buf(rctx->dst, iv + 16, req->assoclen - 8); sg = scatterwalk_ffwd(rctx->dst + 1, req->dst, req->assoclen); if (sg != rctx->dst + 1) sg_chain(rctx->dst, 2, sg); } aead_request_set_tfm(subreq, child); aead_request_set_callback(subreq, req->base.flags, req->base.complete, req->base.data); aead_request_set_crypt(subreq, rctx->src, req->src == req->dst ? rctx->src : rctx->dst, req->cryptlen, iv); aead_request_set_ad(subreq, req->assoclen - 8); return subreq; } static int crypto_rfc4309_encrypt(struct aead_request *req) { if (req->assoclen != 16 && req->assoclen != 20) return -EINVAL; req = crypto_rfc4309_crypt(req); return crypto_aead_encrypt(req); } static int crypto_rfc4309_decrypt(struct aead_request *req) { if (req->assoclen != 16 && req->assoclen != 20) return -EINVAL; req = crypto_rfc4309_crypt(req); return crypto_aead_decrypt(req); } static int crypto_rfc4309_init_tfm(struct crypto_aead *tfm) { struct aead_instance *inst = aead_alg_instance(tfm); struct crypto_aead_spawn *spawn = aead_instance_ctx(inst); struct crypto_rfc4309_ctx *ctx = crypto_aead_ctx(tfm); struct crypto_aead *aead; unsigned long align; aead = crypto_spawn_aead(spawn); if (IS_ERR(aead)) return PTR_ERR(aead); ctx->child = aead; align = crypto_aead_alignmask(aead); align &= ~(crypto_tfm_ctx_alignment() - 1); crypto_aead_set_reqsize( tfm, sizeof(struct crypto_rfc4309_req_ctx) + ALIGN(crypto_aead_reqsize(aead), crypto_tfm_ctx_alignment()) + align + 32); return 0; } static void crypto_rfc4309_exit_tfm(struct crypto_aead *tfm) { struct crypto_rfc4309_ctx *ctx = crypto_aead_ctx(tfm); crypto_free_aead(ctx->child); } static void crypto_rfc4309_free(struct aead_instance *inst) { crypto_drop_aead(aead_instance_ctx(inst)); kfree(inst); } static int crypto_rfc4309_create(struct crypto_template *tmpl, struct rtattr **tb) { u32 mask; struct aead_instance *inst; struct crypto_aead_spawn *spawn; struct aead_alg *alg; int err; err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_AEAD, &mask); if (err) return err; inst = kzalloc(sizeof(*inst) + sizeof(*spawn), GFP_KERNEL); if (!inst) return -ENOMEM; spawn = aead_instance_ctx(inst); err = crypto_grab_aead(spawn, aead_crypto_instance(inst), crypto_attr_alg_name(tb[1]), 0, mask); if (err) goto err_free_inst; alg = crypto_spawn_aead_alg(spawn); err = -EINVAL; /* We only support 16-byte blocks. */ if (crypto_aead_alg_ivsize(alg) != 16) goto err_free_inst; /* Not a stream cipher? 
*/ if (alg->base.cra_blocksize != 1) goto err_free_inst; err = -ENAMETOOLONG; if (snprintf(inst->alg.base.cra_name, CRYPTO_MAX_ALG_NAME, "rfc4309(%s)", alg->base.cra_name) >= CRYPTO_MAX_ALG_NAME || snprintf(inst->alg.base.cra_driver_name, CRYPTO_MAX_ALG_NAME, "rfc4309(%s)", alg->base.cra_driver_name) >= CRYPTO_MAX_ALG_NAME) goto err_free_inst; inst->alg.base.cra_priority = alg->base.cra_priority; inst->alg.base.cra_blocksize = 1; inst->alg.base.cra_alignmask = alg->base.cra_alignmask; inst->alg.ivsize = 8; inst->alg.chunksize = crypto_aead_alg_chunksize(alg); inst->alg.maxauthsize = 16; inst->alg.base.cra_ctxsize = sizeof(struct crypto_rfc4309_ctx); inst->alg.init = crypto_rfc4309_init_tfm; inst->alg.exit = crypto_rfc4309_exit_tfm; inst->alg.setkey = crypto_rfc4309_setkey; inst->alg.setauthsize = crypto_rfc4309_setauthsize; inst->alg.encrypt = crypto_rfc4309_encrypt; inst->alg.decrypt = crypto_rfc4309_decrypt; inst->free = crypto_rfc4309_free; err = aead_register_instance(tmpl, inst); if (err) { err_free_inst: crypto_rfc4309_free(inst); } return err; } static int crypto_cbcmac_digest_setkey(struct crypto_shash *parent, const u8 *inkey, unsigned int keylen) { struct cbcmac_tfm_ctx *ctx = crypto_shash_ctx(parent); return crypto_cipher_setkey(ctx->child, inkey, keylen); } static int crypto_cbcmac_digest_init(struct shash_desc *pdesc) { struct cbcmac_desc_ctx *ctx = shash_desc_ctx(pdesc); int bs = crypto_shash_digestsize(pdesc->tfm); ctx->len = 0; memset(ctx->dg, 0, bs); return 0; } static int crypto_cbcmac_digest_update(struct shash_desc *pdesc, const u8 *p, unsigned int len) { struct crypto_shash *parent = pdesc->tfm; struct cbcmac_tfm_ctx *tctx = crypto_shash_ctx(parent); struct cbcmac_desc_ctx *ctx = shash_desc_ctx(pdesc); struct crypto_cipher *tfm = tctx->child; int bs = crypto_shash_digestsize(parent); while (len > 0) { unsigned int l = min(len, bs - ctx->len); crypto_xor(&ctx->dg[ctx->len], p, l); ctx->len +=l; len -= l; p += l; if (ctx->len == bs) { crypto_cipher_encrypt_one(tfm, ctx->dg, ctx->dg); ctx->len = 0; } } return 0; } static int crypto_cbcmac_digest_final(struct shash_desc *pdesc, u8 *out) { struct crypto_shash *parent = pdesc->tfm; struct cbcmac_tfm_ctx *tctx = crypto_shash_ctx(parent); struct cbcmac_desc_ctx *ctx = shash_desc_ctx(pdesc); struct crypto_cipher *tfm = tctx->child; int bs = crypto_shash_digestsize(parent); if (ctx->len) crypto_cipher_encrypt_one(tfm, ctx->dg, ctx->dg); memcpy(out, ctx->dg, bs); return 0; } static int cbcmac_init_tfm(struct crypto_tfm *tfm) { struct crypto_cipher *cipher; struct crypto_instance *inst = (void *)tfm->__crt_alg; struct crypto_cipher_spawn *spawn = crypto_instance_ctx(inst); struct cbcmac_tfm_ctx *ctx = crypto_tfm_ctx(tfm); cipher = crypto_spawn_cipher(spawn); if (IS_ERR(cipher)) return PTR_ERR(cipher); ctx->child = cipher; return 0; }; static void cbcmac_exit_tfm(struct crypto_tfm *tfm) { struct cbcmac_tfm_ctx *ctx = crypto_tfm_ctx(tfm); crypto_free_cipher(ctx->child); } static int cbcmac_create(struct crypto_template *tmpl, struct rtattr **tb) { struct shash_instance *inst; struct crypto_cipher_spawn *spawn; struct crypto_alg *alg; u32 mask; int err; err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_SHASH, &mask); if (err) return err; inst = kzalloc(sizeof(*inst) + sizeof(*spawn), GFP_KERNEL); if (!inst) return -ENOMEM; spawn = shash_instance_ctx(inst); err = crypto_grab_cipher(spawn, shash_crypto_instance(inst), crypto_attr_alg_name(tb[1]), 0, mask); if (err) goto err_free_inst; alg = crypto_spawn_cipher_alg(spawn); err = 
crypto_inst_setname(shash_crypto_instance(inst), tmpl->name, alg); if (err) goto err_free_inst; inst->alg.base.cra_priority = alg->cra_priority; inst->alg.base.cra_blocksize = 1; inst->alg.digestsize = alg->cra_blocksize; inst->alg.descsize = sizeof(struct cbcmac_desc_ctx) + alg->cra_blocksize; inst->alg.base.cra_ctxsize = sizeof(struct cbcmac_tfm_ctx); inst->alg.base.cra_init = cbcmac_init_tfm; inst->alg.base.cra_exit = cbcmac_exit_tfm; inst->alg.init = crypto_cbcmac_digest_init; inst->alg.update = crypto_cbcmac_digest_update; inst->alg.final = crypto_cbcmac_digest_final; inst->alg.setkey = crypto_cbcmac_digest_setkey; inst->free = shash_free_singlespawn_instance; err = shash_register_instance(tmpl, inst); if (err) { err_free_inst: shash_free_singlespawn_instance(inst); } return err; } static struct crypto_template crypto_ccm_tmpls[] = { { .name = "cbcmac", .create = cbcmac_create, .module = THIS_MODULE, }, { .name = "ccm_base", .create = crypto_ccm_base_create, .module = THIS_MODULE, }, { .name = "ccm", .create = crypto_ccm_create, .module = THIS_MODULE, }, { .name = "rfc4309", .create = crypto_rfc4309_create, .module = THIS_MODULE, }, }; static int __init crypto_ccm_module_init(void) { return crypto_register_templates(crypto_ccm_tmpls, ARRAY_SIZE(crypto_ccm_tmpls)); } static void __exit crypto_ccm_module_exit(void) { crypto_unregister_templates(crypto_ccm_tmpls, ARRAY_SIZE(crypto_ccm_tmpls)); } subsys_initcall(crypto_ccm_module_init); module_exit(crypto_ccm_module_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Counter with CBC MAC"); MODULE_ALIAS_CRYPTO("ccm_base"); MODULE_ALIAS_CRYPTO("rfc4309"); MODULE_ALIAS_CRYPTO("ccm"); MODULE_ALIAS_CRYPTO("cbcmac"); MODULE_IMPORT_NS("CRYPTO_INTERNAL");
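A hedged sketch of driving the "ccm(aes)" template above through the in-kernel AEAD API, mainly to show the IV layout the code expects: iv[0] holds L' (here 3, i.e. an 11-byte nonce followed by a 4-byte counter the implementation zeroes itself). The buffer layout (associated data, then plaintext, then 16 spare bytes for the tag) and all names are assumptions for illustration, not part of crypto/ccm.c.

#include <crypto/aead.h>
#include <linux/err.h>
#include <linux/random.h>
#include <linux/scatterlist.h>

static int ccm_encrypt_sketch(const u8 *key, unsigned int keylen,
			      u8 *buf, unsigned int assoclen,
			      unsigned int ptlen)
{
	DECLARE_CRYPTO_WAIT(wait);
	struct crypto_aead *tfm;
	struct aead_request *req;
	struct scatterlist sg;
	u8 iv[16] = { 3 };	/* L' = 3: 11-byte nonce, 4-byte counter */
	int err;

	tfm = crypto_alloc_aead("ccm(aes)", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	err = crypto_aead_setkey(tfm, key, keylen);
	if (!err)
		err = crypto_aead_setauthsize(tfm, 16);
	if (err)
		goto out_free_tfm;

	req = aead_request_alloc(tfm, GFP_KERNEL);
	if (!req) {
		err = -ENOMEM;
		goto out_free_tfm;
	}

	get_random_bytes(iv + 1, 11);	/* nonce must never repeat per key */
	sg_init_one(&sg, buf, assoclen + ptlen + 16);

	aead_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP,
				  crypto_req_done, &wait);
	aead_request_set_ad(req, assoclen);
	aead_request_set_crypt(req, &sg, &sg, ptlen, iv);

	err = crypto_wait_req(crypto_aead_encrypt(req), &wait);

	aead_request_free(req);
out_free_tfm:
	crypto_free_aead(tfm);
	return err;
}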
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Global definitions for the Ethernet IEEE 802.3 interface.
 *
 * Version:	@(#)if_ether.h	1.0.1a	02/08/94
 *
 * Author:	Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Donald Becker, <becker@super.org>
 *		Alan Cox, <alan@lxorguk.ukuu.org.uk>
 *		Steve Whitehouse, <gw7rrm@eeshack3.swan.ac.uk>
 */
#ifndef _LINUX_IF_ETHER_H
#define _LINUX_IF_ETHER_H

#include <linux/skbuff.h>
#include <uapi/linux/if_ether.h>

static inline struct ethhdr *eth_hdr(const struct sk_buff *skb)
{
	return (struct ethhdr *)skb_mac_header(skb);
}

/* Prefer this version in TX path, instead of
 * skb_reset_mac_header() + eth_hdr()
 */
static inline struct ethhdr *skb_eth_hdr(const struct sk_buff *skb)
{
	return (struct ethhdr *)skb->data;
}

static inline struct ethhdr *inner_eth_hdr(const struct sk_buff *skb)
{
	return (struct ethhdr *)skb_inner_mac_header(skb);
}

int eth_header_parse(const struct sk_buff *skb, unsigned char *haddr);

extern ssize_t sysfs_format_mac(char *buf, const unsigned char *addr, int len);

#endif	/* _LINUX_IF_ETHER_H */
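A small usage sketch of eth_hdr() above, assuming an RX-path skb whose mac header has already been set (e.g. by eth_type_trans()). The helper name is illustrative only.

#include <linux/if_ether.h>
#include <linux/etherdevice.h>

static bool is_ipv4_frame(const struct sk_buff *skb)
{
	const struct ethhdr *eh = eth_hdr(skb);

	/* h_proto is stored in network byte order */
	return eh->h_proto == htons(ETH_P_IP);
}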
/* +++ deflate.c */
/* deflate.c -- compress data using the deflation algorithm
 * Copyright (C) 1995-1996 Jean-loup Gailly.
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

/*
 *  ALGORITHM
 *
 *      The "deflation" process depends on being able to identify portions
 *      of the input text which are identical to earlier input (within a
 *      sliding window trailing behind the input currently being processed).
 *
 *      The most straightforward technique turns out to be the fastest for
 *      most input files: try all possible matches and select the longest.
 *      The key feature of this algorithm is that insertions into the string
 *      dictionary are very simple and thus fast, and deletions are avoided
 *      completely. Insertions are performed at each input character, whereas
 *      string matches are performed only when the previous match ends. So it
 *      is preferable to spend more time in matches to allow very fast string
 *      insertions and avoid deletions. The matching algorithm for small
 *      strings is inspired from that of Rabin & Karp. A brute force approach
 *      is used to find longer strings when a small match has been found.
 *      A similar algorithm is used in comic (by Jan-Mark Wams) and freeze
 *      (by Leonid Broukhis).
 *      A previous version of this file used a more sophisticated algorithm
 *      (by Fiala and Greene) which is guaranteed to run in linear amortized
 *      time, but has a larger average cost, uses more memory and is patented.
 *      However the F&G algorithm may be faster for some highly redundant
 *      files if the parameter max_chain_length (described below) is too large.
 *
 *  ACKNOWLEDGEMENTS
 *
 *      The idea of lazy evaluation of matches is due to Jan-Mark Wams, and
 *      I found it in 'freeze' written by Leonid Broukhis.
 *      Thanks to many people for bug reports and testing.
 *
 *  REFERENCES
 *
 *      Deutsch, L.P.,"DEFLATE Compressed Data Format Specification".
 *      Available in ftp://ds.internic.net/rfc/rfc1951.txt
 *
 *      A description of the Rabin and Karp algorithm is given in the book
 *         "Algorithms" by R. Sedgewick, Addison-Wesley, p252.
 *
 *      Fiala,E.R., and Greene,D.H.
* Data Compression with Finite Windows, Comm.ACM, 32,4 (1989) 490-595 * */ #include <linux/module.h> #include <linux/zutil.h> #include "defutil.h" /* architecture-specific bits */ #ifdef CONFIG_ZLIB_DFLTCC # include "../zlib_dfltcc/dfltcc_deflate.h" #else #define DEFLATE_RESET_HOOK(strm) do {} while (0) #define DEFLATE_HOOK(strm, flush, bstate) 0 #define DEFLATE_NEED_CHECKSUM(strm) 1 #define DEFLATE_DFLTCC_ENABLED() 0 #endif /* =========================================================================== * Function prototypes. */ typedef block_state (*compress_func) (deflate_state *s, int flush); /* Compression function. Returns the block state after the call. */ static void fill_window (deflate_state *s); static block_state deflate_stored (deflate_state *s, int flush); static block_state deflate_fast (deflate_state *s, int flush); static block_state deflate_slow (deflate_state *s, int flush); static void lm_init (deflate_state *s); static void putShortMSB (deflate_state *s, uInt b); static int read_buf (z_streamp strm, Byte *buf, unsigned size); static uInt longest_match (deflate_state *s, IPos cur_match); #ifdef DEBUG_ZLIB static void check_match (deflate_state *s, IPos start, IPos match, int length); #endif /* =========================================================================== * Local data */ #define NIL 0 /* Tail of hash chains */ #ifndef TOO_FAR # define TOO_FAR 4096 #endif /* Matches of length 3 are discarded if their distance exceeds TOO_FAR */ #define MIN_LOOKAHEAD (MAX_MATCH+MIN_MATCH+1) /* Minimum amount of lookahead, except at the end of the input file. * See deflate.c for comments about the MIN_MATCH+1. */ /* Workspace to be allocated for deflate processing */ typedef struct deflate_workspace { /* State memory for the deflator */ deflate_state deflate_memory; #ifdef CONFIG_ZLIB_DFLTCC /* State memory for s390 hardware deflate */ struct dfltcc_deflate_state dfltcc_memory; #endif Byte *window_memory; Pos *prev_memory; Pos *head_memory; char *overlay_memory; } deflate_workspace; #ifdef CONFIG_ZLIB_DFLTCC /* dfltcc_state must be doubleword aligned for DFLTCC call */ static_assert(offsetof(struct deflate_workspace, dfltcc_memory) % 8 == 0); #endif /* Values for max_lazy_match, good_match and max_chain_length, depending on * the desired pack level (0..9). The values given below have been tuned to * exclude worst case performance for pathological files. Better values may be * found for specific files. */ typedef struct config_s { ush good_length; /* reduce lazy search above this match length */ ush max_lazy; /* do not perform lazy search above this match length */ ush nice_length; /* quit search above this match length */ ush max_chain; compress_func func; } config; static const config configuration_table[10] = { /* good lazy nice chain */ /* 0 */ {0, 0, 0, 0, deflate_stored}, /* store only */ /* 1 */ {4, 4, 8, 4, deflate_fast}, /* maximum speed, no lazy matches */ /* 2 */ {4, 5, 16, 8, deflate_fast}, /* 3 */ {4, 6, 32, 32, deflate_fast}, /* 4 */ {4, 4, 16, 16, deflate_slow}, /* lazy matches */ /* 5 */ {8, 16, 32, 32, deflate_slow}, /* 6 */ {8, 16, 128, 128, deflate_slow}, /* 7 */ {8, 32, 128, 256, deflate_slow}, /* 8 */ {32, 128, 258, 1024, deflate_slow}, /* 9 */ {32, 258, 258, 4096, deflate_slow}}; /* maximum compression */ /* Note: the deflate() code requires max_lazy >= MIN_MATCH and max_chain >= 4 * For deflate_fast() (levels <= 3) good is ignored and lazy has a different * meaning. 
*/ #define EQUAL 0 /* result of memcmp for equal strings */ /* =========================================================================== * Update a hash value with the given input byte * IN assertion: all calls to UPDATE_HASH are made with consecutive * input characters, so that a running hash key can be computed from the * previous key instead of complete recalculation each time. */ #define UPDATE_HASH(s,h,c) (h = (((h)<<s->hash_shift) ^ (c)) & s->hash_mask) /* =========================================================================== * Insert string str in the dictionary and set match_head to the previous head * of the hash chain (the most recent string with same hash key). Return * the previous length of the hash chain. * IN assertion: all calls to INSERT_STRING are made with consecutive * input characters and the first MIN_MATCH bytes of str are valid * (except for the last MIN_MATCH-1 bytes of the input file). */ #define INSERT_STRING(s, str, match_head) \ (UPDATE_HASH(s, s->ins_h, s->window[(str) + (MIN_MATCH-1)]), \ s->prev[(str) & s->w_mask] = match_head = s->head[s->ins_h], \ s->head[s->ins_h] = (Pos)(str)) /* =========================================================================== * Initialize the hash table (avoiding 64K overflow for 16 bit systems). * prev[] will be initialized on the fly. */ #define CLEAR_HASH(s) \ s->head[s->hash_size-1] = NIL; \ memset((char *)s->head, 0, (unsigned)(s->hash_size-1)*sizeof(*s->head)); /* ========================================================================= */ int zlib_deflateInit2( z_streamp strm, int level, int method, int windowBits, int memLevel, int strategy ) { deflate_state *s; int noheader = 0; deflate_workspace *mem; char *next; ush *overlay; /* We overlay pending_buf and d_buf+l_buf. This works since the average * output size for (length,distance) codes is <= 24 bits. */ if (strm == NULL) return Z_STREAM_ERROR; strm->msg = NULL; if (level == Z_DEFAULT_COMPRESSION) level = 6; mem = (deflate_workspace *) strm->workspace; if (windowBits < 0) { /* undocumented feature: suppress zlib header */ noheader = 1; windowBits = -windowBits; } if (memLevel < 1 || memLevel > MAX_MEM_LEVEL || method != Z_DEFLATED || windowBits < 9 || windowBits > 15 || level < 0 || level > 9 || strategy < 0 || strategy > Z_HUFFMAN_ONLY) { return Z_STREAM_ERROR; } /* * Direct the workspace's pointers to the chunks that were allocated * along with the deflate_workspace struct. */ next = (char *) mem; next += sizeof(*mem); #ifdef CONFIG_ZLIB_DFLTCC /* * DFLTCC requires the window to be page aligned. * Thus, we overallocate and take the aligned portion of the buffer. 
*/ mem->window_memory = (Byte *) PTR_ALIGN(next, PAGE_SIZE); #else mem->window_memory = (Byte *) next; #endif next += zlib_deflate_window_memsize(windowBits); mem->prev_memory = (Pos *) next; next += zlib_deflate_prev_memsize(windowBits); mem->head_memory = (Pos *) next; next += zlib_deflate_head_memsize(memLevel); mem->overlay_memory = next; s = (deflate_state *) &(mem->deflate_memory); strm->state = (struct internal_state *)s; s->strm = strm; s->noheader = noheader; s->w_bits = windowBits; s->w_size = 1 << s->w_bits; s->w_mask = s->w_size - 1; s->hash_bits = memLevel + 7; s->hash_size = 1 << s->hash_bits; s->hash_mask = s->hash_size - 1; s->hash_shift = ((s->hash_bits+MIN_MATCH-1)/MIN_MATCH); s->window = (Byte *) mem->window_memory; s->prev = (Pos *) mem->prev_memory; s->head = (Pos *) mem->head_memory; s->lit_bufsize = 1 << (memLevel + 6); /* 16K elements by default */ overlay = (ush *) mem->overlay_memory; s->pending_buf = (uch *) overlay; s->pending_buf_size = (ulg)s->lit_bufsize * (sizeof(ush)+2L); s->d_buf = overlay + s->lit_bufsize/sizeof(ush); s->l_buf = s->pending_buf + (1+sizeof(ush))*s->lit_bufsize; s->level = level; s->strategy = strategy; s->method = (Byte)method; return zlib_deflateReset(strm); } /* ========================================================================= */ int zlib_deflateReset( z_streamp strm ) { deflate_state *s; if (strm == NULL || strm->state == NULL) return Z_STREAM_ERROR; strm->total_in = strm->total_out = 0; strm->msg = NULL; strm->data_type = Z_UNKNOWN; s = (deflate_state *)strm->state; s->pending = 0; s->pending_out = s->pending_buf; if (s->noheader < 0) { s->noheader = 0; /* was set to -1 by deflate(..., Z_FINISH); */ } s->status = s->noheader ? BUSY_STATE : INIT_STATE; strm->adler = 1; s->last_flush = Z_NO_FLUSH; zlib_tr_init(s); lm_init(s); DEFLATE_RESET_HOOK(strm); return Z_OK; } /* ========================================================================= * Put a short in the pending buffer. The 16-bit value is put in MSB order. * IN assertion: the stream state is correct and there is enough room in * pending_buf. 
*/ static void putShortMSB( deflate_state *s, uInt b ) { put_byte(s, (Byte)(b >> 8)); put_byte(s, (Byte)(b & 0xff)); } /* ========================================================================= */ int zlib_deflate( z_streamp strm, int flush ) { int old_flush; /* value of flush param for previous deflate call */ deflate_state *s; if (strm == NULL || strm->state == NULL || flush > Z_FINISH || flush < 0) { return Z_STREAM_ERROR; } s = (deflate_state *) strm->state; if ((strm->next_in == NULL && strm->avail_in != 0) || (s->status == FINISH_STATE && flush != Z_FINISH)) { return Z_STREAM_ERROR; } if (strm->avail_out == 0) return Z_BUF_ERROR; s->strm = strm; /* just in case */ old_flush = s->last_flush; s->last_flush = flush; /* Write the zlib header */ if (s->status == INIT_STATE) { uInt header = (Z_DEFLATED + ((s->w_bits-8)<<4)) << 8; uInt level_flags = (s->level-1) >> 1; if (level_flags > 3) level_flags = 3; header |= (level_flags << 6); if (s->strstart != 0) header |= PRESET_DICT; header += 31 - (header % 31); s->status = BUSY_STATE; putShortMSB(s, header); /* Save the adler32 of the preset dictionary: */ if (s->strstart != 0) { putShortMSB(s, (uInt)(strm->adler >> 16)); putShortMSB(s, (uInt)(strm->adler & 0xffff)); } strm->adler = 1L; } /* Flush as much pending output as possible */ if (s->pending != 0) { flush_pending(strm); if (strm->avail_out == 0) { /* Since avail_out is 0, deflate will be called again with * more output space, but possibly with both pending and * avail_in equal to zero. There won't be anything to do, * but this is not an error situation so make sure we * return OK instead of BUF_ERROR at next call of deflate: */ s->last_flush = -1; return Z_OK; } /* Make sure there is something to do and avoid duplicate consecutive * flushes. For repeated and useless calls with Z_FINISH, we keep * returning Z_STREAM_END instead of Z_BUFF_ERROR. */ } else if (strm->avail_in == 0 && flush <= old_flush && flush != Z_FINISH) { return Z_BUF_ERROR; } /* User must not provide more input after the first FINISH: */ if (s->status == FINISH_STATE && strm->avail_in != 0) { return Z_BUF_ERROR; } /* Start a new block or continue the current one. */ if (strm->avail_in != 0 || s->lookahead != 0 || (flush != Z_NO_FLUSH && s->status != FINISH_STATE)) { block_state bstate; bstate = DEFLATE_HOOK(strm, flush, &bstate) ? bstate : (*(configuration_table[s->level].func))(s, flush); if (bstate == finish_started || bstate == finish_done) { s->status = FINISH_STATE; } if (bstate == need_more || bstate == finish_started) { if (strm->avail_out == 0) { s->last_flush = -1; /* avoid BUF_ERROR next call, see above */ } return Z_OK; /* If flush != Z_NO_FLUSH && avail_out == 0, the next call * of deflate should use the same flush parameter to make sure * that the flush is complete. So we don't have to output an * empty block here, this will be done at next call. This also * ensures that for a very small output buffer, we emit at most * one empty block. */ } if (bstate == block_done) { if (flush == Z_PARTIAL_FLUSH) { zlib_tr_align(s); } else if (flush == Z_PACKET_FLUSH) { /* Output just the 3-bit `stored' block type value, but not a zero length. */ zlib_tr_stored_type_only(s); } else { /* FULL_FLUSH or SYNC_FLUSH */ zlib_tr_stored_block(s, (char*)0, 0L, 0); /* For a full flush, this empty block will be recognized * as a special marker by inflate_sync(). 
*/ if (flush == Z_FULL_FLUSH) { CLEAR_HASH(s); /* forget history */ } } flush_pending(strm); if (strm->avail_out == 0) { s->last_flush = -1; /* avoid BUF_ERROR at next call, see above */ return Z_OK; } } } Assert(strm->avail_out > 0, "bug2"); if (flush != Z_FINISH) return Z_OK; if (!s->noheader) { /* Write zlib trailer (adler32) */ putShortMSB(s, (uInt)(strm->adler >> 16)); putShortMSB(s, (uInt)(strm->adler & 0xffff)); } flush_pending(strm); /* If avail_out is zero, the application will call deflate again * to flush the rest. */ if (!s->noheader) { s->noheader = -1; /* write the trailer only once! */ } if (s->pending == 0) { Assert(s->bi_valid == 0, "bi_buf not flushed"); return Z_STREAM_END; } return Z_OK; } /* ========================================================================= */ int zlib_deflateEnd( z_streamp strm ) { int status; deflate_state *s; if (strm == NULL || strm->state == NULL) return Z_STREAM_ERROR; s = (deflate_state *) strm->state; status = s->status; if (status != INIT_STATE && status != BUSY_STATE && status != FINISH_STATE) { return Z_STREAM_ERROR; } strm->state = NULL; return status == BUSY_STATE ? Z_DATA_ERROR : Z_OK; } /* =========================================================================== * Read a new buffer from the current input stream, update the adler32 * and total number of bytes read. All deflate() input goes through * this function so some applications may wish to modify it to avoid * allocating a large strm->next_in buffer and copying from it. * (See also flush_pending()). */ static int read_buf( z_streamp strm, Byte *buf, unsigned size ) { unsigned len = strm->avail_in; if (len > size) len = size; if (len == 0) return 0; strm->avail_in -= len; if (!DEFLATE_NEED_CHECKSUM(strm)) {} else if (!((deflate_state *)(strm->state))->noheader) { strm->adler = zlib_adler32(strm->adler, strm->next_in, len); } memcpy(buf, strm->next_in, len); strm->next_in += len; strm->total_in += len; return (int)len; } /* =========================================================================== * Initialize the "longest match" routines for a new zlib stream */ static void lm_init( deflate_state *s ) { s->window_size = (ulg)2L*s->w_size; CLEAR_HASH(s); /* Set the default configuration parameters: */ s->max_lazy_match = configuration_table[s->level].max_lazy; s->good_match = configuration_table[s->level].good_length; s->nice_match = configuration_table[s->level].nice_length; s->max_chain_length = configuration_table[s->level].max_chain; s->strstart = 0; s->block_start = 0L; s->lookahead = 0; s->match_length = s->prev_length = MIN_MATCH-1; s->match_available = 0; s->ins_h = 0; } /* =========================================================================== * Set match_start to the longest match starting at the given string and * return its length. Matches shorter or equal to prev_length are discarded, * in which case the result is equal to prev_length and match_start is * garbage. * IN assertions: cur_match is the head of the hash chain for the current * string (strstart) and its distance is <= MAX_DIST, and prev_length >= 1 * OUT assertion: the match length is not greater than s->lookahead. */ /* For 80x86 and 680x0, an optimized version will be provided in match.asm or * match.S. The code will be functionally equivalent. 
*/ static uInt longest_match( deflate_state *s, IPos cur_match /* current match */ ) { unsigned chain_length = s->max_chain_length;/* max hash chain length */ register Byte *scan = s->window + s->strstart; /* current string */ register Byte *match; /* matched string */ register int len; /* length of current match */ int best_len = s->prev_length; /* best match length so far */ int nice_match = s->nice_match; /* stop if match long enough */ IPos limit = s->strstart > (IPos)MAX_DIST(s) ? s->strstart - (IPos)MAX_DIST(s) : NIL; /* Stop when cur_match becomes <= limit. To simplify the code, * we prevent matches with the string of window index 0. */ Pos *prev = s->prev; uInt wmask = s->w_mask; #ifdef UNALIGNED_OK /* Compare two bytes at a time. Note: this is not always beneficial. * Try with and without -DUNALIGNED_OK to check. */ register Byte *strend = s->window + s->strstart + MAX_MATCH - 1; register ush scan_start = *(ush*)scan; register ush scan_end = *(ush*)(scan+best_len-1); #else register Byte *strend = s->window + s->strstart + MAX_MATCH; register Byte scan_end1 = scan[best_len-1]; register Byte scan_end = scan[best_len]; #endif /* The code is optimized for HASH_BITS >= 8 and MAX_MATCH-2 multiple of 16. * It is easy to get rid of this optimization if necessary. */ Assert(s->hash_bits >= 8 && MAX_MATCH == 258, "Code too clever"); /* Do not waste too much time if we already have a good match: */ if (s->prev_length >= s->good_match) { chain_length >>= 2; } /* Do not look for matches beyond the end of the input. This is necessary * to make deflate deterministic. */ if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead; Assert((ulg)s->strstart <= s->window_size-MIN_LOOKAHEAD, "need lookahead"); do { Assert(cur_match < s->strstart, "no future"); match = s->window + cur_match; /* Skip to next match if the match length cannot increase * or if the match length is less than 2: */ #if (defined(UNALIGNED_OK) && MAX_MATCH == 258) /* This code assumes sizeof(unsigned short) == 2. Do not use * UNALIGNED_OK if your compiler uses a different size. */ if (*(ush*)(match+best_len-1) != scan_end || *(ush*)match != scan_start) continue; /* It is not necessary to compare scan[2] and match[2] since they are * always equal when the other bytes match, given that the hash keys * are equal and that HASH_BITS >= 8. Compare 2 bytes at a time at * strstart+3, +5, ... up to strstart+257. We check for insufficient * lookahead only every 4th comparison; the 128th check will be made * at strstart+257. If MAX_MATCH-2 is not a multiple of 8, it is * necessary to put more guard bytes at the end of the window, or * to check more often for insufficient lookahead. */ Assert(scan[2] == match[2], "scan[2]?"); scan++, match++; do { } while (*(ush*)(scan+=2) == *(ush*)(match+=2) && *(ush*)(scan+=2) == *(ush*)(match+=2) && *(ush*)(scan+=2) == *(ush*)(match+=2) && *(ush*)(scan+=2) == *(ush*)(match+=2) && scan < strend); /* The funny "do {}" generates better code on most compilers */ /* Here, scan <= window+strstart+257 */ Assert(scan <= s->window+(unsigned)(s->window_size-1), "wild scan"); if (*scan == *match) scan++; len = (MAX_MATCH - 1) - (int)(strend-scan); scan = strend - (MAX_MATCH-1); #else /* UNALIGNED_OK */ if (match[best_len] != scan_end || match[best_len-1] != scan_end1 || *match != *scan || *++match != scan[1]) continue; /* The check at best_len-1 can be removed because it will be made * again later. (This heuristic is not always a win.) 
* It is not necessary to compare scan[2] and match[2] since they * are always equal when the other bytes match, given that * the hash keys are equal and that HASH_BITS >= 8. */ scan += 2, match++; Assert(*scan == *match, "match[2]?"); /* We check for insufficient lookahead only every 8th comparison; * the 256th check will be made at strstart+258. */ do { } while (*++scan == *++match && *++scan == *++match && *++scan == *++match && *++scan == *++match && *++scan == *++match && *++scan == *++match && *++scan == *++match && *++scan == *++match && scan < strend); Assert(scan <= s->window+(unsigned)(s->window_size-1), "wild scan"); len = MAX_MATCH - (int)(strend - scan); scan = strend - MAX_MATCH; #endif /* UNALIGNED_OK */ if (len > best_len) { s->match_start = cur_match; best_len = len; if (len >= nice_match) break; #ifdef UNALIGNED_OK scan_end = *(ush*)(scan+best_len-1); #else scan_end1 = scan[best_len-1]; scan_end = scan[best_len]; #endif } } while ((cur_match = prev[cur_match & wmask]) > limit && --chain_length != 0); if ((uInt)best_len <= s->lookahead) return best_len; return s->lookahead; } #ifdef DEBUG_ZLIB /* =========================================================================== * Check that the match at match_start is indeed a match. */ static void check_match( deflate_state *s, IPos start, IPos match, int length ) { /* check that the match is indeed a match */ if (memcmp((char *)s->window + match, (char *)s->window + start, length) != EQUAL) { fprintf(stderr, " start %u, match %u, length %d\n", start, match, length); do { fprintf(stderr, "%c%c", s->window[match++], s->window[start++]); } while (--length != 0); z_error("invalid match"); } if (z_verbose > 1) { fprintf(stderr,"\\[%d,%d]", start-match, length); do { putc(s->window[start++], stderr); } while (--length != 0); } } #else # define check_match(s, start, match, length) #endif /* =========================================================================== * Fill the window when the lookahead becomes insufficient. * Updates strstart and lookahead. * * IN assertion: lookahead < MIN_LOOKAHEAD * OUT assertions: strstart <= window_size-MIN_LOOKAHEAD * At least one byte has been read, or avail_in == 0; reads are * performed for at least two bytes (required for the zip translate_eol * option -- not supported here). */ static void fill_window( deflate_state *s ) { register unsigned n, m; register Pos *p; unsigned more; /* Amount of free space at the end of the window. */ uInt wsize = s->w_size; do { more = (unsigned)(s->window_size -(ulg)s->lookahead -(ulg)s->strstart); /* Deal with !@#$% 64K limit: */ if (more == 0 && s->strstart == 0 && s->lookahead == 0) { more = wsize; } else if (more == (unsigned)(-1)) { /* Very unlikely, but possible on 16 bit machine if strstart == 0 * and lookahead == 1 (input done one byte at time) */ more--; /* If the window is almost full and there is insufficient lookahead, * move the upper half to the lower one to make room in the upper half. */ } else if (s->strstart >= wsize+MAX_DIST(s)) { memcpy((char *)s->window, (char *)s->window+wsize, (unsigned)wsize); s->match_start -= wsize; s->strstart -= wsize; /* we now have strstart >= MAX_DIST */ s->block_start -= (long) wsize; /* Slide the hash table (could be avoided with 32 bit values at the expense of memory usage). We slide even when level == 0 to keep the hash table consistent if we switch back to level > 0 later. (Using level 0 permanently is not an optimal usage of zlib, so we don't care about this pathological case.) 
*/ n = s->hash_size; p = &s->head[n]; do { m = *--p; *p = (Pos)(m >= wsize ? m-wsize : NIL); } while (--n); n = wsize; p = &s->prev[n]; do { m = *--p; *p = (Pos)(m >= wsize ? m-wsize : NIL); /* If n is not on any hash chain, prev[n] is garbage but * its value will never be used. */ } while (--n); more += wsize; } if (s->strm->avail_in == 0) return; /* If there was no sliding: * strstart <= WSIZE+MAX_DIST-1 && lookahead <= MIN_LOOKAHEAD - 1 && * more == window_size - lookahead - strstart * => more >= window_size - (MIN_LOOKAHEAD-1 + WSIZE + MAX_DIST-1) * => more >= window_size - 2*WSIZE + 2 * In the BIG_MEM or MMAP case (not yet supported), * window_size == input_size + MIN_LOOKAHEAD && * strstart + s->lookahead <= input_size => more >= MIN_LOOKAHEAD. * Otherwise, window_size == 2*WSIZE so more >= 2. * If there was sliding, more >= WSIZE. So in all cases, more >= 2. */ Assert(more >= 2, "more < 2"); n = read_buf(s->strm, s->window + s->strstart + s->lookahead, more); s->lookahead += n; /* Initialize the hash value now that we have some input: */ if (s->lookahead >= MIN_MATCH) { s->ins_h = s->window[s->strstart]; UPDATE_HASH(s, s->ins_h, s->window[s->strstart+1]); #if MIN_MATCH != 3 Call UPDATE_HASH() MIN_MATCH-3 more times #endif } /* If the whole input has less than MIN_MATCH bytes, ins_h is garbage, * but this is not important since only literal bytes will be emitted. */ } while (s->lookahead < MIN_LOOKAHEAD && s->strm->avail_in != 0); } /* =========================================================================== * Flush the current block, with given end-of-file flag. * IN assertion: strstart is set to the end of the current match. */ #define FLUSH_BLOCK_ONLY(s, eof) { \ zlib_tr_flush_block(s, (s->block_start >= 0L ? \ (char *)&s->window[(unsigned)s->block_start] : \ NULL), \ (ulg)((long)s->strstart - s->block_start), \ (eof)); \ s->block_start = s->strstart; \ flush_pending(s->strm); \ Tracev((stderr,"[FLUSH]")); \ } /* Same but force premature exit if necessary. */ #define FLUSH_BLOCK(s, eof) { \ FLUSH_BLOCK_ONLY(s, eof); \ if (s->strm->avail_out == 0) return (eof) ? finish_started : need_more; \ } /* =========================================================================== * Copy without compression as much as possible from the input stream, return * the current block state. * This function does not insert new strings in the dictionary since * uncompressible data is probably not useful. This function is used * only for the level=0 compression option. * NOTE: this function should be optimized to avoid extra copying from * window to pending_buf. 
*/ static block_state deflate_stored( deflate_state *s, int flush ) { /* Stored blocks are limited to 0xffff bytes, pending_buf is limited * to pending_buf_size, and each stored block has a 5 byte header: */ ulg max_block_size = 0xffff; ulg max_start; if (max_block_size > s->pending_buf_size - 5) { max_block_size = s->pending_buf_size - 5; } /* Copy as much as possible from input to output: */ for (;;) { /* Fill the window as much as possible: */ if (s->lookahead <= 1) { Assert(s->strstart < s->w_size+MAX_DIST(s) || s->block_start >= (long)s->w_size, "slide too late"); fill_window(s); if (s->lookahead == 0 && flush == Z_NO_FLUSH) return need_more; if (s->lookahead == 0) break; /* flush the current block */ } Assert(s->block_start >= 0L, "block gone"); s->strstart += s->lookahead; s->lookahead = 0; /* Emit a stored block if pending_buf will be full: */ max_start = s->block_start + max_block_size; if (s->strstart == 0 || (ulg)s->strstart >= max_start) { /* strstart == 0 is possible when wraparound on 16-bit machine */ s->lookahead = (uInt)(s->strstart - max_start); s->strstart = (uInt)max_start; FLUSH_BLOCK(s, 0); } /* Flush if we may have to slide, otherwise block_start may become * negative and the data will be gone: */ if (s->strstart - (uInt)s->block_start >= MAX_DIST(s)) { FLUSH_BLOCK(s, 0); } } FLUSH_BLOCK(s, flush == Z_FINISH); return flush == Z_FINISH ? finish_done : block_done; } /* =========================================================================== * Compress as much as possible from the input stream, return the current * block state. * This function does not perform lazy evaluation of matches and inserts * new strings in the dictionary only for unmatched strings or for short * matches. It is used only for the fast compression options. */ static block_state deflate_fast( deflate_state *s, int flush ) { IPos hash_head = NIL; /* head of the hash chain */ int bflush; /* set if current block must be flushed */ for (;;) { /* Make sure that we always have enough lookahead, except * at the end of the input file. We need MAX_MATCH bytes * for the next match, plus MIN_MATCH bytes to insert the * string following the next match. */ if (s->lookahead < MIN_LOOKAHEAD) { fill_window(s); if (s->lookahead < MIN_LOOKAHEAD && flush == Z_NO_FLUSH) { return need_more; } if (s->lookahead == 0) break; /* flush the current block */ } /* Insert the string window[strstart .. strstart+2] in the * dictionary, and set hash_head to the head of the hash chain: */ if (s->lookahead >= MIN_MATCH) { INSERT_STRING(s, s->strstart, hash_head); } /* Find the longest match, discarding those <= prev_length. * At this point we have always match_length < MIN_MATCH */ if (hash_head != NIL && s->strstart - hash_head <= MAX_DIST(s)) { /* To simplify the code, we prevent matches with the string * of window index 0 (in particular we have to avoid a match * of the string with itself at the start of the input file). */ if (s->strategy != Z_HUFFMAN_ONLY) { s->match_length = longest_match (s, hash_head); } /* longest_match() sets match_start */ } if (s->match_length >= MIN_MATCH) { check_match(s, s->strstart, s->match_start, s->match_length); bflush = zlib_tr_tally(s, s->strstart - s->match_start, s->match_length - MIN_MATCH); s->lookahead -= s->match_length; /* Insert new strings in the hash table only if the match length * is not too large. This saves time but degrades compression. 
*/ if (s->match_length <= s->max_insert_length && s->lookahead >= MIN_MATCH) { s->match_length--; /* string at strstart already in hash table */ do { s->strstart++; INSERT_STRING(s, s->strstart, hash_head); /* strstart never exceeds WSIZE-MAX_MATCH, so there are * always MIN_MATCH bytes ahead. */ } while (--s->match_length != 0); s->strstart++; } else { s->strstart += s->match_length; s->match_length = 0; s->ins_h = s->window[s->strstart]; UPDATE_HASH(s, s->ins_h, s->window[s->strstart+1]); #if MIN_MATCH != 3 Call UPDATE_HASH() MIN_MATCH-3 more times #endif /* If lookahead < MIN_MATCH, ins_h is garbage, but it does not * matter since it will be recomputed at next deflate call. */ } } else { /* No match, output a literal byte */ Tracevv((stderr,"%c", s->window[s->strstart])); bflush = zlib_tr_tally (s, 0, s->window[s->strstart]); s->lookahead--; s->strstart++; } if (bflush) FLUSH_BLOCK(s, 0); } FLUSH_BLOCK(s, flush == Z_FINISH); return flush == Z_FINISH ? finish_done : block_done; } /* =========================================================================== * Same as above, but achieves better compression. We use a lazy * evaluation for matches: a match is finally adopted only if there is * no better match at the next window position. */ static block_state deflate_slow( deflate_state *s, int flush ) { IPos hash_head = NIL; /* head of hash chain */ int bflush; /* set if current block must be flushed */ /* Process the input block. */ for (;;) { /* Make sure that we always have enough lookahead, except * at the end of the input file. We need MAX_MATCH bytes * for the next match, plus MIN_MATCH bytes to insert the * string following the next match. */ if (s->lookahead < MIN_LOOKAHEAD) { fill_window(s); if (s->lookahead < MIN_LOOKAHEAD && flush == Z_NO_FLUSH) { return need_more; } if (s->lookahead == 0) break; /* flush the current block */ } /* Insert the string window[strstart .. strstart+2] in the * dictionary, and set hash_head to the head of the hash chain: */ if (s->lookahead >= MIN_MATCH) { INSERT_STRING(s, s->strstart, hash_head); } /* Find the longest match, discarding those <= prev_length. */ s->prev_length = s->match_length, s->prev_match = s->match_start; s->match_length = MIN_MATCH-1; if (hash_head != NIL && s->prev_length < s->max_lazy_match && s->strstart - hash_head <= MAX_DIST(s)) { /* To simplify the code, we prevent matches with the string * of window index 0 (in particular we have to avoid a match * of the string with itself at the start of the input file). */ if (s->strategy != Z_HUFFMAN_ONLY) { s->match_length = longest_match (s, hash_head); } /* longest_match() sets match_start */ if (s->match_length <= 5 && (s->strategy == Z_FILTERED || (s->match_length == MIN_MATCH && s->strstart - s->match_start > TOO_FAR))) { /* If prev_match is also MIN_MATCH, match_start is garbage * but we will ignore the current match anyway. */ s->match_length = MIN_MATCH-1; } } /* If there was a match at the previous step and the current * match is not better, output the previous match: */ if (s->prev_length >= MIN_MATCH && s->match_length <= s->prev_length) { uInt max_insert = s->strstart + s->lookahead - MIN_MATCH; /* Do not insert strings in hash table beyond this. */ check_match(s, s->strstart-1, s->prev_match, s->prev_length); bflush = zlib_tr_tally(s, s->strstart -1 - s->prev_match, s->prev_length - MIN_MATCH); /* Insert in hash table all strings up to the end of the match. * strstart-1 and strstart are already inserted. 
If there is not * enough lookahead, the last two strings are not inserted in * the hash table. */ s->lookahead -= s->prev_length-1; s->prev_length -= 2; do { if (++s->strstart <= max_insert) { INSERT_STRING(s, s->strstart, hash_head); } } while (--s->prev_length != 0); s->match_available = 0; s->match_length = MIN_MATCH-1; s->strstart++; if (bflush) FLUSH_BLOCK(s, 0); } else if (s->match_available) { /* If there was no match at the previous position, output a * single literal. If there was a match but the current match * is longer, truncate the previous match to a single literal. */ Tracevv((stderr,"%c", s->window[s->strstart-1])); if (zlib_tr_tally (s, 0, s->window[s->strstart-1])) { FLUSH_BLOCK_ONLY(s, 0); } s->strstart++; s->lookahead--; if (s->strm->avail_out == 0) return need_more; } else { /* There is no previous match to compare with, wait for * the next step to decide. */ s->match_available = 1; s->strstart++; s->lookahead--; } } Assert (flush != Z_NO_FLUSH, "no flush?"); if (s->match_available) { Tracevv((stderr,"%c", s->window[s->strstart-1])); zlib_tr_tally (s, 0, s->window[s->strstart-1]); s->match_available = 0; } FLUSH_BLOCK(s, flush == Z_FINISH); return flush == Z_FINISH ? finish_done : block_done; } int zlib_deflate_workspacesize(int windowBits, int memLevel) { if (windowBits < 0) /* undocumented feature: suppress zlib header */ windowBits = -windowBits; /* Since the return value is typically passed to vmalloc() unchecked... */ BUG_ON(memLevel < 1 || memLevel > MAX_MEM_LEVEL || windowBits < 9 || windowBits > 15); return sizeof(deflate_workspace) + zlib_deflate_window_memsize(windowBits) + zlib_deflate_prev_memsize(windowBits) + zlib_deflate_head_memsize(memLevel) + zlib_deflate_overlay_memsize(memLevel); } int zlib_deflate_dfltcc_enabled(void) { return DEFLATE_DFLTCC_ENABLED(); }
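The ALGORITHM comment at the top of deflate.c hinges on the UPDATE_HASH()/INSERT_STRING() bookkeeping: a rolling hash over MIN_MATCH-byte strings, a head[] table holding the most recent position for each hash key, and a prev[] table chaining older positions with the same key. The following standalone sketch (illustrative only; the toy table sizes and names are not the kernel's) shows that mechanism in isolation. Note how NIL doubles as both "empty chain" and "window index 0", which is why deflate never matches against position 0.

#include <stdio.h>

#define MIN_MATCH   3
#define HASH_BITS   8
#define HASH_SIZE   (1 << HASH_BITS)
#define HASH_MASK   (HASH_SIZE - 1)
#define HASH_SHIFT  ((HASH_BITS + MIN_MATCH - 1) / MIN_MATCH)
#define WSIZE       256                 /* toy window, power of two */
#define WMASK       (WSIZE - 1)
#define NIL         0                   /* end-of-chain marker */

static unsigned head[HASH_SIZE];        /* most recent position per hash key */
static unsigned prev_tab[WSIZE];        /* older position with the same key */

/* Rolling update: fold one new byte into the running hash key. */
static unsigned update_hash(unsigned h, unsigned char c)
{
	return ((h << HASH_SHIFT) ^ c) & HASH_MASK;
}

/* Insert position str; return the previous chain head (match candidate). */
static unsigned insert_string(const unsigned char *win, unsigned str,
			      unsigned *ins_h)
{
	unsigned match_head;

	*ins_h = update_hash(*ins_h, win[str + MIN_MATCH - 1]);
	match_head = head[*ins_h];
	prev_tab[str & WMASK] = match_head;
	head[*ins_h] = str;
	return match_head;
}

int main(void)
{
	const unsigned char win[] = "abcabcabx";
	unsigned ins_h, pos;

	/* Prime the hash with the first two bytes, as fill_window() does;
	 * each insert_string() then folds in the third byte of the string. */
	ins_h = win[0];
	ins_h = update_hash(ins_h, win[1]);

	for (pos = 0; pos + MIN_MATCH <= sizeof(win) - 1; pos++) {
		unsigned cand = insert_string(win, pos, &ins_h);

		printf("pos %u (\"%.3s\"): previous head %u%s\n",
		       pos, (const char *)&win[pos], cand,
		       cand == NIL ? " (none, or window start)" : "");
	}
	return 0;
}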
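For context, a hedged sketch of how a kernel caller might drive the workspace-based API above (zlib_deflate_workspacesize(), zlib_deflateInit2(), zlib_deflate(), zlib_deflateEnd()). The helper name, buffer handling and single-shot Z_FINISH usage are assumptions of the example, not part of deflate.c.

#include <linux/zlib.h>
#include <linux/vmalloc.h>
#include <linux/types.h>
#include <linux/string.h>
#include <linux/errno.h>

/* Hypothetical helper: compress src into dst in one Z_FINISH call. */
static int example_deflate_buffer(const u8 *src, unsigned int src_len,
				  u8 *dst, unsigned int *dst_len)
{
	z_stream strm;
	int ret = -EINVAL;

	memset(&strm, 0, sizeof(strm));

	/* The caller owns the workspace; size it for the chosen parameters. */
	strm.workspace = vzalloc(zlib_deflate_workspacesize(MAX_WBITS,
							    MAX_MEM_LEVEL));
	if (!strm.workspace)
		return -ENOMEM;

	if (zlib_deflateInit2(&strm, Z_DEFAULT_COMPRESSION, Z_DEFLATED,
			      MAX_WBITS, MAX_MEM_LEVEL,
			      Z_DEFAULT_STRATEGY) != Z_OK)
		goto out;

	strm.next_in = src;
	strm.avail_in = src_len;
	strm.next_out = dst;
	strm.avail_out = *dst_len;

	/* Single-shot compression: assumes dst is large enough for the whole
	 * deflate stream, so Z_FINISH completes in one call. */
	if (zlib_deflate(&strm, Z_FINISH) == Z_STREAM_END) {
		*dst_len = strm.total_out;
		ret = 0;
	}
	zlib_deflateEnd(&strm);
out:
	vfree(strm.workspace);
	return ret;
}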
// SPDX-License-Identifier: GPL-2.0-or-later
/* DataCenter TCP (DCTCP) congestion control.
 *
 * http://simula.stanford.edu/~alizade/Site/DCTCP.html
 *
 * This is an implementation of DCTCP over Reno, an enhancement to the
 * TCP congestion control algorithm designed for data centers. DCTCP
 * leverages Explicit Congestion Notification (ECN) in the network to
 * provide multi-bit feedback to the end hosts. DCTCP's goal is to meet
 * the following three data center transport requirements:
 *
 *  - High burst tolerance (incast due to partition/aggregate)
 *  - Low latency (short flows, queries)
 *  - High throughput (continuous data updates, large file transfers)
 *    with commodity shallow buffered switches
 *
 * The algorithm is described in detail in the following two papers:
 *
 * 1) Mohammad Alizadeh, Albert Greenberg, David A. Maltz, Jitendra Padhye,
 *    Parveen Patel, Balaji Prabhakar, Sudipta Sengupta, and Murari Sridharan:
 *    "Data Center TCP (DCTCP)", Data Center Networks session
 *    Proc. ACM SIGCOMM, New Delhi, 2010.
 *    http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf
 *
 * 2) Mohammad Alizadeh, Adel Javanmard, and Balaji Prabhakar:
 *    "Analysis of DCTCP: Stability, Convergence, and Fairness"
 *    Proc. ACM SIGMETRICS, San Jose, 2011.
 *    http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp_analysis-full.pdf
 *
 * Initial prototype from Abdul Kabbani, Masato Yasuda and Mohammad Alizadeh.
* * Authors: * * Daniel Borkmann <dborkman@redhat.com> * Florian Westphal <fw@strlen.de> * Glenn Judd <glenn.judd@morganstanley.com> */ #include <linux/btf.h> #include <linux/btf_ids.h> #include <linux/module.h> #include <linux/mm.h> #include <net/tcp.h> #include <linux/inet_diag.h> #include "tcp_dctcp.h" #define DCTCP_MAX_ALPHA 1024U struct dctcp { u32 old_delivered; u32 old_delivered_ce; u32 prior_rcv_nxt; u32 dctcp_alpha; u32 next_seq; u32 ce_state; u32 loss_cwnd; struct tcp_plb_state plb; }; static unsigned int dctcp_shift_g __read_mostly = 4; /* g = 1/2^4 */ static int dctcp_shift_g_set(const char *val, const struct kernel_param *kp) { return param_set_uint_minmax(val, kp, 0, 10); } static const struct kernel_param_ops dctcp_shift_g_ops = { .set = dctcp_shift_g_set, .get = param_get_uint, }; module_param_cb(dctcp_shift_g, &dctcp_shift_g_ops, &dctcp_shift_g, 0644); MODULE_PARM_DESC(dctcp_shift_g, "parameter g for updating dctcp_alpha"); static unsigned int dctcp_alpha_on_init __read_mostly = DCTCP_MAX_ALPHA; module_param(dctcp_alpha_on_init, uint, 0644); MODULE_PARM_DESC(dctcp_alpha_on_init, "parameter for initial alpha value"); static struct tcp_congestion_ops dctcp_reno; static void dctcp_reset(const struct tcp_sock *tp, struct dctcp *ca) { ca->next_seq = tp->snd_nxt; ca->old_delivered = tp->delivered; ca->old_delivered_ce = tp->delivered_ce; } __bpf_kfunc static void dctcp_init(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); if ((tp->ecn_flags & TCP_ECN_OK) || (sk->sk_state == TCP_LISTEN || sk->sk_state == TCP_CLOSE)) { struct dctcp *ca = inet_csk_ca(sk); ca->prior_rcv_nxt = tp->rcv_nxt; ca->dctcp_alpha = min(dctcp_alpha_on_init, DCTCP_MAX_ALPHA); ca->loss_cwnd = 0; ca->ce_state = 0; dctcp_reset(tp, ca); tcp_plb_init(sk, &ca->plb); return; } /* No ECN support? Fall back to Reno. Also need to clear * ECT from sk since it is set during 3WHS for DCTCP. */ inet_csk(sk)->icsk_ca_ops = &dctcp_reno; INET_ECN_dontxmit(sk); } __bpf_kfunc static u32 dctcp_ssthresh(struct sock *sk) { struct dctcp *ca = inet_csk_ca(sk); struct tcp_sock *tp = tcp_sk(sk); ca->loss_cwnd = tcp_snd_cwnd(tp); return max(tcp_snd_cwnd(tp) - ((tcp_snd_cwnd(tp) * ca->dctcp_alpha) >> 11U), 2U); } __bpf_kfunc static void dctcp_update_alpha(struct sock *sk, u32 flags) { const struct tcp_sock *tp = tcp_sk(sk); struct dctcp *ca = inet_csk_ca(sk); /* Expired RTT */ if (!before(tp->snd_una, ca->next_seq)) { u32 delivered = tp->delivered - ca->old_delivered; u32 delivered_ce = tp->delivered_ce - ca->old_delivered_ce; u32 alpha = ca->dctcp_alpha; u32 ce_ratio = 0; if (delivered > 0) { /* dctcp_alpha keeps EWMA of fraction of ECN marked * packets. Because of EWMA smoothing, PLB reaction can * be slow so we use ce_ratio which is an instantaneous * measure of congestion. ce_ratio is the fraction of * ECN marked packets in the previous RTT. */ if (delivered_ce > 0) ce_ratio = (delivered_ce << TCP_PLB_SCALE) / delivered; tcp_plb_update_state(sk, &ca->plb, (int)ce_ratio); tcp_plb_check_rehash(sk, &ca->plb); } /* alpha = (1 - g) * alpha + g * F */ alpha -= min_not_zero(alpha, alpha >> dctcp_shift_g); if (delivered_ce) { /* If dctcp_shift_g == 1, a 32bit value would overflow * after 8 M packets. */ delivered_ce <<= (10 - dctcp_shift_g); delivered_ce /= max(1U, delivered); alpha = min(alpha + delivered_ce, DCTCP_MAX_ALPHA); } /* dctcp_alpha can be read from dctcp_get_info() without * synchro, so we ask compiler to not use dctcp_alpha * as a temporary variable in prior operations. 
*/ WRITE_ONCE(ca->dctcp_alpha, alpha); dctcp_reset(tp, ca); } } static void dctcp_react_to_loss(struct sock *sk) { struct dctcp *ca = inet_csk_ca(sk); struct tcp_sock *tp = tcp_sk(sk); ca->loss_cwnd = tcp_snd_cwnd(tp); tp->snd_ssthresh = max(tcp_snd_cwnd(tp) >> 1U, 2U); } __bpf_kfunc static void dctcp_state(struct sock *sk, u8 new_state) { if (new_state == TCP_CA_Recovery && new_state != inet_csk(sk)->icsk_ca_state) dctcp_react_to_loss(sk); /* We handle RTO in dctcp_cwnd_event to ensure that we perform only * one loss-adjustment per RTT. */ } __bpf_kfunc static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev) { struct dctcp *ca = inet_csk_ca(sk); switch (ev) { case CA_EVENT_ECN_IS_CE: case CA_EVENT_ECN_NO_CE: dctcp_ece_ack_update(sk, ev, &ca->prior_rcv_nxt, &ca->ce_state); break; case CA_EVENT_LOSS: tcp_plb_update_state_upon_rto(sk, &ca->plb); dctcp_react_to_loss(sk); break; case CA_EVENT_TX_START: tcp_plb_check_rehash(sk, &ca->plb); /* Maybe rehash when inflight is 0 */ break; default: /* Don't care for the rest. */ break; } } static size_t dctcp_get_info(struct sock *sk, u32 ext, int *attr, union tcp_cc_info *info) { const struct dctcp *ca = inet_csk_ca(sk); const struct tcp_sock *tp = tcp_sk(sk); /* Fill it also in case of VEGASINFO due to req struct limits. * We can still correctly retrieve it later. */ if (ext & (1 << (INET_DIAG_DCTCPINFO - 1)) || ext & (1 << (INET_DIAG_VEGASINFO - 1))) { memset(&info->dctcp, 0, sizeof(info->dctcp)); if (inet_csk(sk)->icsk_ca_ops != &dctcp_reno) { info->dctcp.dctcp_enabled = 1; info->dctcp.dctcp_ce_state = (u16) ca->ce_state; info->dctcp.dctcp_alpha = ca->dctcp_alpha; info->dctcp.dctcp_ab_ecn = tp->mss_cache * (tp->delivered_ce - ca->old_delivered_ce); info->dctcp.dctcp_ab_tot = tp->mss_cache * (tp->delivered - ca->old_delivered); } *attr = INET_DIAG_DCTCPINFO; return sizeof(info->dctcp); } return 0; } __bpf_kfunc static u32 dctcp_cwnd_undo(struct sock *sk) { const struct dctcp *ca = inet_csk_ca(sk); struct tcp_sock *tp = tcp_sk(sk); return max(tcp_snd_cwnd(tp), ca->loss_cwnd); } static struct tcp_congestion_ops dctcp __read_mostly = { .init = dctcp_init, .in_ack_event = dctcp_update_alpha, .cwnd_event = dctcp_cwnd_event, .ssthresh = dctcp_ssthresh, .cong_avoid = tcp_reno_cong_avoid, .undo_cwnd = dctcp_cwnd_undo, .set_state = dctcp_state, .get_info = dctcp_get_info, .flags = TCP_CONG_NEEDS_ECN, .owner = THIS_MODULE, .name = "dctcp", }; static struct tcp_congestion_ops dctcp_reno __read_mostly = { .ssthresh = tcp_reno_ssthresh, .cong_avoid = tcp_reno_cong_avoid, .undo_cwnd = tcp_reno_undo_cwnd, .get_info = dctcp_get_info, .owner = THIS_MODULE, .name = "dctcp-reno", }; BTF_KFUNCS_START(tcp_dctcp_check_kfunc_ids) BTF_ID_FLAGS(func, dctcp_init) BTF_ID_FLAGS(func, dctcp_update_alpha) BTF_ID_FLAGS(func, dctcp_cwnd_event) BTF_ID_FLAGS(func, dctcp_ssthresh) BTF_ID_FLAGS(func, dctcp_cwnd_undo) BTF_ID_FLAGS(func, dctcp_state) BTF_KFUNCS_END(tcp_dctcp_check_kfunc_ids) static const struct btf_kfunc_id_set tcp_dctcp_kfunc_set = { .owner = THIS_MODULE, .set = &tcp_dctcp_check_kfunc_ids, }; static int __init dctcp_register(void) { int ret; BUILD_BUG_ON(sizeof(struct dctcp) > ICSK_CA_PRIV_SIZE); ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &tcp_dctcp_kfunc_set); if (ret < 0) return ret; return tcp_register_congestion_control(&dctcp); } static void __exit dctcp_unregister(void) { tcp_unregister_congestion_control(&dctcp); } module_init(dctcp_register); module_exit(dctcp_unregister); MODULE_AUTHOR("Daniel Borkmann <dborkman@redhat.com>"); 
MODULE_AUTHOR("Florian Westphal <fw@strlen.de>"); MODULE_AUTHOR("Glenn Judd <glenn.judd@morganstanley.com>"); MODULE_LICENSE("GPL v2"); MODULE_DESCRIPTION("DataCenter TCP (DCTCP)");
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/mm/mempool.c
 *
 *  memory buffer pool support. Such pools are mostly used
 *  for guaranteed, deadlock-free memory allocations during
 *  extreme VM load.
 *
 *  started by Ingo Molnar, Copyright (C) 2001
 *  debugging by David Rientjes, Copyright (C) 2015
 */

#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/kasan.h>
#include <linux/kmemleak.h>
#include <linux/export.h>
#include <linux/mempool.h>
#include <linux/writeback.h>
#include "slab.h"

#ifdef CONFIG_SLUB_DEBUG_ON
static void poison_error(mempool_t *pool, void *element, size_t size,
			 size_t byte)
{
	const int nr = pool->curr_nr;
	const int start = max_t(int, byte - (BITS_PER_LONG / 8), 0);
	const int end = min_t(int, byte + (BITS_PER_LONG / 8), size);
	int i;

	pr_err("BUG: mempool element poison mismatch\n");
	pr_err("Mempool %p size %zu\n", pool, size);
	pr_err(" nr=%d @ %p: %s0x", nr, element, start > 0 ? "... " : "");
	for (i = start; i < end; i++)
		pr_cont("%x ", *(u8 *)(element + i));
	pr_cont("%s\n", end < size ? "..."
: ""); dump_stack(); } static void __check_element(mempool_t *pool, void *element, size_t size) { u8 *obj = element; size_t i; for (i = 0; i < size; i++) { u8 exp = (i < size - 1) ? POISON_FREE : POISON_END; if (obj[i] != exp) { poison_error(pool, element, size, i); return; } } memset(obj, POISON_INUSE, size); } static void check_element(mempool_t *pool, void *element) { /* Skip checking: KASAN might save its metadata in the element. */ if (kasan_enabled()) return; /* Mempools backed by slab allocator */ if (pool->free == mempool_kfree) { __check_element(pool, element, (size_t)pool->pool_data); } else if (pool->free == mempool_free_slab) { __check_element(pool, element, kmem_cache_size(pool->pool_data)); } else if (pool->free == mempool_free_pages) { /* Mempools backed by page allocator */ int order = (int)(long)pool->pool_data; void *addr = kmap_local_page((struct page *)element); __check_element(pool, addr, 1UL << (PAGE_SHIFT + order)); kunmap_local(addr); } } static void __poison_element(void *element, size_t size) { u8 *obj = element; memset(obj, POISON_FREE, size - 1); obj[size - 1] = POISON_END; } static void poison_element(mempool_t *pool, void *element) { /* Skip poisoning: KASAN might save its metadata in the element. */ if (kasan_enabled()) return; /* Mempools backed by slab allocator */ if (pool->alloc == mempool_kmalloc) { __poison_element(element, (size_t)pool->pool_data); } else if (pool->alloc == mempool_alloc_slab) { __poison_element(element, kmem_cache_size(pool->pool_data)); } else if (pool->alloc == mempool_alloc_pages) { /* Mempools backed by page allocator */ int order = (int)(long)pool->pool_data; void *addr = kmap_local_page((struct page *)element); __poison_element(addr, 1UL << (PAGE_SHIFT + order)); kunmap_local(addr); } } #else /* CONFIG_SLUB_DEBUG_ON */ static inline void check_element(mempool_t *pool, void *element) { } static inline void poison_element(mempool_t *pool, void *element) { } #endif /* CONFIG_SLUB_DEBUG_ON */ static __always_inline bool kasan_poison_element(mempool_t *pool, void *element) { if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc) return kasan_mempool_poison_object(element); else if (pool->alloc == mempool_alloc_pages) return kasan_mempool_poison_pages(element, (unsigned long)pool->pool_data); return true; } static void kasan_unpoison_element(mempool_t *pool, void *element) { if (pool->alloc == mempool_kmalloc) kasan_mempool_unpoison_object(element, (size_t)pool->pool_data); else if (pool->alloc == mempool_alloc_slab) kasan_mempool_unpoison_object(element, kmem_cache_size(pool->pool_data)); else if (pool->alloc == mempool_alloc_pages) kasan_mempool_unpoison_pages(element, (unsigned long)pool->pool_data); } static __always_inline void add_element(mempool_t *pool, void *element) { BUG_ON(pool->curr_nr >= pool->min_nr); poison_element(pool, element); if (kasan_poison_element(pool, element)) pool->elements[pool->curr_nr++] = element; } static void *remove_element(mempool_t *pool) { void *element = pool->elements[--pool->curr_nr]; BUG_ON(pool->curr_nr < 0); kasan_unpoison_element(pool, element); check_element(pool, element); return element; } /** * mempool_exit - exit a mempool initialized with mempool_init() * @pool: pointer to the memory pool which was initialized with * mempool_init(). * * Free all reserved elements in @pool and @pool itself. This function * only sleeps if the free_fn() function sleeps. * * May be called on a zeroed but uninitialized mempool (i.e. allocated with * kzalloc()). 
*/ void mempool_exit(mempool_t *pool) { while (pool->curr_nr) { void *element = remove_element(pool); pool->free(element, pool->pool_data); } kfree(pool->elements); pool->elements = NULL; } EXPORT_SYMBOL(mempool_exit); /** * mempool_destroy - deallocate a memory pool * @pool: pointer to the memory pool which was allocated via * mempool_create(). * * Free all reserved elements in @pool and @pool itself. This function * only sleeps if the free_fn() function sleeps. */ void mempool_destroy(mempool_t *pool) { if (unlikely(!pool)) return; mempool_exit(pool); kfree(pool); } EXPORT_SYMBOL(mempool_destroy); int mempool_init_node(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, void *pool_data, gfp_t gfp_mask, int node_id) { spin_lock_init(&pool->lock); pool->min_nr = min_nr; pool->pool_data = pool_data; pool->alloc = alloc_fn; pool->free = free_fn; init_waitqueue_head(&pool->wait); pool->elements = kmalloc_array_node(min_nr, sizeof(void *), gfp_mask, node_id); if (!pool->elements) return -ENOMEM; /* * First pre-allocate the guaranteed number of buffers. */ while (pool->curr_nr < pool->min_nr) { void *element; element = pool->alloc(gfp_mask, pool->pool_data); if (unlikely(!element)) { mempool_exit(pool); return -ENOMEM; } add_element(pool, element); } return 0; } EXPORT_SYMBOL(mempool_init_node); /** * mempool_init - initialize a memory pool * @pool: pointer to the memory pool that should be initialized * @min_nr: the minimum number of elements guaranteed to be * allocated for this pool. * @alloc_fn: user-defined element-allocation function. * @free_fn: user-defined element-freeing function. * @pool_data: optional private data available to the user-defined functions. * * Like mempool_create(), but initializes the pool in (i.e. embedded in another * structure). * * Return: %0 on success, negative error code otherwise. */ int mempool_init_noprof(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, void *pool_data) { return mempool_init_node(pool, min_nr, alloc_fn, free_fn, pool_data, GFP_KERNEL, NUMA_NO_NODE); } EXPORT_SYMBOL(mempool_init_noprof); /** * mempool_create_node - create a memory pool * @min_nr: the minimum number of elements guaranteed to be * allocated for this pool. * @alloc_fn: user-defined element-allocation function. * @free_fn: user-defined element-freeing function. * @pool_data: optional private data available to the user-defined functions. * @gfp_mask: memory allocation flags * @node_id: numa node to allocate on * * this function creates and allocates a guaranteed size, preallocated * memory pool. The pool can be used from the mempool_alloc() and mempool_free() * functions. This function might sleep. Both the alloc_fn() and the free_fn() * functions might sleep - as long as the mempool_alloc() function is not called * from IRQ contexts. * * Return: pointer to the created memory pool object or %NULL on error. */ mempool_t *mempool_create_node_noprof(int min_nr, mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, void *pool_data, gfp_t gfp_mask, int node_id) { mempool_t *pool; pool = kmalloc_node_noprof(sizeof(*pool), gfp_mask | __GFP_ZERO, node_id); if (!pool) return NULL; if (mempool_init_node(pool, min_nr, alloc_fn, free_fn, pool_data, gfp_mask, node_id)) { kfree(pool); return NULL; } return pool; } EXPORT_SYMBOL(mempool_create_node_noprof); /** * mempool_resize - resize an existing memory pool * @pool: pointer to the memory pool which was allocated via * mempool_create(). 
* @new_min_nr: the new minimum number of elements guaranteed to be * allocated for this pool. * * This function shrinks/grows the pool. In the case of growing, * it cannot be guaranteed that the pool will be grown to the new * size immediately, but new mempool_free() calls will refill it. * This function may sleep. * * Note, the caller must guarantee that no mempool_destroy is called * while this function is running. mempool_alloc() & mempool_free() * might be called (eg. from IRQ contexts) while this function executes. * * Return: %0 on success, negative error code otherwise. */ int mempool_resize(mempool_t *pool, int new_min_nr) { void *element; void **new_elements; unsigned long flags; BUG_ON(new_min_nr <= 0); might_sleep(); spin_lock_irqsave(&pool->lock, flags); if (new_min_nr <= pool->min_nr) { while (new_min_nr < pool->curr_nr) { element = remove_element(pool); spin_unlock_irqrestore(&pool->lock, flags); pool->free(element, pool->pool_data); spin_lock_irqsave(&pool->lock, flags); } pool->min_nr = new_min_nr; goto out_unlock; } spin_unlock_irqrestore(&pool->lock, flags); /* Grow the pool */ new_elements = kmalloc_array(new_min_nr, sizeof(*new_elements), GFP_KERNEL); if (!new_elements) return -ENOMEM; spin_lock_irqsave(&pool->lock, flags); if (unlikely(new_min_nr <= pool->min_nr)) { /* Raced, other resize will do our work */ spin_unlock_irqrestore(&pool->lock, flags); kfree(new_elements); goto out; } memcpy(new_elements, pool->elements, pool->curr_nr * sizeof(*new_elements)); kfree(pool->elements); pool->elements = new_elements; pool->min_nr = new_min_nr; while (pool->curr_nr < pool->min_nr) { spin_unlock_irqrestore(&pool->lock, flags); element = pool->alloc(GFP_KERNEL, pool->pool_data); if (!element) goto out; spin_lock_irqsave(&pool->lock, flags); if (pool->curr_nr < pool->min_nr) { add_element(pool, element); } else { spin_unlock_irqrestore(&pool->lock, flags); pool->free(element, pool->pool_data); /* Raced */ goto out; } } out_unlock: spin_unlock_irqrestore(&pool->lock, flags); out: return 0; } EXPORT_SYMBOL(mempool_resize); /** * mempool_alloc - allocate an element from a specific memory pool * @pool: pointer to the memory pool which was allocated via * mempool_create(). * @gfp_mask: the usual allocation bitmask. * * this function only sleeps if the alloc_fn() function sleeps or * returns NULL. Note that due to preallocation, this function * *never* fails when called from process contexts. (it might * fail if called from an IRQ context.) * Note: using __GFP_ZERO is not supported. * * Return: pointer to the allocated element or %NULL on error. */ void *mempool_alloc_noprof(mempool_t *pool, gfp_t gfp_mask) { void *element; unsigned long flags; wait_queue_entry_t wait; gfp_t gfp_temp; VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO); might_alloc(gfp_mask); gfp_mask |= __GFP_NOMEMALLOC; /* don't allocate emergency reserves */ gfp_mask |= __GFP_NORETRY; /* don't loop in __alloc_pages */ gfp_mask |= __GFP_NOWARN; /* failures are OK */ gfp_temp = gfp_mask & ~(__GFP_DIRECT_RECLAIM|__GFP_IO); repeat_alloc: element = pool->alloc(gfp_temp, pool->pool_data); if (likely(element != NULL)) return element; spin_lock_irqsave(&pool->lock, flags); if (likely(pool->curr_nr)) { element = remove_element(pool); spin_unlock_irqrestore(&pool->lock, flags); /* paired with rmb in mempool_free(), read comment there */ smp_wmb(); /* * Update the allocation stack trace as this is more useful * for debugging. 
*/ kmemleak_update_trace(element); return element; } /* * We use gfp mask w/o direct reclaim or IO for the first round. If * alloc failed with that and @pool was empty, retry immediately. */ if (gfp_temp != gfp_mask) { spin_unlock_irqrestore(&pool->lock, flags); gfp_temp = gfp_mask; goto repeat_alloc; } /* We must not sleep if !__GFP_DIRECT_RECLAIM */ if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) { spin_unlock_irqrestore(&pool->lock, flags); return NULL; } /* Let's wait for someone else to return an element to @pool */ init_wait(&wait); prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE); spin_unlock_irqrestore(&pool->lock, flags); /* * FIXME: this should be io_schedule(). The timeout is there as a * workaround for some DM problems in 2.6.18. */ io_schedule_timeout(5*HZ); finish_wait(&pool->wait, &wait); goto repeat_alloc; } EXPORT_SYMBOL(mempool_alloc_noprof); /** * mempool_alloc_preallocated - allocate an element from preallocated elements * belonging to a specific memory pool * @pool: pointer to the memory pool which was allocated via * mempool_create(). * * This function is similar to mempool_alloc, but it only attempts allocating * an element from the preallocated elements. It does not sleep and immediately * returns if no preallocated elements are available. * * Return: pointer to the allocated element or %NULL if no elements are * available. */ void *mempool_alloc_preallocated(mempool_t *pool) { void *element; unsigned long flags; spin_lock_irqsave(&pool->lock, flags); if (likely(pool->curr_nr)) { element = remove_element(pool); spin_unlock_irqrestore(&pool->lock, flags); /* paired with rmb in mempool_free(), read comment there */ smp_wmb(); /* * Update the allocation stack trace as this is more useful * for debugging. */ kmemleak_update_trace(element); return element; } spin_unlock_irqrestore(&pool->lock, flags); return NULL; } EXPORT_SYMBOL(mempool_alloc_preallocated); /** * mempool_free - return an element to the pool. * @element: pool element pointer. * @pool: pointer to the memory pool which was allocated via * mempool_create(). * * this function only sleeps if the free_fn() function sleeps. */ void mempool_free(void *element, mempool_t *pool) { unsigned long flags; if (unlikely(element == NULL)) return; /* * Paired with the wmb in mempool_alloc(). The preceding read is * for @element and the following @pool->curr_nr. This ensures * that the visible value of @pool->curr_nr is from after the * allocation of @element. This is necessary for fringe cases * where @element was passed to this task without going through * barriers. * * For example, assume @p is %NULL at the beginning and one task * performs "p = mempool_alloc(...);" while another task is doing * "while (!p) cpu_relax(); mempool_free(p, ...);". This function * may end up using curr_nr value which is from before allocation * of @p without the following rmb. */ smp_rmb(); /* * For correctness, we need a test which is guaranteed to trigger * if curr_nr + #allocated == min_nr. Testing curr_nr < min_nr * without locking achieves that and refilling as soon as possible * is desirable. * * Because curr_nr visible here is always a value after the * allocation of @element, any task which decremented curr_nr below * min_nr is guaranteed to see curr_nr < min_nr unless curr_nr gets * incremented to min_nr afterwards. If curr_nr gets incremented * to min_nr after the allocation of @element, the elements * allocated after that are subject to the same guarantee. 
* * Waiters happen iff curr_nr is 0 and the above guarantee also * ensures that there will be frees which return elements to the * pool waking up the waiters. */ if (unlikely(READ_ONCE(pool->curr_nr) < pool->min_nr)) { spin_lock_irqsave(&pool->lock, flags); if (likely(pool->curr_nr < pool->min_nr)) { add_element(pool, element); spin_unlock_irqrestore(&pool->lock, flags); wake_up(&pool->wait); return; } spin_unlock_irqrestore(&pool->lock, flags); } pool->free(element, pool->pool_data); } EXPORT_SYMBOL(mempool_free); /* * A commonly used alloc and free fn. */ void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data) { struct kmem_cache *mem = pool_data; VM_BUG_ON(mem->ctor); return kmem_cache_alloc_noprof(mem, gfp_mask); } EXPORT_SYMBOL(mempool_alloc_slab); void mempool_free_slab(void *element, void *pool_data) { struct kmem_cache *mem = pool_data; kmem_cache_free(mem, element); } EXPORT_SYMBOL(mempool_free_slab); /* * A commonly used alloc and free fn that kmalloc/kfrees the amount of memory * specified by pool_data */ void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data) { size_t size = (size_t)pool_data; return kmalloc_noprof(size, gfp_mask); } EXPORT_SYMBOL(mempool_kmalloc); void mempool_kfree(void *element, void *pool_data) { kfree(element); } EXPORT_SYMBOL(mempool_kfree); void *mempool_kvmalloc(gfp_t gfp_mask, void *pool_data) { size_t size = (size_t)pool_data; return kvmalloc(size, gfp_mask); } EXPORT_SYMBOL(mempool_kvmalloc); void mempool_kvfree(void *element, void *pool_data) { kvfree(element); } EXPORT_SYMBOL(mempool_kvfree); /* * A simple mempool-backed page allocator that allocates pages * of the order specified by pool_data. */ void *mempool_alloc_pages(gfp_t gfp_mask, void *pool_data) { int order = (int)(long)pool_data; return alloc_pages_noprof(gfp_mask, order); } EXPORT_SYMBOL(mempool_alloc_pages); void mempool_free_pages(void *element, void *pool_data) { int order = (int)(long)pool_data; __free_pages(element, order); } EXPORT_SYMBOL(mempool_free_pages);
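Example (not part of the kernel sources above): a minimal usage sketch of the mempool API just listed, written as a hypothetical module that keeps a small reserve of fixed-size buffers so the I/O path can make forward progress under memory pressure. The demo_* names, the buffer size and the pool depth are invented for illustration.

/*
 * Hypothetical usage sketch of the mempool API above. All demo_* names,
 * the element size and the pool depth are made up for illustration.
 */
#include <linux/mempool.h>
#include <linux/module.h>

#define DEMO_BUF_SIZE	256	/* passed to mempool_kmalloc() via pool_data */
#define DEMO_POOL_MIN	16	/* min_nr: elements preallocated up front */

static mempool_t *demo_pool;

static void demo_io_path(void)
{
	/*
	 * GFP_NOIO includes __GFP_DIRECT_RECLAIM, so this will not fail from
	 * process context: if the backing kmalloc fails and the reserve is
	 * empty, the caller sleeps until someone calls mempool_free().
	 */
	void *buf = mempool_alloc(demo_pool, GFP_NOIO);

	/* ... use buf for a single I/O ... */

	mempool_free(buf, demo_pool);	/* refills the reserve first if it is low */
}

static int __init demo_init(void)
{
	/* mempool_kmalloc()/mempool_kfree() treat pool_data as the element size. */
	demo_pool = mempool_create(DEMO_POOL_MIN, mempool_kmalloc,
				   mempool_kfree,
				   (void *)(unsigned long)DEMO_BUF_SIZE);
	if (!demo_pool)
		return -ENOMEM;
	demo_io_path();
	return 0;
}

static void __exit demo_exit(void)
{
	mempool_destroy(demo_pool);	/* frees the reserved elements and the pool */
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");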
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PAGE_H #define _ASM_X86_PAGE_H #include <linux/types.h> #ifdef __KERNEL__ #include <asm/page_types.h> #ifdef CONFIG_X86_64 #include <asm/page_64.h> #else #include <asm/page_32.h> #endif /* CONFIG_X86_64 */ #ifndef __ASSEMBLY__ struct page; #include <linux/range.h> extern struct range pfn_mapped[]; extern int nr_pfn_mapped; static inline void clear_user_page(void *page, unsigned long vaddr, struct page *pg) { clear_page(page); } static inline void copy_user_page(void *to, void *from, unsigned long vaddr, struct page *topage) { copy_page(to, from); } #define vma_alloc_zeroed_movable_folio(vma, vaddr) \ vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr) #ifndef __pa #define __pa(x) __phys_addr((unsigned long)(x)) #endif #define __pa_nodebug(x) __phys_addr_nodebug((unsigned long)(x)) /* __pa_symbol should be used for C visible symbols. This seems to be the official gcc blessed way to do such arithmetic. */ /* * We need __phys_reloc_hide() here because gcc may assume that there is no * overflow during __pa() calculation and can optimize it unexpectedly. * Newer versions of gcc provide -fno-strict-overflow switch to handle this * case properly. Once all supported versions of gcc understand it, we can * remove this Voodoo magic stuff. (i.e. once gcc3.x is deprecated) */ #define __pa_symbol(x) \ __phys_addr_symbol(__phys_reloc_hide((unsigned long)(x))) #ifndef __va #define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET)) #endif #define __boot_va(x) __va(x) #define __boot_pa(x) __pa(x) /* * virt_to_page(kaddr) returns a valid pointer if and only if * virt_addr_valid(kaddr) returns true. */ #define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) extern bool __virt_addr_valid(unsigned long kaddr); #define virt_addr_valid(kaddr) __virt_addr_valid((unsigned long) (kaddr)) static __always_inline void *pfn_to_kaddr(unsigned long pfn) { return __va(pfn << PAGE_SHIFT); } static __always_inline u64 __canonical_address(u64 vaddr, u8 vaddr_bits) { return ((s64)vaddr << (64 - vaddr_bits)) >> (64 - vaddr_bits); } static __always_inline u64 __is_canonical_address(u64 vaddr, u8 vaddr_bits) { return __canonical_address(vaddr, vaddr_bits) == vaddr; } #endif /* __ASSEMBLY__ */ #include <asm-generic/memory_model.h> #include <asm-generic/getorder.h> #define HAVE_ARCH_HUGETLB_UNMAPPED_AREA #endif /* __KERNEL__ */ #endif /* _ASM_X86_PAGE_H */
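The __canonical_address() helper above works by sign-extending bit (vaddr_bits - 1) into all of the upper bits. The following standalone userspace demonstration of that trick assumes 48 significant virtual-address bits (4-level paging); it is illustrative only and not part of the header.

/*
 * Userspace demonstration (assumption: 48 significant VA bits, i.e. 4-level
 * paging) of the sign-extension trick used by __canonical_address() and
 * __is_canonical_address() above.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t canonical_address(uint64_t vaddr, unsigned vaddr_bits)
{
	/* Move bit (vaddr_bits - 1) up to bit 63, then arithmetic-shift it
	 * back down so it is replicated into all of the upper bits. */
	return (uint64_t)((int64_t)(vaddr << (64 - vaddr_bits)) >> (64 - vaddr_bits));
}

static int is_canonical_address(uint64_t vaddr, unsigned vaddr_bits)
{
	return canonical_address(vaddr, vaddr_bits) == vaddr;
}

int main(void)
{
	/* Kernel half: bit 47 set, bits 63..48 must all be 1. */
	printf("%d\n", is_canonical_address(0xffff800000000000ULL, 48)); /* 1 */
	/* User half: bit 47 clear, bits 63..48 must all be 0. */
	printf("%d\n", is_canonical_address(0x00007fffffffffffULL, 48)); /* 1 */
	/* Address in the non-canonical hole between the two halves. */
	printf("%d\n", is_canonical_address(0x0000800000000000ULL, 48)); /* 0 */
	return 0;
}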
/* * llc_c_ev.c - Connection component state transition event qualifiers * * A 'state' consists of a number of possible event matching functions, * the actions associated with each being executed when that event is * matched; a 'state machine' accepts events in a serial fashion from an * event queue. Each event is passed to each successive event matching * function until a match is made (the event matching function returns * success, or '0') or the list of event matching functions is exhausted. * If a match is made, the actions associated with the event are executed * and the state is changed to that event's transition state.
Before some * events are recognized, even after a match has been made, a certain * number of 'event qualifier' functions must also be executed. If these * all execute successfully, then the event is finally executed. * * These event functions must return 0 for success, to show a matched * event, or 1 if the event does not match. Event qualifier functions * must return 0 for success or a non-zero value for failure. Each function * is responsible for verifying one single thing and returning * either success or failure. * * All of the following event functions are described in the IEEE 802.2 LLC * protocol standard, except for two functions that we added; those are * explained in their comments below. * * Copyright (c) 1997 by Procom Technology, Inc. * 2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br> * * This program can be redistributed or modified under the terms of the * GNU General Public License as published by the Free Software Foundation. * This program is distributed without any warranty or implied warranty * of merchantability or fitness for a particular purpose. * * See the GNU General Public License for more details. */ #include <linux/netdevice.h> #include <net/llc_conn.h> #include <net/llc_sap.h> #include <net/sock.h> #include <net/llc_c_ac.h> #include <net/llc_c_ev.h> #include <net/llc_pdu.h> #if 1 #define dprintk(args...) printk(KERN_DEBUG args) #else #define dprintk(args...) #endif /** * llc_util_ns_inside_rx_window - check if sequence number is in rx window * @ns: sequence number of received pdu. * @vr: sequence number which receiver expects to receive. * @rw: receive window size of receiver. * * Checks if the sequence number of a received PDU is within the receive * window. Returns 0 for success, 1 otherwise. */ static u16 llc_util_ns_inside_rx_window(u8 ns, u8 vr, u8 rw) { return !llc_circular_between(vr, ns, (vr + rw - 1) % LLC_2_SEQ_NBR_MODULO); } /** * llc_util_nr_inside_tx_window - check if sequence number is in tx window * @sk: current connection. * @nr: N(R) of received PDU. * * This routine checks if the N(R) of a received PDU is within the transmit * window; in other words, it checks whether the received PDU acknowledges * outstanding PDUs that are in the transmit window. Returns 0 for success, * 1 otherwise. */ static u16 llc_util_nr_inside_tx_window(struct sock *sk, u8 nr) { u8 nr1, nr2; struct sk_buff *skb; struct llc_pdu_sn *pdu; struct llc_sock *llc = llc_sk(sk); int rc = 0; if (llc->dev->flags & IFF_LOOPBACK) goto out; rc = 1; if (skb_queue_empty(&llc->pdu_unack_q)) goto out; skb = skb_peek(&llc->pdu_unack_q); pdu = llc_pdu_sn_hdr(skb); nr1 = LLC_I_GET_NS(pdu); skb = skb_peek_tail(&llc->pdu_unack_q); pdu = llc_pdu_sn_hdr(skb); nr2 = LLC_I_GET_NS(pdu); rc = !llc_circular_between(nr1, nr, (nr2 + 1) % LLC_2_SEQ_NBR_MODULO); out: return rc; } int llc_conn_ev_conn_req(struct sock *sk, struct sk_buff *skb) { const struct llc_conn_state_ev *ev = llc_conn_ev(skb); return ev->prim == LLC_CONN_PRIM && ev->prim_type == LLC_PRIM_TYPE_REQ ? 0 : 1; } int llc_conn_ev_data_req(struct sock *sk, struct sk_buff *skb) { const struct llc_conn_state_ev *ev = llc_conn_ev(skb); return ev->prim == LLC_DATA_PRIM && ev->prim_type == LLC_PRIM_TYPE_REQ ? 0 : 1; } int llc_conn_ev_disc_req(struct sock *sk, struct sk_buff *skb) { const struct llc_conn_state_ev *ev = llc_conn_ev(skb); return ev->prim == LLC_DISC_PRIM && ev->prim_type == LLC_PRIM_TYPE_REQ ?
0 : 1; } int llc_conn_ev_rst_req(struct sock *sk, struct sk_buff *skb) { const struct llc_conn_state_ev *ev = llc_conn_ev(skb); return ev->prim == LLC_RESET_PRIM && ev->prim_type == LLC_PRIM_TYPE_REQ ? 0 : 1; } int llc_conn_ev_local_busy_detected(struct sock *sk, struct sk_buff *skb) { const struct llc_conn_state_ev *ev = llc_conn_ev(skb); return ev->type == LLC_CONN_EV_TYPE_SIMPLE && ev->prim_type == LLC_CONN_EV_LOCAL_BUSY_DETECTED ? 0 : 1; } int llc_conn_ev_local_busy_cleared(struct sock *sk, struct sk_buff *skb) { const struct llc_conn_state_ev *ev = llc_conn_ev(skb); return ev->type == LLC_CONN_EV_TYPE_SIMPLE && ev->prim_type == LLC_CONN_EV_LOCAL_BUSY_CLEARED ? 0 : 1; } int llc_conn_ev_rx_bad_pdu(struct sock *sk, struct sk_buff *skb) { return 1; } int llc_conn_ev_rx_disc_cmd_pbit_set_x(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); return LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_U(pdu) && LLC_U_PDU_CMD(pdu) == LLC_2_PDU_CMD_DISC ? 0 : 1; } int llc_conn_ev_rx_dm_rsp_fbit_set_x(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_U(pdu) && LLC_U_PDU_RSP(pdu) == LLC_2_PDU_RSP_DM ? 0 : 1; } int llc_conn_ev_rx_frmr_rsp_fbit_set_x(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_U(pdu) && LLC_U_PDU_RSP(pdu) == LLC_2_PDU_RSP_FRMR ? 0 : 1; } int llc_conn_ev_rx_i_cmd_pbit_set_0(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); return llc_conn_space(sk, skb) && LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_I(pdu) && LLC_I_PF_IS_0(pdu) && LLC_I_GET_NS(pdu) == llc_sk(sk)->vR ? 0 : 1; } int llc_conn_ev_rx_i_cmd_pbit_set_1(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); return llc_conn_space(sk, skb) && LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_I(pdu) && LLC_I_PF_IS_1(pdu) && LLC_I_GET_NS(pdu) == llc_sk(sk)->vR ? 0 : 1; } int llc_conn_ev_rx_i_cmd_pbit_set_0_unexpd_ns(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); const u8 vr = llc_sk(sk)->vR; const u8 ns = LLC_I_GET_NS(pdu); return LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_I(pdu) && LLC_I_PF_IS_0(pdu) && ns != vr && !llc_util_ns_inside_rx_window(ns, vr, llc_sk(sk)->rw) ? 0 : 1; } int llc_conn_ev_rx_i_cmd_pbit_set_1_unexpd_ns(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); const u8 vr = llc_sk(sk)->vR; const u8 ns = LLC_I_GET_NS(pdu); return LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_I(pdu) && LLC_I_PF_IS_1(pdu) && ns != vr && !llc_util_ns_inside_rx_window(ns, vr, llc_sk(sk)->rw) ? 0 : 1; } int llc_conn_ev_rx_i_cmd_pbit_set_x_inval_ns(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn * pdu = llc_pdu_sn_hdr(skb); const u8 vr = llc_sk(sk)->vR; const u8 ns = LLC_I_GET_NS(pdu); const u16 rc = LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_I(pdu) && ns != vr && llc_util_ns_inside_rx_window(ns, vr, llc_sk(sk)->rw) ? 0 : 1; if (!rc) dprintk("%s: matched, state=%d, ns=%d, vr=%d\n", __func__, llc_sk(sk)->state, ns, vr); return rc; } int llc_conn_ev_rx_i_rsp_fbit_set_0(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); return llc_conn_space(sk, skb) && LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_I(pdu) && LLC_I_PF_IS_0(pdu) && LLC_I_GET_NS(pdu) == llc_sk(sk)->vR ? 
0 : 1; } int llc_conn_ev_rx_i_rsp_fbit_set_1(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_I(pdu) && LLC_I_PF_IS_1(pdu) && LLC_I_GET_NS(pdu) == llc_sk(sk)->vR ? 0 : 1; } int llc_conn_ev_rx_i_rsp_fbit_set_x(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); return llc_conn_space(sk, skb) && LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_I(pdu) && LLC_I_GET_NS(pdu) == llc_sk(sk)->vR ? 0 : 1; } int llc_conn_ev_rx_i_rsp_fbit_set_0_unexpd_ns(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); const u8 vr = llc_sk(sk)->vR; const u8 ns = LLC_I_GET_NS(pdu); return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_I(pdu) && LLC_I_PF_IS_0(pdu) && ns != vr && !llc_util_ns_inside_rx_window(ns, vr, llc_sk(sk)->rw) ? 0 : 1; } int llc_conn_ev_rx_i_rsp_fbit_set_1_unexpd_ns(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); const u8 vr = llc_sk(sk)->vR; const u8 ns = LLC_I_GET_NS(pdu); return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_I(pdu) && LLC_I_PF_IS_1(pdu) && ns != vr && !llc_util_ns_inside_rx_window(ns, vr, llc_sk(sk)->rw) ? 0 : 1; } int llc_conn_ev_rx_i_rsp_fbit_set_x_unexpd_ns(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); const u8 vr = llc_sk(sk)->vR; const u8 ns = LLC_I_GET_NS(pdu); return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_I(pdu) && ns != vr && !llc_util_ns_inside_rx_window(ns, vr, llc_sk(sk)->rw) ? 0 : 1; } int llc_conn_ev_rx_i_rsp_fbit_set_x_inval_ns(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); const u8 vr = llc_sk(sk)->vR; const u8 ns = LLC_I_GET_NS(pdu); const u16 rc = LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_I(pdu) && ns != vr && llc_util_ns_inside_rx_window(ns, vr, llc_sk(sk)->rw) ? 0 : 1; if (!rc) dprintk("%s: matched, state=%d, ns=%d, vr=%d\n", __func__, llc_sk(sk)->state, ns, vr); return rc; } int llc_conn_ev_rx_rej_cmd_pbit_set_0(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); return LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_S(pdu) && LLC_S_PF_IS_0(pdu) && LLC_S_PDU_CMD(pdu) == LLC_2_PDU_CMD_REJ ? 0 : 1; } int llc_conn_ev_rx_rej_cmd_pbit_set_1(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); return LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_S(pdu) && LLC_S_PF_IS_1(pdu) && LLC_S_PDU_CMD(pdu) == LLC_2_PDU_CMD_REJ ? 0 : 1; } int llc_conn_ev_rx_rej_rsp_fbit_set_0(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_S(pdu) && LLC_S_PF_IS_0(pdu) && LLC_S_PDU_RSP(pdu) == LLC_2_PDU_RSP_REJ ? 0 : 1; } int llc_conn_ev_rx_rej_rsp_fbit_set_1(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_S(pdu) && LLC_S_PF_IS_1(pdu) && LLC_S_PDU_RSP(pdu) == LLC_2_PDU_RSP_REJ ? 0 : 1; } int llc_conn_ev_rx_rej_rsp_fbit_set_x(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_S(pdu) && LLC_S_PDU_RSP(pdu) == LLC_2_PDU_RSP_REJ ? 0 : 1; } int llc_conn_ev_rx_rnr_cmd_pbit_set_0(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); return LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_S(pdu) && LLC_S_PF_IS_0(pdu) && LLC_S_PDU_CMD(pdu) == LLC_2_PDU_CMD_RNR ? 
0 : 1; } int llc_conn_ev_rx_rnr_cmd_pbit_set_1(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); return LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_S(pdu) && LLC_S_PF_IS_1(pdu) && LLC_S_PDU_CMD(pdu) == LLC_2_PDU_CMD_RNR ? 0 : 1; } int llc_conn_ev_rx_rnr_rsp_fbit_set_0(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_S(pdu) && LLC_S_PF_IS_0(pdu) && LLC_S_PDU_RSP(pdu) == LLC_2_PDU_RSP_RNR ? 0 : 1; } int llc_conn_ev_rx_rnr_rsp_fbit_set_1(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_S(pdu) && LLC_S_PF_IS_1(pdu) && LLC_S_PDU_RSP(pdu) == LLC_2_PDU_RSP_RNR ? 0 : 1; } int llc_conn_ev_rx_rr_cmd_pbit_set_0(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); return LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_S(pdu) && LLC_S_PF_IS_0(pdu) && LLC_S_PDU_CMD(pdu) == LLC_2_PDU_CMD_RR ? 0 : 1; } int llc_conn_ev_rx_rr_cmd_pbit_set_1(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); return LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_S(pdu) && LLC_S_PF_IS_1(pdu) && LLC_S_PDU_CMD(pdu) == LLC_2_PDU_CMD_RR ? 0 : 1; } int llc_conn_ev_rx_rr_rsp_fbit_set_0(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); return llc_conn_space(sk, skb) && LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_S(pdu) && LLC_S_PF_IS_0(pdu) && LLC_S_PDU_RSP(pdu) == LLC_2_PDU_RSP_RR ? 0 : 1; } int llc_conn_ev_rx_rr_rsp_fbit_set_1(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); return llc_conn_space(sk, skb) && LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_S(pdu) && LLC_S_PF_IS_1(pdu) && LLC_S_PDU_RSP(pdu) == LLC_2_PDU_RSP_RR ? 0 : 1; } int llc_conn_ev_rx_sabme_cmd_pbit_set_x(struct sock *sk, struct sk_buff *skb) { const struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); return LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_U(pdu) && LLC_U_PDU_CMD(pdu) == LLC_2_PDU_CMD_SABME ? 0 : 1; } int llc_conn_ev_rx_ua_rsp_fbit_set_x(struct sock *sk, struct sk_buff *skb) { struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_U(pdu) && LLC_U_PDU_RSP(pdu) == LLC_2_PDU_RSP_UA ? 
0 : 1; } int llc_conn_ev_rx_xxx_cmd_pbit_set_1(struct sock *sk, struct sk_buff *skb) { u16 rc = 1; const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); if (LLC_PDU_IS_CMD(pdu)) { if (LLC_PDU_TYPE_IS_I(pdu) || LLC_PDU_TYPE_IS_S(pdu)) { if (LLC_I_PF_IS_1(pdu)) rc = 0; } else if (LLC_PDU_TYPE_IS_U(pdu) && LLC_U_PF_IS_1(pdu)) rc = 0; } return rc; } int llc_conn_ev_rx_xxx_cmd_pbit_set_x(struct sock *sk, struct sk_buff *skb) { u16 rc = 1; const struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); if (LLC_PDU_IS_CMD(pdu)) { if (LLC_PDU_TYPE_IS_I(pdu) || LLC_PDU_TYPE_IS_S(pdu)) rc = 0; else if (LLC_PDU_TYPE_IS_U(pdu)) switch (LLC_U_PDU_CMD(pdu)) { case LLC_2_PDU_CMD_SABME: case LLC_2_PDU_CMD_DISC: rc = 0; break; } } return rc; } int llc_conn_ev_rx_xxx_rsp_fbit_set_x(struct sock *sk, struct sk_buff *skb) { u16 rc = 1; const struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); if (LLC_PDU_IS_RSP(pdu)) { if (LLC_PDU_TYPE_IS_I(pdu) || LLC_PDU_TYPE_IS_S(pdu)) rc = 0; else if (LLC_PDU_TYPE_IS_U(pdu)) switch (LLC_U_PDU_RSP(pdu)) { case LLC_2_PDU_RSP_UA: case LLC_2_PDU_RSP_DM: case LLC_2_PDU_RSP_FRMR: rc = 0; break; } } return rc; } int llc_conn_ev_rx_zzz_cmd_pbit_set_x_inval_nr(struct sock *sk, struct sk_buff *skb) { u16 rc = 1; const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); const u8 vs = llc_sk(sk)->vS; const u8 nr = LLC_I_GET_NR(pdu); if (LLC_PDU_IS_CMD(pdu) && (LLC_PDU_TYPE_IS_I(pdu) || LLC_PDU_TYPE_IS_S(pdu)) && nr != vs && llc_util_nr_inside_tx_window(sk, nr)) { dprintk("%s: matched, state=%d, vs=%d, nr=%d\n", __func__, llc_sk(sk)->state, vs, nr); rc = 0; } return rc; } int llc_conn_ev_rx_zzz_rsp_fbit_set_x_inval_nr(struct sock *sk, struct sk_buff *skb) { u16 rc = 1; const struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); const u8 vs = llc_sk(sk)->vS; const u8 nr = LLC_I_GET_NR(pdu); if (LLC_PDU_IS_RSP(pdu) && (LLC_PDU_TYPE_IS_I(pdu) || LLC_PDU_TYPE_IS_S(pdu)) && nr != vs && llc_util_nr_inside_tx_window(sk, nr)) { rc = 0; dprintk("%s: matched, state=%d, vs=%d, nr=%d\n", __func__, llc_sk(sk)->state, vs, nr); } return rc; } int llc_conn_ev_rx_any_frame(struct sock *sk, struct sk_buff *skb) { return 0; } int llc_conn_ev_p_tmr_exp(struct sock *sk, struct sk_buff *skb) { const struct llc_conn_state_ev *ev = llc_conn_ev(skb); return ev->type != LLC_CONN_EV_TYPE_P_TMR; } int llc_conn_ev_ack_tmr_exp(struct sock *sk, struct sk_buff *skb) { const struct llc_conn_state_ev *ev = llc_conn_ev(skb); return ev->type != LLC_CONN_EV_TYPE_ACK_TMR; } int llc_conn_ev_rej_tmr_exp(struct sock *sk, struct sk_buff *skb) { const struct llc_conn_state_ev *ev = llc_conn_ev(skb); return ev->type != LLC_CONN_EV_TYPE_REJ_TMR; } int llc_conn_ev_busy_tmr_exp(struct sock *sk, struct sk_buff *skb) { const struct llc_conn_state_ev *ev = llc_conn_ev(skb); return ev->type != LLC_CONN_EV_TYPE_BUSY_TMR; } int llc_conn_ev_init_p_f_cycle(struct sock *sk, struct sk_buff *skb) { return 1; } int llc_conn_ev_tx_buffer_full(struct sock *sk, struct sk_buff *skb) { const struct llc_conn_state_ev *ev = llc_conn_ev(skb); return ev->type == LLC_CONN_EV_TYPE_SIMPLE && ev->prim_type == LLC_CONN_EV_TX_BUFF_FULL ? 
0 : 1; } /* Event qualifier functions * * These functions simply verify the value of a state flag associated with * the connection and return either 0 for success or a non-zero value for * failure; they verify that the event is of the type we expect. */ int llc_conn_ev_qlfy_data_flag_eq_1(struct sock *sk, struct sk_buff *skb) { return llc_sk(sk)->data_flag != 1; } int llc_conn_ev_qlfy_data_flag_eq_0(struct sock *sk, struct sk_buff *skb) { return llc_sk(sk)->data_flag; } int llc_conn_ev_qlfy_data_flag_eq_2(struct sock *sk, struct sk_buff *skb) { return llc_sk(sk)->data_flag != 2; } int llc_conn_ev_qlfy_p_flag_eq_1(struct sock *sk, struct sk_buff *skb) { return llc_sk(sk)->p_flag != 1; } /** * llc_conn_ev_qlfy_last_frame_eq_1 - checks if frame is last in tx window * @sk: current connection structure. * @skb: current event. * * This function determines whether the frame being sent is the last frame * of the transmit window; if it is, this function returns zero, otherwise * it returns one. It is used for sending the last frame of the transmit * window as an I-format command with the p-bit set to one. Returns 0 if * the frame is the last frame, 1 otherwise. */ int llc_conn_ev_qlfy_last_frame_eq_1(struct sock *sk, struct sk_buff *skb) { return !(skb_queue_len(&llc_sk(sk)->pdu_unack_q) + 1 == llc_sk(sk)->k); } /** * llc_conn_ev_qlfy_last_frame_eq_0 - checks if frame isn't last in tx window * @sk: current connection structure. * @skb: current event. * * This function determines whether the frame being sent is not the last * frame of the transmit window; if it is not the last frame, this function * returns zero, otherwise it returns one. Returns 0 if the frame isn't the * last frame, 1 otherwise. */ int llc_conn_ev_qlfy_last_frame_eq_0(struct sock *sk, struct sk_buff *skb) { return skb_queue_len(&llc_sk(sk)->pdu_unack_q) + 1 == llc_sk(sk)->k; } int llc_conn_ev_qlfy_p_flag_eq_0(struct sock *sk, struct sk_buff *skb) { return llc_sk(sk)->p_flag; } int llc_conn_ev_qlfy_p_flag_eq_f(struct sock *sk, struct sk_buff *skb) { u8 f_bit; llc_pdu_decode_pf_bit(skb, &f_bit); return llc_sk(sk)->p_flag == f_bit ?
0 : 1; } int llc_conn_ev_qlfy_remote_busy_eq_0(struct sock *sk, struct sk_buff *skb) { return llc_sk(sk)->remote_busy_flag; } int llc_conn_ev_qlfy_remote_busy_eq_1(struct sock *sk, struct sk_buff *skb) { return !llc_sk(sk)->remote_busy_flag; } int llc_conn_ev_qlfy_retry_cnt_lt_n2(struct sock *sk, struct sk_buff *skb) { return !(llc_sk(sk)->retry_count < llc_sk(sk)->n2); } int llc_conn_ev_qlfy_retry_cnt_gte_n2(struct sock *sk, struct sk_buff *skb) { return !(llc_sk(sk)->retry_count >= llc_sk(sk)->n2); } int llc_conn_ev_qlfy_s_flag_eq_1(struct sock *sk, struct sk_buff *skb) { return !llc_sk(sk)->s_flag; } int llc_conn_ev_qlfy_s_flag_eq_0(struct sock *sk, struct sk_buff *skb) { return llc_sk(sk)->s_flag; } int llc_conn_ev_qlfy_cause_flag_eq_1(struct sock *sk, struct sk_buff *skb) { return !llc_sk(sk)->cause_flag; } int llc_conn_ev_qlfy_cause_flag_eq_0(struct sock *sk, struct sk_buff *skb) { return llc_sk(sk)->cause_flag; } int llc_conn_ev_qlfy_set_status_conn(struct sock *sk, struct sk_buff *skb) { struct llc_conn_state_ev *ev = llc_conn_ev(skb); ev->status = LLC_STATUS_CONN; return 0; } int llc_conn_ev_qlfy_set_status_disc(struct sock *sk, struct sk_buff *skb) { struct llc_conn_state_ev *ev = llc_conn_ev(skb); ev->status = LLC_STATUS_DISC; return 0; } int llc_conn_ev_qlfy_set_status_failed(struct sock *sk, struct sk_buff *skb) { struct llc_conn_state_ev *ev = llc_conn_ev(skb); ev->status = LLC_STATUS_FAILED; return 0; } int llc_conn_ev_qlfy_set_status_remote_busy(struct sock *sk, struct sk_buff *skb) { struct llc_conn_state_ev *ev = llc_conn_ev(skb); ev->status = LLC_STATUS_REMOTE_BUSY; return 0; } int llc_conn_ev_qlfy_set_status_refuse(struct sock *sk, struct sk_buff *skb) { struct llc_conn_state_ev *ev = llc_conn_ev(skb); ev->status = LLC_STATUS_REFUSE; return 0; } int llc_conn_ev_qlfy_set_status_conflict(struct sock *sk, struct sk_buff *skb) { struct llc_conn_state_ev *ev = llc_conn_ev(skb); ev->status = LLC_STATUS_CONFLICT; return 0; } int llc_conn_ev_qlfy_set_status_rst_done(struct sock *sk, struct sk_buff *skb) { struct llc_conn_state_ev *ev = llc_conn_ev(skb); ev->status = LLC_STATUS_RESET_DONE; return 0; }
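The *_unexpd_ns and *_inval_ns matchers above classify a received I-PDU's N(S) against the receiver state variable V(R) and the receive window: N(S) equal to V(R) is the expected in-sequence frame, N(S) inside the window but not equal to V(R) is out of order, and anything else is invalid. The following standalone userspace sketch models that classification with modulo-128 sequence numbers (LLC_2_SEQ_NBR_MODULO); the window test is a re-implementation for illustration, not the kernel's llc_circular_between() itself.

/*
 * Userspace model (illustration only) of how the matchers above classify a
 * received I-PDU's N(S) against V(R) and the receive window rw, with
 * sequence numbers taken modulo 128.
 */
#include <stdio.h>

#define SEQ_MODULO 128u

/* 1 if ns lies circularly within [vr, vr + rw - 1] (mod 128). */
static int ns_inside_rx_window(unsigned ns, unsigned vr, unsigned rw)
{
	return ((ns - vr) & (SEQ_MODULO - 1)) < rw;
}

static const char *classify_ns(unsigned ns, unsigned vr, unsigned rw)
{
	if (ns == vr)
		return "expected";			/* plain rx_i_* matchers */
	if (ns_inside_rx_window(ns, vr, rw))
		return "unexpected (out of order)";	/* *_unexpd_ns matchers */
	return "invalid";				/* *_inval_ns matchers */
}

int main(void)
{
	unsigned vr = 125, rw = 8;	/* window wraps: 125..127, 0..4 */

	printf("ns=125 -> %s\n", classify_ns(125, vr, rw));
	printf("ns=2   -> %s\n", classify_ns(2, vr, rw));
	printf("ns=60  -> %s\n", classify_ns(60, vr, rw));
	return 0;
}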
// SPDX-License-Identifier: GPL-2.0 /* * Tag allocation using scalable bitmaps. Uses active queue tracking to support * fairer distribution of tags between multiple submitters when a shared tag map * is used. * * Copyright (C) 2013-2014 Jens Axboe */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/delay.h> #include "blk.h" #include "blk-mq.h" #include "blk-mq-sched.h" /* * Recalculate wakeup batch when tag is shared by hctx. */ static void blk_mq_update_wake_batch(struct blk_mq_tags *tags, unsigned int users) { if (!users) return; sbitmap_queue_recalculate_wake_batch(&tags->bitmap_tags, users); sbitmap_queue_recalculate_wake_batch(&tags->breserved_tags, users); } /* * If a previously inactive queue goes active, bump the active user count.
* We need to do this before try to allocate driver tag, then even if fail * to get tag when first time, the other shared-tag users could reserve * budget for it. */ void __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) { unsigned int users; unsigned long flags; struct blk_mq_tags *tags = hctx->tags; /* * calling test_bit() prior to test_and_set_bit() is intentional, * it avoids dirtying the cacheline if the queue is already active. */ if (blk_mq_is_shared_tags(hctx->flags)) { struct request_queue *q = hctx->queue; if (test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) || test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags)) return; } else { if (test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) || test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) return; } spin_lock_irqsave(&tags->lock, flags); users = tags->active_queues + 1; WRITE_ONCE(tags->active_queues, users); blk_mq_update_wake_batch(tags, users); spin_unlock_irqrestore(&tags->lock, flags); } /* * Wakeup all potentially sleeping on tags */ void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve) { sbitmap_queue_wake_all(&tags->bitmap_tags); if (include_reserve) sbitmap_queue_wake_all(&tags->breserved_tags); } /* * If a previously busy queue goes inactive, potential waiters could now * be allowed to queue. Wake them up and check. */ void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) { struct blk_mq_tags *tags = hctx->tags; unsigned int users; if (blk_mq_is_shared_tags(hctx->flags)) { struct request_queue *q = hctx->queue; if (!test_and_clear_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags)) return; } else { if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) return; } spin_lock_irq(&tags->lock); users = tags->active_queues - 1; WRITE_ONCE(tags->active_queues, users); blk_mq_update_wake_batch(tags, users); spin_unlock_irq(&tags->lock); blk_mq_tag_wakeup_all(tags, false); } static int __blk_mq_get_tag(struct blk_mq_alloc_data *data, struct sbitmap_queue *bt) { if (!data->q->elevator && !(data->flags & BLK_MQ_REQ_RESERVED) && !hctx_may_queue(data->hctx, bt)) return BLK_MQ_NO_TAG; if (data->shallow_depth) return sbitmap_queue_get_shallow(bt, data->shallow_depth); else return __sbitmap_queue_get(bt); } unsigned long blk_mq_get_tags(struct blk_mq_alloc_data *data, int nr_tags, unsigned int *offset) { struct blk_mq_tags *tags = blk_mq_tags_from_data(data); struct sbitmap_queue *bt = &tags->bitmap_tags; unsigned long ret; if (data->shallow_depth ||data->flags & BLK_MQ_REQ_RESERVED || data->hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) return 0; ret = __sbitmap_queue_get_batch(bt, nr_tags, offset); *offset += tags->nr_reserved_tags; return ret; } unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) { struct blk_mq_tags *tags = blk_mq_tags_from_data(data); struct sbitmap_queue *bt; struct sbq_wait_state *ws; DEFINE_SBQ_WAIT(wait); unsigned int tag_offset; int tag; if (data->flags & BLK_MQ_REQ_RESERVED) { if (unlikely(!tags->nr_reserved_tags)) { WARN_ON_ONCE(1); return BLK_MQ_NO_TAG; } bt = &tags->breserved_tags; tag_offset = 0; } else { bt = &tags->bitmap_tags; tag_offset = tags->nr_reserved_tags; } tag = __blk_mq_get_tag(data, bt); if (tag != BLK_MQ_NO_TAG) goto found_tag; if (data->flags & BLK_MQ_REQ_NOWAIT) return BLK_MQ_NO_TAG; ws = bt_wait_ptr(bt, data->hctx); do { struct sbitmap_queue *bt_prev; /* * We're out of tags on this hardware queue, kick any * pending IO submits before going to sleep waiting for * some to complete. 
*/ blk_mq_run_hw_queue(data->hctx, false); /* * Retry tag allocation after running the hardware queue, * as running the queue may also have found completions. */ tag = __blk_mq_get_tag(data, bt); if (tag != BLK_MQ_NO_TAG) break; sbitmap_prepare_to_wait(bt, ws, &wait, TASK_UNINTERRUPTIBLE); tag = __blk_mq_get_tag(data, bt); if (tag != BLK_MQ_NO_TAG) break; bt_prev = bt; io_schedule(); sbitmap_finish_wait(bt, ws, &wait); data->ctx = blk_mq_get_ctx(data->q); data->hctx = blk_mq_map_queue(data->q, data->cmd_flags, data->ctx); tags = blk_mq_tags_from_data(data); if (data->flags & BLK_MQ_REQ_RESERVED) bt = &tags->breserved_tags; else bt = &tags->bitmap_tags; /* * If destination hw queue is changed, fake wake up on * previous queue for compensating the wake up miss, so * other allocations on previous queue won't be starved. */ if (bt != bt_prev) sbitmap_queue_wake_up(bt_prev, 1); ws = bt_wait_ptr(bt, data->hctx); } while (1); sbitmap_finish_wait(bt, ws, &wait); found_tag: /* * Give up this allocation if the hctx is inactive. The caller will * retry on an active hctx. */ if (unlikely(test_bit(BLK_MQ_S_INACTIVE, &data->hctx->state))) { blk_mq_put_tag(tags, data->ctx, tag + tag_offset); return BLK_MQ_NO_TAG; } return tag + tag_offset; } void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx, unsigned int tag) { if (!blk_mq_tag_is_reserved(tags, tag)) { const int real_tag = tag - tags->nr_reserved_tags; BUG_ON(real_tag >= tags->nr_tags); sbitmap_queue_clear(&tags->bitmap_tags, real_tag, ctx->cpu); } else { sbitmap_queue_clear(&tags->breserved_tags, tag, ctx->cpu); } } void blk_mq_put_tags(struct blk_mq_tags *tags, int *tag_array, int nr_tags) { sbitmap_queue_clear_batch(&tags->bitmap_tags, tags->nr_reserved_tags, tag_array, nr_tags); } struct bt_iter_data { struct blk_mq_hw_ctx *hctx; struct request_queue *q; busy_tag_iter_fn *fn; void *data; bool reserved; }; static struct request *blk_mq_find_and_get_req(struct blk_mq_tags *tags, unsigned int bitnr) { struct request *rq; unsigned long flags; spin_lock_irqsave(&tags->lock, flags); rq = tags->rqs[bitnr]; if (!rq || rq->tag != bitnr || !req_ref_inc_not_zero(rq)) rq = NULL; spin_unlock_irqrestore(&tags->lock, flags); return rq; } static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) { struct bt_iter_data *iter_data = data; struct blk_mq_hw_ctx *hctx = iter_data->hctx; struct request_queue *q = iter_data->q; struct blk_mq_tag_set *set = q->tag_set; struct blk_mq_tags *tags; struct request *rq; bool ret = true; if (blk_mq_is_shared_tags(set->flags)) tags = set->shared_tags; else tags = hctx->tags; if (!iter_data->reserved) bitnr += tags->nr_reserved_tags; /* * We can hit rq == NULL here, because the tagging functions * test and set the bit before assigning ->rqs[]. */ rq = blk_mq_find_and_get_req(tags, bitnr); if (!rq) return true; if (rq->q == q && (!hctx || rq->mq_hctx == hctx)) ret = iter_data->fn(rq, iter_data->data); blk_mq_put_rq_ref(rq); return ret; } /** * bt_for_each - iterate over the requests associated with a hardware queue * @hctx: Hardware queue to examine. * @q: Request queue to examine. * @bt: sbitmap to examine. This is either the breserved_tags member * or the bitmap_tags member of struct blk_mq_tags. * @fn: Pointer to the function that will be called for each request * associated with @hctx that has been assigned a driver tag. * @fn will be called as follows: @fn(@hctx, rq, @data, @reserved) * where rq is a pointer to a request. Return true to continue * iterating tags, false to stop. 
* @data: Will be passed as third argument to @fn. * @reserved: Indicates whether @bt is the breserved_tags member or the * bitmap_tags member of struct blk_mq_tags. */ static void bt_for_each(struct blk_mq_hw_ctx *hctx, struct request_queue *q, struct sbitmap_queue *bt, busy_tag_iter_fn *fn, void *data, bool reserved) { struct bt_iter_data iter_data = { .hctx = hctx, .fn = fn, .data = data, .reserved = reserved, .q = q, }; sbitmap_for_each_set(&bt->sb, bt_iter, &iter_data); } struct bt_tags_iter_data { struct blk_mq_tags *tags; busy_tag_iter_fn *fn; void *data; unsigned int flags; }; #define BT_TAG_ITER_RESERVED (1 << 0) #define BT_TAG_ITER_STARTED (1 << 1) #define BT_TAG_ITER_STATIC_RQS (1 << 2) static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) { struct bt_tags_iter_data *iter_data = data; struct blk_mq_tags *tags = iter_data->tags; struct request *rq; bool ret = true; bool iter_static_rqs = !!(iter_data->flags & BT_TAG_ITER_STATIC_RQS); if (!(iter_data->flags & BT_TAG_ITER_RESERVED)) bitnr += tags->nr_reserved_tags; /* * We can hit rq == NULL here, because the tagging functions * test and set the bit before assigning ->rqs[]. */ if (iter_static_rqs) rq = tags->static_rqs[bitnr]; else rq = blk_mq_find_and_get_req(tags, bitnr); if (!rq) return true; if (!(iter_data->flags & BT_TAG_ITER_STARTED) || blk_mq_request_started(rq)) ret = iter_data->fn(rq, iter_data->data); if (!iter_static_rqs) blk_mq_put_rq_ref(rq); return ret; } /** * bt_tags_for_each - iterate over the requests in a tag map * @tags: Tag map to iterate over. * @bt: sbitmap to examine. This is either the breserved_tags member * or the bitmap_tags member of struct blk_mq_tags. * @fn: Pointer to the function that will be called for each started * request. @fn will be called as follows: @fn(rq, @data, * @reserved) where rq is a pointer to a request. Return true * to continue iterating tags, false to stop. * @data: Will be passed as second argument to @fn. * @flags: BT_TAG_ITER_* */ static void bt_tags_for_each(struct blk_mq_tags *tags, struct sbitmap_queue *bt, busy_tag_iter_fn *fn, void *data, unsigned int flags) { struct bt_tags_iter_data iter_data = { .tags = tags, .fn = fn, .data = data, .flags = flags, }; if (tags->rqs) sbitmap_for_each_set(&bt->sb, bt_tags_iter, &iter_data); } static void __blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn, void *priv, unsigned int flags) { WARN_ON_ONCE(flags & BT_TAG_ITER_RESERVED); if (tags->nr_reserved_tags) bt_tags_for_each(tags, &tags->breserved_tags, fn, priv, flags | BT_TAG_ITER_RESERVED); bt_tags_for_each(tags, &tags->bitmap_tags, fn, priv, flags); } /** * blk_mq_all_tag_iter - iterate over all requests in a tag map * @tags: Tag map to iterate over. * @fn: Pointer to the function that will be called for each * request. @fn will be called as follows: @fn(rq, @priv, * reserved) where rq is a pointer to a request. 'reserved' * indicates whether or not @rq is a reserved request. Return * true to continue iterating tags, false to stop. * @priv: Will be passed as second argument to @fn. * * Caller has to pass the tag map from which requests are allocated. */ void blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn, void *priv) { __blk_mq_all_tag_iter(tags, fn, priv, BT_TAG_ITER_STATIC_RQS); } /** * blk_mq_tagset_busy_iter - iterate over all started requests in a tag set * @tagset: Tag set to iterate over. * @fn: Pointer to the function that will be called for each started * request. 
@fn will be called as follows: @fn(rq, @priv, * reserved) where rq is a pointer to a request. 'reserved' * indicates whether or not @rq is a reserved request. Return * true to continue iterating tags, false to stop. * @priv: Will be passed as second argument to @fn. * * We grab one request reference before calling @fn and release it after * @fn returns. */ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, busy_tag_iter_fn *fn, void *priv) { unsigned int flags = tagset->flags; int i, nr_tags; nr_tags = blk_mq_is_shared_tags(flags) ? 1 : tagset->nr_hw_queues; for (i = 0; i < nr_tags; i++) { if (tagset->tags && tagset->tags[i]) __blk_mq_all_tag_iter(tagset->tags[i], fn, priv, BT_TAG_ITER_STARTED); } } EXPORT_SYMBOL(blk_mq_tagset_busy_iter); static bool blk_mq_tagset_count_completed_rqs(struct request *rq, void *data) { unsigned *count = data; if (blk_mq_request_completed(rq)) (*count)++; return true; } /** * blk_mq_tagset_wait_completed_request - Wait until all scheduled request * completions have finished. * @tagset: Tag set to drain completed request * * Note: This function has to be run after all IO queues are shutdown */ void blk_mq_tagset_wait_completed_request(struct blk_mq_tag_set *tagset) { while (true) { unsigned count = 0; blk_mq_tagset_busy_iter(tagset, blk_mq_tagset_count_completed_rqs, &count); if (!count) break; msleep(5); } } EXPORT_SYMBOL(blk_mq_tagset_wait_completed_request); /** * blk_mq_queue_tag_busy_iter - iterate over all requests with a driver tag * @q: Request queue to examine. * @fn: Pointer to the function that will be called for each request * on @q. @fn will be called as follows: @fn(hctx, rq, @priv, * reserved) where rq is a pointer to a request and hctx points * to the hardware queue associated with the request. 'reserved' * indicates whether or not @rq is a reserved request. * @priv: Will be passed as third argument to @fn. * * Note: if @q->tag_set is shared with other request queues then @fn will be * called for all requests on all queues that share that tag set and not only * for requests associated with @q. */ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn, void *priv) { /* * __blk_mq_update_nr_hw_queues() updates nr_hw_queues and hctx_table * while the queue is frozen. So we can use q_usage_counter to avoid * racing with it. 
*/ if (!percpu_ref_tryget(&q->q_usage_counter)) return; if (blk_mq_is_shared_tags(q->tag_set->flags)) { struct blk_mq_tags *tags = q->tag_set->shared_tags; struct sbitmap_queue *bresv = &tags->breserved_tags; struct sbitmap_queue *btags = &tags->bitmap_tags; if (tags->nr_reserved_tags) bt_for_each(NULL, q, bresv, fn, priv, true); bt_for_each(NULL, q, btags, fn, priv, false); } else { struct blk_mq_hw_ctx *hctx; unsigned long i; queue_for_each_hw_ctx(q, hctx, i) { struct blk_mq_tags *tags = hctx->tags; struct sbitmap_queue *bresv = &tags->breserved_tags; struct sbitmap_queue *btags = &tags->bitmap_tags; /* * If no software queues are currently mapped to this * hardware queue, there's nothing to check */ if (!blk_mq_hw_queue_mapped(hctx)) continue; if (tags->nr_reserved_tags) bt_for_each(hctx, q, bresv, fn, priv, true); bt_for_each(hctx, q, btags, fn, priv, false); } } blk_queue_exit(q); } static int bt_alloc(struct sbitmap_queue *bt, unsigned int depth, bool round_robin, int node) { return sbitmap_queue_init_node(bt, depth, -1, round_robin, GFP_KERNEL, node); } int blk_mq_init_bitmaps(struct sbitmap_queue *bitmap_tags, struct sbitmap_queue *breserved_tags, unsigned int queue_depth, unsigned int reserved, int node, int alloc_policy) { unsigned int depth = queue_depth - reserved; bool round_robin = alloc_policy == BLK_TAG_ALLOC_RR; if (bt_alloc(bitmap_tags, depth, round_robin, node)) return -ENOMEM; if (bt_alloc(breserved_tags, reserved, round_robin, node)) goto free_bitmap_tags; return 0; free_bitmap_tags: sbitmap_queue_free(bitmap_tags); return -ENOMEM; } struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, unsigned int reserved_tags, int node, int alloc_policy) { struct blk_mq_tags *tags; if (total_tags > BLK_MQ_TAG_MAX) { pr_err("blk-mq: tag depth too large\n"); return NULL; } tags = kzalloc_node(sizeof(*tags), GFP_KERNEL, node); if (!tags) return NULL; tags->nr_tags = total_tags; tags->nr_reserved_tags = reserved_tags; spin_lock_init(&tags->lock); if (blk_mq_init_bitmaps(&tags->bitmap_tags, &tags->breserved_tags, total_tags, reserved_tags, node, alloc_policy) < 0) { kfree(tags); return NULL; } return tags; } void blk_mq_free_tags(struct blk_mq_tags *tags) { sbitmap_queue_free(&tags->bitmap_tags); sbitmap_queue_free(&tags->breserved_tags); kfree(tags); } int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, struct blk_mq_tags **tagsptr, unsigned int tdepth, bool can_grow) { struct blk_mq_tags *tags = *tagsptr; if (tdepth <= tags->nr_reserved_tags) return -EINVAL; /* * If we are allowed to grow beyond the original size, allocate * a new set of tags before freeing the old one. */ if (tdepth > tags->nr_tags) { struct blk_mq_tag_set *set = hctx->queue->tag_set; struct blk_mq_tags *new; if (!can_grow) return -EINVAL; /* * We need some sort of upper limit, set it high enough that * no valid use cases should require more. */ if (tdepth > MAX_SCHED_RQ) return -EINVAL; /* * Only the sbitmap needs resizing since we allocated the max * initially. */ if (blk_mq_is_shared_tags(set->flags)) return 0; new = blk_mq_alloc_map_and_rqs(set, hctx->queue_num, tdepth); if (!new) return -ENOMEM; blk_mq_free_map_and_rqs(set, *tagsptr, hctx->queue_num); *tagsptr = new; } else { /* * Don't need (or can't) update reserved tags here, they * remain static and should never need resizing. 
*/ sbitmap_queue_resize(&tags->bitmap_tags, tdepth - tags->nr_reserved_tags); } return 0; } void blk_mq_tag_resize_shared_tags(struct blk_mq_tag_set *set, unsigned int size) { struct blk_mq_tags *tags = set->shared_tags; sbitmap_queue_resize(&tags->bitmap_tags, size - set->reserved_tags); } void blk_mq_tag_update_sched_shared_tags(struct request_queue *q) { sbitmap_queue_resize(&q->sched_shared_tags->bitmap_tags, q->nr_requests - q->tag_set->reserved_tags); } /** * blk_mq_unique_tag() - return a tag that is unique queue-wide * @rq: request for which to compute a unique tag * * The tag field in struct request is unique per hardware queue but not over * all hardware queues. Hence this function that returns a tag with the * hardware context index in the upper bits and the per hardware queue tag in * the lower bits. * * Note: When called for a request that is queued on a non-multiqueue request * queue, the hardware context index is set to zero. */ u32 blk_mq_unique_tag(struct request *rq) { return (rq->mq_hctx->queue_num << BLK_MQ_UNIQUE_TAG_BITS) | (rq->tag & BLK_MQ_UNIQUE_TAG_MASK); } EXPORT_SYMBOL(blk_mq_unique_tag);
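blk_mq_unique_tag() above packs the hardware-queue index into the upper bits and the per-queue driver tag into the lower BLK_MQ_UNIQUE_TAG_BITS. Below is a brief sketch of how a consumer (for example a SCSI LLD) might split the value back apart with the helpers declared in <linux/blk-mq.h>; the demo_* function itself is hypothetical.

/*
 * Hypothetical consumer of blk_mq_unique_tag(): split the queue-wide tag
 * back into the hardware-queue index and the per-queue driver tag using
 * the helpers declared in <linux/blk-mq.h>.
 */
#include <linux/blk-mq.h>
#include <linux/printk.h>

static void __maybe_unused demo_log_unique_tag(struct request *rq)
{
	u32 unique = blk_mq_unique_tag(rq);
	u16 hwq = blk_mq_unique_tag_to_hwq(unique);	/* rq->mq_hctx->queue_num */
	u16 tag = blk_mq_unique_tag_to_tag(unique);	/* rq->tag */

	pr_info("request %p -> hwq %u, tag %u (unique 0x%08x)\n",
		rq, hwq, tag, unique);
}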
// SPDX-License-Identifier: GPL-2.0 /* * Tty port functions */ #include <linux/types.h> #include <linux/errno.h> #include <linux/tty.h> #include <linux/tty_driver.h> #include <linux/tty_flip.h> #include <linux/serial.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/slab.h> #include <linux/sched/signal.h> #include <linux/wait.h> #include <linux/bitops.h> #include <linux/delay.h> #include <linux/module.h> #include <linux/serdev.h>
#include "tty.h" static size_t tty_port_default_receive_buf(struct tty_port *port, const u8 *p, const u8 *f, size_t count) { struct tty_struct *tty; struct tty_ldisc *ld; tty = READ_ONCE(port->itty); if (!tty) return 0; ld = tty_ldisc_ref(tty); if (!ld) return 0; count = tty_ldisc_receive_buf(ld, p, f, count); tty_ldisc_deref(ld); return count; } static void tty_port_default_lookahead_buf(struct tty_port *port, const u8 *p, const u8 *f, size_t count) { struct tty_struct *tty; struct tty_ldisc *ld; tty = READ_ONCE(port->itty); if (!tty) return; ld = tty_ldisc_ref(tty); if (!ld) return; if (ld->ops->lookahead_buf) ld->ops->lookahead_buf(ld->tty, p, f, count); tty_ldisc_deref(ld); } static void tty_port_default_wakeup(struct tty_port *port) { struct tty_struct *tty = tty_port_tty_get(port); if (tty) { tty_wakeup(tty); tty_kref_put(tty); } } const struct tty_port_client_operations tty_port_default_client_ops = { .receive_buf = tty_port_default_receive_buf, .lookahead_buf = tty_port_default_lookahead_buf, .write_wakeup = tty_port_default_wakeup, }; EXPORT_SYMBOL_GPL(tty_port_default_client_ops); /** * tty_port_init - initialize tty_port * @port: tty_port to initialize * * Initializes the state of struct tty_port. When a port was initialized using * this function, one has to destroy the port by tty_port_destroy(). Either * indirectly by using &tty_port refcounting (tty_port_put()) or directly if * refcounting is not used. */ void tty_port_init(struct tty_port *port) { memset(port, 0, sizeof(*port)); tty_buffer_init(port); init_waitqueue_head(&port->open_wait); init_waitqueue_head(&port->delta_msr_wait); mutex_init(&port->mutex); mutex_init(&port->buf_mutex); spin_lock_init(&port->lock); port->close_delay = (50 * HZ) / 100; port->closing_wait = (3000 * HZ) / 100; port->client_ops = &tty_port_default_client_ops; kref_init(&port->kref); } EXPORT_SYMBOL(tty_port_init); /** * tty_port_link_device - link tty and tty_port * @port: tty_port of the device * @driver: tty_driver for this device * @index: index of the tty * * Provide the tty layer with a link from a tty (specified by @index) to a * tty_port (@port). Use this only if neither tty_port_register_device() nor * tty_port_install() is used in the driver. If used, this has to be called * before tty_register_driver(). */ void tty_port_link_device(struct tty_port *port, struct tty_driver *driver, unsigned index) { if (WARN_ON(index >= driver->num)) return; driver->ports[index] = port; } EXPORT_SYMBOL_GPL(tty_port_link_device); /** * tty_port_register_device - register tty device * @port: tty_port of the device * @driver: tty_driver for this device * @index: index of the tty * @device: parent if exists, otherwise NULL * * It is the same as tty_register_device() except the provided @port is linked * to a concrete tty specified by @index. Use this or tty_port_install() (or * both). Call tty_port_link_device() as a last resort. */ struct device *tty_port_register_device(struct tty_port *port, struct tty_driver *driver, unsigned index, struct device *device) { return tty_port_register_device_attr(port, driver, index, device, NULL, NULL); } EXPORT_SYMBOL_GPL(tty_port_register_device); /** * tty_port_register_device_attr - register tty device * @port: tty_port of the device * @driver: tty_driver for this device * @index: index of the tty * @device: parent if exists, otherwise NULL * @drvdata: Driver data to be set to device. * @attr_grp: Attribute group to be set on device. 
* * It is the same as tty_register_device_attr() except the provided @port is * linked to a concrete tty specified by @index. Use this or tty_port_install() * (or both). Call tty_port_link_device() as a last resort. */ struct device *tty_port_register_device_attr(struct tty_port *port, struct tty_driver *driver, unsigned index, struct device *device, void *drvdata, const struct attribute_group **attr_grp) { tty_port_link_device(port, driver, index); return tty_register_device_attr(driver, index, device, drvdata, attr_grp); } EXPORT_SYMBOL_GPL(tty_port_register_device_attr); /** * tty_port_register_device_attr_serdev - register tty or serdev device * @port: tty_port of the device * @driver: tty_driver for this device * @index: index of the tty * @host: serial port hardware device * @parent: parent if exists, otherwise NULL * @drvdata: driver data for the device * @attr_grp: attribute group for the device * * Register a serdev or tty device depending on if the parent device has any * defined serdev clients or not. */ struct device *tty_port_register_device_attr_serdev(struct tty_port *port, struct tty_driver *driver, unsigned index, struct device *host, struct device *parent, void *drvdata, const struct attribute_group **attr_grp) { struct device *dev; tty_port_link_device(port, driver, index); dev = serdev_tty_port_register(port, host, parent, driver, index); if (PTR_ERR(dev) != -ENODEV) { /* Skip creating cdev if we registered a serdev device */ return dev; } return tty_register_device_attr(driver, index, parent, drvdata, attr_grp); } EXPORT_SYMBOL_GPL(tty_port_register_device_attr_serdev); /** * tty_port_register_device_serdev - register tty or serdev device * @port: tty_port of the device * @driver: tty_driver for this device * @index: index of the tty * @host: serial port hardware controller device * @parent: parent if exists, otherwise NULL * * Register a serdev or tty device depending on if the parent device has any * defined serdev clients or not. */ struct device *tty_port_register_device_serdev(struct tty_port *port, struct tty_driver *driver, unsigned index, struct device *host, struct device *parent) { return tty_port_register_device_attr_serdev(port, driver, index, host, parent, NULL, NULL); } EXPORT_SYMBOL_GPL(tty_port_register_device_serdev); /** * tty_port_unregister_device - deregister a tty or serdev device * @port: tty_port of the device * @driver: tty_driver for this device * @index: index of the tty * * If a tty or serdev device is registered with a call to * tty_port_register_device_serdev() then this function must be called when * the device is gone. 
*/ void tty_port_unregister_device(struct tty_port *port, struct tty_driver *driver, unsigned index) { int ret; ret = serdev_tty_port_unregister(port); if (ret == 0) return; tty_unregister_device(driver, index); } EXPORT_SYMBOL_GPL(tty_port_unregister_device); int tty_port_alloc_xmit_buf(struct tty_port *port) { /* We may sleep in get_zeroed_page() */ mutex_lock(&port->buf_mutex); if (port->xmit_buf == NULL) { port->xmit_buf = (u8 *)get_zeroed_page(GFP_KERNEL); if (port->xmit_buf) kfifo_init(&port->xmit_fifo, port->xmit_buf, PAGE_SIZE); } mutex_unlock(&port->buf_mutex); if (port->xmit_buf == NULL) return -ENOMEM; return 0; } EXPORT_SYMBOL(tty_port_alloc_xmit_buf); void tty_port_free_xmit_buf(struct tty_port *port) { mutex_lock(&port->buf_mutex); free_page((unsigned long)port->xmit_buf); port->xmit_buf = NULL; INIT_KFIFO(port->xmit_fifo); mutex_unlock(&port->buf_mutex); } EXPORT_SYMBOL(tty_port_free_xmit_buf); /** * tty_port_destroy - destroy inited port * @port: tty port to be destroyed * * When a port was initialized using tty_port_init(), one has to destroy the * port by this function. Either indirectly by using &tty_port refcounting * (tty_port_put()) or directly if refcounting is not used. */ void tty_port_destroy(struct tty_port *port) { tty_buffer_cancel_work(port); tty_buffer_free_all(port); } EXPORT_SYMBOL(tty_port_destroy); static void tty_port_destructor(struct kref *kref) { struct tty_port *port = container_of(kref, struct tty_port, kref); /* check if last port ref was dropped before tty release */ if (WARN_ON(port->itty)) return; free_page((unsigned long)port->xmit_buf); tty_port_destroy(port); if (port->ops && port->ops->destruct) port->ops->destruct(port); else kfree(port); } /** * tty_port_put - drop a reference to tty_port * @port: port to drop a reference of (can be NULL) * * The final put will destroy and free up the @port using * @port->ops->destruct() hook, or using kfree() if not provided. */ void tty_port_put(struct tty_port *port) { if (port) kref_put(&port->kref, tty_port_destructor); } EXPORT_SYMBOL(tty_port_put); /** * tty_port_tty_get - get a tty reference * @port: tty port * * Return a refcount protected tty instance or %NULL if the port is not * associated with a tty (eg due to close or hangup). */ struct tty_struct *tty_port_tty_get(struct tty_port *port) { unsigned long flags; struct tty_struct *tty; spin_lock_irqsave(&port->lock, flags); tty = tty_kref_get(port->tty); spin_unlock_irqrestore(&port->lock, flags); return tty; } EXPORT_SYMBOL(tty_port_tty_get); /** * tty_port_tty_set - set the tty of a port * @port: tty port * @tty: the tty * * Associate the port and tty pair. Manages any internal refcounts. Pass %NULL * to deassociate a port. */ void tty_port_tty_set(struct tty_port *port, struct tty_struct *tty) { unsigned long flags; spin_lock_irqsave(&port->lock, flags); tty_kref_put(port->tty); port->tty = tty_kref_get(tty); spin_unlock_irqrestore(&port->lock, flags); } EXPORT_SYMBOL(tty_port_tty_set); /** * tty_port_shutdown - internal helper to shutdown the device * @port: tty port to be shut down * @tty: the associated tty * * It is used by tty_port_hangup() and tty_port_close(). Its task is to * shutdown the device if it was initialized (note consoles remain * functioning). It lowers DTR/RTS (if @tty has HUPCL set) and invokes * @port->ops->shutdown(). 
*/ static void tty_port_shutdown(struct tty_port *port, struct tty_struct *tty) { mutex_lock(&port->mutex); if (port->console) goto out; if (tty_port_initialized(port)) { tty_port_set_initialized(port, false); /* * Drop DTR/RTS if HUPCL is set. This causes any attached * modem to hang up the line. */ if (tty && C_HUPCL(tty)) tty_port_lower_dtr_rts(port); if (port->ops->shutdown) port->ops->shutdown(port); } out: mutex_unlock(&port->mutex); } /** * tty_port_hangup - hangup helper * @port: tty port * * Perform port level tty hangup flag and count changes. Drop the tty * reference. * * Caller holds tty lock. */ void tty_port_hangup(struct tty_port *port) { struct tty_struct *tty; unsigned long flags; spin_lock_irqsave(&port->lock, flags); port->count = 0; tty = port->tty; if (tty) set_bit(TTY_IO_ERROR, &tty->flags); port->tty = NULL; spin_unlock_irqrestore(&port->lock, flags); tty_port_set_active(port, false); tty_port_shutdown(port, tty); tty_kref_put(tty); wake_up_interruptible(&port->open_wait); wake_up_interruptible(&port->delta_msr_wait); } EXPORT_SYMBOL(tty_port_hangup); /** * tty_port_tty_hangup - helper to hang up a tty * @port: tty port * @check_clocal: hang only ttys with %CLOCAL unset? */ void tty_port_tty_hangup(struct tty_port *port, bool check_clocal) { struct tty_struct *tty = tty_port_tty_get(port); if (tty && (!check_clocal || !C_CLOCAL(tty))) tty_hangup(tty); tty_kref_put(tty); } EXPORT_SYMBOL_GPL(tty_port_tty_hangup); /** * tty_port_tty_wakeup - helper to wake up a tty * @port: tty port */ void tty_port_tty_wakeup(struct tty_port *port) { port->client_ops->write_wakeup(port); } EXPORT_SYMBOL_GPL(tty_port_tty_wakeup); /** * tty_port_carrier_raised - carrier raised check * @port: tty port * * Wrapper for the carrier detect logic. For the moment this is used * to hide some internal details. This will eventually become entirely * internal to the tty port. */ bool tty_port_carrier_raised(struct tty_port *port) { if (port->ops->carrier_raised == NULL) return true; return port->ops->carrier_raised(port); } EXPORT_SYMBOL(tty_port_carrier_raised); /** * tty_port_raise_dtr_rts - Raise DTR/RTS * @port: tty port * * Wrapper for the DTR/RTS raise logic. For the moment this is used to hide * some internal details. This will eventually become entirely internal to the * tty port. */ void tty_port_raise_dtr_rts(struct tty_port *port) { if (port->ops->dtr_rts) port->ops->dtr_rts(port, true); } EXPORT_SYMBOL(tty_port_raise_dtr_rts); /** * tty_port_lower_dtr_rts - Lower DTR/RTS * @port: tty port * * Wrapper for the DTR/RTS raise logic. For the moment this is used to hide * some internal details. This will eventually become entirely internal to the * tty port. */ void tty_port_lower_dtr_rts(struct tty_port *port) { if (port->ops->dtr_rts) port->ops->dtr_rts(port, false); } EXPORT_SYMBOL(tty_port_lower_dtr_rts); /** * tty_port_block_til_ready - Waiting logic for tty open * @port: the tty port being opened * @tty: the tty device being bound * @filp: the file pointer of the opener or %NULL * * Implement the core POSIX/SuS tty behaviour when opening a tty device. * Handles: * * - hangup (both before and during) * - non blocking open * - rts/dtr/dcd * - signals * - port flags and counts * * The passed @port must implement the @port->ops->carrier_raised method if it * can do carrier detect and the @port->ops->dtr_rts method if it supports * software management of these lines. Note that the dtr/rts raise is done each * iteration as a hangup may have previously dropped them while we wait. 
* * Caller holds tty lock. * * Note: May drop and reacquire tty lock when blocking, so @tty and @port may * have changed state (eg., may have been hung up). */ int tty_port_block_til_ready(struct tty_port *port, struct tty_struct *tty, struct file *filp) { int do_clocal = 0, retval; unsigned long flags; DEFINE_WAIT(wait); /* if non-blocking mode is set we can pass directly to open unless * the port has just hung up or is in another error state. */ if (tty_io_error(tty)) { tty_port_set_active(port, true); return 0; } if (filp == NULL || (filp->f_flags & O_NONBLOCK)) { /* Indicate we are open */ if (C_BAUD(tty)) tty_port_raise_dtr_rts(port); tty_port_set_active(port, true); return 0; } if (C_CLOCAL(tty)) do_clocal = 1; /* Block waiting until we can proceed. We may need to wait for the * carrier, but we must also wait for any close that is in progress * before the next open may complete. */ retval = 0; /* The port lock protects the port counts */ spin_lock_irqsave(&port->lock, flags); port->count--; port->blocked_open++; spin_unlock_irqrestore(&port->lock, flags); while (1) { /* Indicate we are open */ if (C_BAUD(tty) && tty_port_initialized(port)) tty_port_raise_dtr_rts(port); prepare_to_wait(&port->open_wait, &wait, TASK_INTERRUPTIBLE); /* Check for a hangup or uninitialised port. * Return accordingly. */ if (tty_hung_up_p(filp) || !tty_port_initialized(port)) { if (port->flags & ASYNC_HUP_NOTIFY) retval = -EAGAIN; else retval = -ERESTARTSYS; break; } /* * Probe the carrier. For devices with no carrier detect * tty_port_carrier_raised will always return true. * Never ask drivers if CLOCAL is set, this causes troubles * on some hardware. */ if (do_clocal || tty_port_carrier_raised(port)) break; if (signal_pending(current)) { retval = -ERESTARTSYS; break; } tty_unlock(tty); schedule(); tty_lock(tty); } finish_wait(&port->open_wait, &wait); /* Update counts. A parallel hangup will have set count to zero and * we must not mess that up further. */ spin_lock_irqsave(&port->lock, flags); if (!tty_hung_up_p(filp)) port->count++; port->blocked_open--; spin_unlock_irqrestore(&port->lock, flags); if (retval == 0) tty_port_set_active(port, true); return retval; } EXPORT_SYMBOL(tty_port_block_til_ready); static void tty_port_drain_delay(struct tty_port *port, struct tty_struct *tty) { unsigned int bps = tty_get_baud_rate(tty); long timeout; if (bps > 1200) { timeout = (HZ * 10 * port->drain_delay) / bps; timeout = max_t(long, timeout, HZ / 10); } else { timeout = 2 * HZ; } schedule_timeout_interruptible(timeout); } /** * tty_port_close_start - helper for tty->ops->close, part 1/2 * @port: tty_port of the device * @tty: tty being closed * @filp: passed file pointer * * Decrements and checks open count. Flushes the port if this is the last * close. That means, dropping the data from the outpu buffer on the device and * waiting for sending logic to finish. The rest of close handling is performed * in tty_port_close_end(). * * Locking: Caller holds tty lock. 
* * Return: 1 if this is the last close, otherwise 0 */ int tty_port_close_start(struct tty_port *port, struct tty_struct *tty, struct file *filp) { unsigned long flags; if (tty_hung_up_p(filp)) return 0; spin_lock_irqsave(&port->lock, flags); if (tty->count == 1 && port->count != 1) { tty_warn(tty, "%s: tty->count = 1 port count = %d\n", __func__, port->count); port->count = 1; } if (--port->count < 0) { tty_warn(tty, "%s: bad port count (%d)\n", __func__, port->count); port->count = 0; } if (port->count) { spin_unlock_irqrestore(&port->lock, flags); return 0; } spin_unlock_irqrestore(&port->lock, flags); tty->closing = 1; if (tty_port_initialized(port)) { /* Don't block on a stalled port, just pull the chain */ if (tty->flow.tco_stopped) tty_driver_flush_buffer(tty); if (port->closing_wait != ASYNC_CLOSING_WAIT_NONE) tty_wait_until_sent(tty, port->closing_wait); if (port->drain_delay) tty_port_drain_delay(port, tty); } /* Flush the ldisc buffering */ tty_ldisc_flush(tty); /* Report to caller this is the last port reference */ return 1; } EXPORT_SYMBOL(tty_port_close_start); /** * tty_port_close_end - helper for tty->ops->close, part 2/2 * @port: tty_port of the device * @tty: tty being closed * * This is a continuation of the first part: tty_port_close_start(). This * should be called after turning off the device. It flushes the data from the * line discipline and delays the close by @port->close_delay. * * Locking: Caller holds tty lock. */ void tty_port_close_end(struct tty_port *port, struct tty_struct *tty) { unsigned long flags; tty_ldisc_flush(tty); tty->closing = 0; spin_lock_irqsave(&port->lock, flags); if (port->blocked_open) { spin_unlock_irqrestore(&port->lock, flags); if (port->close_delay) msleep_interruptible(jiffies_to_msecs(port->close_delay)); spin_lock_irqsave(&port->lock, flags); wake_up_interruptible(&port->open_wait); } spin_unlock_irqrestore(&port->lock, flags); tty_port_set_active(port, false); } EXPORT_SYMBOL(tty_port_close_end); /** * tty_port_close - generic tty->ops->close handler * @port: tty_port of the device * @tty: tty being closed * @filp: passed file pointer * * It is a generic helper to be used in driver's @tty->ops->close. It wraps a * sequence of tty_port_close_start(), tty_port_shutdown(), and * tty_port_close_end(). The latter two are called only if this is the last * close. See the respective functions for the details. * * Locking: Caller holds tty lock */ void tty_port_close(struct tty_port *port, struct tty_struct *tty, struct file *filp) { if (tty_port_close_start(port, tty, filp) == 0) return; tty_port_shutdown(port, tty); if (!port->console) set_bit(TTY_IO_ERROR, &tty->flags); tty_port_close_end(port, tty); tty_port_tty_set(port, NULL); } EXPORT_SYMBOL(tty_port_close); /** * tty_port_install - generic tty->ops->install handler * @port: tty_port of the device * @driver: tty_driver for this device * @tty: tty to be installed * * It is the same as tty_standard_install() except the provided @port is linked * to a concrete tty specified by @tty. Use this or tty_port_register_device() * (or both). Call tty_port_link_device() as a last resort. */ int tty_port_install(struct tty_port *port, struct tty_driver *driver, struct tty_struct *tty) { tty->port = port; return tty_standard_install(driver, tty); } EXPORT_SYMBOL_GPL(tty_port_install); /** * tty_port_open - generic tty->ops->open handler * @port: tty_port of the device * @tty: tty to be opened * @filp: passed file pointer * * It is a generic helper to be used in driver's @tty->ops->open. 
It activates
 * the device using @port->ops->activate if not active already, and waits for
 * the device to be ready using tty_port_block_til_ready() (e.g. raises
 * DTR/RTS and waits for carrier).
 *
 * Note that @port->ops->shutdown is not called when @port->ops->activate
 * returns an error (on the contrary, @tty->ops->close is).
 *
 * Locking: Caller holds tty lock.
 *
 * Note: may drop and reacquire tty lock (in tty_port_block_til_ready()) so
 * @tty and @port may have changed state (e.g., may be hung up now).
 */
int tty_port_open(struct tty_port *port, struct tty_struct *tty,
		  struct file *filp)
{
	spin_lock_irq(&port->lock);
	++port->count;
	spin_unlock_irq(&port->lock);
	tty_port_tty_set(port, tty);

	/*
	 * Do the device-specific open only if the hardware isn't
	 * already initialized. Serialize open and shutdown using the
	 * port mutex.
	 */
	mutex_lock(&port->mutex);

	if (!tty_port_initialized(port)) {
		clear_bit(TTY_IO_ERROR, &tty->flags);
		if (port->ops->activate) {
			int retval = port->ops->activate(port, tty);

			if (retval) {
				mutex_unlock(&port->mutex);
				return retval;
			}
		}
		tty_port_set_initialized(port, true);
	}
	mutex_unlock(&port->mutex);
	return tty_port_block_til_ready(port, tty, filp);
}
EXPORT_SYMBOL(tty_port_open);
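/*
 * Hedged usage sketch (not part of tty_port.c): how a hypothetical driver
 * might wire a tty_port into its tty_operations using the helpers above.
 * The foo_* names are illustrative only, not kernel APIs.
 */
static int foo_activate(struct tty_port *port, struct tty_struct *tty)
{
	/* Power up the hardware and enable receive here. */
	return 0;
}

static void foo_shutdown(struct tty_port *port)
{
	/* Quiesce the hardware; called on final close or hangup. */
}

static const struct tty_port_operations foo_port_ops = {
	.activate	= foo_activate,
	.shutdown	= foo_shutdown,
};

static struct tty_port foo_port;

static int foo_open(struct tty_struct *tty, struct file *filp)
{
	/* Counts the open, runs ->activate once and waits for carrier. */
	return tty_port_open(&foo_port, tty, filp);
}

static void foo_close(struct tty_struct *tty, struct file *filp)
{
	/* On the last close: ->shutdown, DTR/RTS drop (HUPCL), close delays. */
	tty_port_close(&foo_port, tty, filp);
}

static void foo_setup_port(struct tty_driver *foo_tty_driver)
{
	tty_port_init(&foo_port);
	foo_port.ops = &foo_port_ops;
	tty_port_link_device(&foo_port, foo_tty_driver, 0);
}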
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2020 Google Corporation
 */

#include <net/bluetooth/bluetooth.h>
#include <net/bluetooth/hci_core.h>
#include <net/bluetooth/mgmt.h>

#include "mgmt_util.h"
#include "mgmt_config.h"

#define HDEV_PARAM_U16(_param_name_) \
	struct {\
		struct mgmt_tlv entry; \
		__le16 value; \
	} __packed _param_name_

#define HDEV_PARAM_U8(_param_name_) \
	struct {\
		struct mgmt_tlv entry; \
		__u8 value; \
	} __packed _param_name_

#define TLV_SET_U16(_param_code_, _param_name_) \
	{ \
		{ cpu_to_le16(_param_code_), sizeof(__u16) }, \
		cpu_to_le16(hdev->_param_name_) \
	}

#define TLV_SET_U8(_param_code_, _param_name_) \
	{ \
		{ cpu_to_le16(_param_code_), sizeof(__u8) }, \
		hdev->_param_name_ \
	}

#define TLV_SET_U16_JIFFIES_TO_MSECS(_param_code_, _param_name_) \
	{ \
		{ cpu_to_le16(_param_code_), sizeof(__u16) }, \
		cpu_to_le16(jiffies_to_msecs(hdev->_param_name_)) \
	}

int read_def_system_config(struct sock *sk, struct hci_dev *hdev, void *data,
			   u16 data_len)
{
	int ret;
	struct mgmt_rp_read_def_system_config {
		/* Please see mgmt-api.txt for documentation of these values */
		HDEV_PARAM_U16(def_page_scan_type);
		HDEV_PARAM_U16(def_page_scan_int);
		HDEV_PARAM_U16(def_page_scan_window);
		HDEV_PARAM_U16(def_inq_scan_type);
		HDEV_PARAM_U16(def_inq_scan_int);
		HDEV_PARAM_U16(def_inq_scan_window);
		HDEV_PARAM_U16(def_br_lsto);
		HDEV_PARAM_U16(def_page_timeout);
		HDEV_PARAM_U16(sniff_min_interval);
		HDEV_PARAM_U16(sniff_max_interval);
		HDEV_PARAM_U16(le_adv_min_interval);
		HDEV_PARAM_U16(le_adv_max_interval);
		HDEV_PARAM_U16(def_multi_adv_rotation_duration);
		HDEV_PARAM_U16(le_scan_interval);
		HDEV_PARAM_U16(le_scan_window);
		HDEV_PARAM_U16(le_scan_int_suspend);
		HDEV_PARAM_U16(le_scan_window_suspend);
		HDEV_PARAM_U16(le_scan_int_discovery);
		HDEV_PARAM_U16(le_scan_window_discovery);
		HDEV_PARAM_U16(le_scan_int_adv_monitor);
		HDEV_PARAM_U16(le_scan_window_adv_monitor);
		HDEV_PARAM_U16(le_scan_int_connect);
		HDEV_PARAM_U16(le_scan_window_connect);
		HDEV_PARAM_U16(le_conn_min_interval);
		HDEV_PARAM_U16(le_conn_max_interval);
		HDEV_PARAM_U16(le_conn_latency);
		HDEV_PARAM_U16(le_supv_timeout);
		HDEV_PARAM_U16(def_le_autoconnect_timeout);
		HDEV_PARAM_U16(advmon_allowlist_duration);
		HDEV_PARAM_U16(advmon_no_filter_duration);
HDEV_PARAM_U8(enable_advmon_interleave_scan); } __packed rp = { TLV_SET_U16(0x0000, def_page_scan_type), TLV_SET_U16(0x0001, def_page_scan_int), TLV_SET_U16(0x0002, def_page_scan_window), TLV_SET_U16(0x0003, def_inq_scan_type), TLV_SET_U16(0x0004, def_inq_scan_int), TLV_SET_U16(0x0005, def_inq_scan_window), TLV_SET_U16(0x0006, def_br_lsto), TLV_SET_U16(0x0007, def_page_timeout), TLV_SET_U16(0x0008, sniff_min_interval), TLV_SET_U16(0x0009, sniff_max_interval), TLV_SET_U16(0x000a, le_adv_min_interval), TLV_SET_U16(0x000b, le_adv_max_interval), TLV_SET_U16(0x000c, def_multi_adv_rotation_duration), TLV_SET_U16(0x000d, le_scan_interval), TLV_SET_U16(0x000e, le_scan_window), TLV_SET_U16(0x000f, le_scan_int_suspend), TLV_SET_U16(0x0010, le_scan_window_suspend), TLV_SET_U16(0x0011, le_scan_int_discovery), TLV_SET_U16(0x0012, le_scan_window_discovery), TLV_SET_U16(0x0013, le_scan_int_adv_monitor), TLV_SET_U16(0x0014, le_scan_window_adv_monitor), TLV_SET_U16(0x0015, le_scan_int_connect), TLV_SET_U16(0x0016, le_scan_window_connect), TLV_SET_U16(0x0017, le_conn_min_interval), TLV_SET_U16(0x0018, le_conn_max_interval), TLV_SET_U16(0x0019, le_conn_latency), TLV_SET_U16(0x001a, le_supv_timeout), TLV_SET_U16_JIFFIES_TO_MSECS(0x001b, def_le_autoconnect_timeout), TLV_SET_U16(0x001d, advmon_allowlist_duration), TLV_SET_U16(0x001e, advmon_no_filter_duration), TLV_SET_U8(0x001f, enable_advmon_interleave_scan), }; bt_dev_dbg(hdev, "sock %p", sk); ret = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_READ_DEF_SYSTEM_CONFIG, 0, &rp, sizeof(rp)); return ret; } #define TO_TLV(x) ((struct mgmt_tlv *)(x)) #define TLV_GET_LE16(tlv) le16_to_cpu(*((__le16 *)(TO_TLV(tlv)->value))) #define TLV_GET_U8(tlv) (*((__u8 *)(TO_TLV(tlv)->value))) int set_def_system_config(struct sock *sk, struct hci_dev *hdev, void *data, u16 data_len) { u16 buffer_left = data_len; u8 *buffer = data; if (buffer_left < sizeof(struct mgmt_tlv)) { return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_DEF_SYSTEM_CONFIG, MGMT_STATUS_INVALID_PARAMS); } /* First pass to validate the tlv */ while (buffer_left >= sizeof(struct mgmt_tlv)) { const u8 len = TO_TLV(buffer)->length; size_t exp_type_len; const u16 exp_len = sizeof(struct mgmt_tlv) + len; const u16 type = le16_to_cpu(TO_TLV(buffer)->type); if (buffer_left < exp_len) { bt_dev_warn(hdev, "invalid len left %u, exp >= %u", buffer_left, exp_len); return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_DEF_SYSTEM_CONFIG, MGMT_STATUS_INVALID_PARAMS); } /* Please see mgmt-api.txt for documentation of these values */ switch (type) { case 0x0000: case 0x0001: case 0x0002: case 0x0003: case 0x0004: case 0x0005: case 0x0006: case 0x0007: case 0x0008: case 0x0009: case 0x000a: case 0x000b: case 0x000c: case 0x000d: case 0x000e: case 0x000f: case 0x0010: case 0x0011: case 0x0012: case 0x0013: case 0x0014: case 0x0015: case 0x0016: case 0x0017: case 0x0018: case 0x0019: case 0x001a: case 0x001b: case 0x001d: case 0x001e: exp_type_len = sizeof(u16); break; case 0x001f: exp_type_len = sizeof(u8); break; default: exp_type_len = 0; bt_dev_warn(hdev, "unsupported parameter %u", type); break; } if (exp_type_len && len != exp_type_len) { bt_dev_warn(hdev, "invalid length %d, exp %zu for type %u", len, exp_type_len, type); return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_DEF_SYSTEM_CONFIG, MGMT_STATUS_INVALID_PARAMS); } buffer_left -= exp_len; buffer += exp_len; } buffer_left = data_len; buffer = data; while (buffer_left >= sizeof(struct mgmt_tlv)) { const u8 len = TO_TLV(buffer)->length; const u16 exp_len = sizeof(struct mgmt_tlv) + 
len; const u16 type = le16_to_cpu(TO_TLV(buffer)->type); switch (type) { case 0x0000: hdev->def_page_scan_type = TLV_GET_LE16(buffer); break; case 0x0001: hdev->def_page_scan_int = TLV_GET_LE16(buffer); break; case 0x0002: hdev->def_page_scan_window = TLV_GET_LE16(buffer); break; case 0x0003: hdev->def_inq_scan_type = TLV_GET_LE16(buffer); break; case 0x0004: hdev->def_inq_scan_int = TLV_GET_LE16(buffer); break; case 0x0005: hdev->def_inq_scan_window = TLV_GET_LE16(buffer); break; case 0x0006: hdev->def_br_lsto = TLV_GET_LE16(buffer); break; case 0x0007: hdev->def_page_timeout = TLV_GET_LE16(buffer); break; case 0x0008: hdev->sniff_min_interval = TLV_GET_LE16(buffer); break; case 0x0009: hdev->sniff_max_interval = TLV_GET_LE16(buffer); break; case 0x000a: hdev->le_adv_min_interval = TLV_GET_LE16(buffer); break; case 0x000b: hdev->le_adv_max_interval = TLV_GET_LE16(buffer); break; case 0x000c: hdev->def_multi_adv_rotation_duration = TLV_GET_LE16(buffer); break; case 0x000d: hdev->le_scan_interval = TLV_GET_LE16(buffer); break; case 0x000e: hdev->le_scan_window = TLV_GET_LE16(buffer); break; case 0x000f: hdev->le_scan_int_suspend = TLV_GET_LE16(buffer); break; case 0x0010: hdev->le_scan_window_suspend = TLV_GET_LE16(buffer); break; case 0x0011: hdev->le_scan_int_discovery = TLV_GET_LE16(buffer); break; case 0x00012: hdev->le_scan_window_discovery = TLV_GET_LE16(buffer); break; case 0x00013: hdev->le_scan_int_adv_monitor = TLV_GET_LE16(buffer); break; case 0x00014: hdev->le_scan_window_adv_monitor = TLV_GET_LE16(buffer); break; case 0x00015: hdev->le_scan_int_connect = TLV_GET_LE16(buffer); break; case 0x00016: hdev->le_scan_window_connect = TLV_GET_LE16(buffer); break; case 0x00017: hdev->le_conn_min_interval = TLV_GET_LE16(buffer); break; case 0x00018: hdev->le_conn_max_interval = TLV_GET_LE16(buffer); break; case 0x00019: hdev->le_conn_latency = TLV_GET_LE16(buffer); break; case 0x0001a: hdev->le_supv_timeout = TLV_GET_LE16(buffer); break; case 0x0001b: hdev->def_le_autoconnect_timeout = msecs_to_jiffies(TLV_GET_LE16(buffer)); break; case 0x0001d: hdev->advmon_allowlist_duration = TLV_GET_LE16(buffer); break; case 0x0001e: hdev->advmon_no_filter_duration = TLV_GET_LE16(buffer); break; case 0x0001f: hdev->enable_advmon_interleave_scan = TLV_GET_U8(buffer); break; default: bt_dev_warn(hdev, "unsupported parameter %u", type); break; } buffer_left -= exp_len; buffer += exp_len; } return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_SET_DEF_SYSTEM_CONFIG, 0, NULL, 0); } int read_def_runtime_config(struct sock *sk, struct hci_dev *hdev, void *data, u16 data_len) { bt_dev_dbg(hdev, "sock %p", sk); return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_READ_DEF_RUNTIME_CONFIG, 0, NULL, 0); } int set_def_runtime_config(struct sock *sk, struct hci_dev *hdev, void *data, u16 data_len) { bt_dev_dbg(hdev, "sock %p", sk); return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_DEF_SYSTEM_CONFIG, MGMT_STATUS_INVALID_PARAMS); }
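/*
 * Hedged illustration (not part of mgmt_config.c): the Set Default System
 * Configuration payload is a packed stream of TLVs - a struct mgmt_tlv
 * header (__le16 type, __u8 length, 3 bytes when packed, as the
 * initializers above imply) followed by 'length' value bytes. The buffer
 * below encodes a single entry setting parameter 0x0000
 * (def_page_scan_type) to 0x0001, and would decode with the same
 * TO_TLV()/TLV_GET_LE16() accessors used above.
 */
static const u8 example_def_sys_config_tlv[] = {
	0x00, 0x00,	/* type 0x0000, little endian */
	0x02,		/* length == sizeof(__u16) */
	0x01, 0x00,	/* value 0x0001, little endian */
};

/* e.g. TLV_GET_LE16(example_def_sys_config_tlv) would yield 0x0001 */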
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * netprio_cgroup.h		Control Group Priority set
 *
 * Authors:	Neil Horman <nhorman@tuxdriver.com>
 */

#ifndef _NETPRIO_CGROUP_H
#define _NETPRIO_CGROUP_H

#include <linux/cgroup.h>
#include <linux/hardirq.h>
#include <linux/rcupdate.h>

#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
struct netprio_map {
	struct rcu_head rcu;
	u32 priomap_len;
	u32 priomap[];
};

static inline u32 task_netprioidx(struct task_struct *p)
{
	struct cgroup_subsys_state *css;
	u32 idx;

	rcu_read_lock();
	css = task_css(p, net_prio_cgrp_id);
	idx = css->id;
	rcu_read_unlock();
	return idx;
}

static inline void sock_update_netprioidx(struct sock_cgroup_data *skcd)
{
	if (in_interrupt())
		return;

	sock_cgroup_set_prioidx(skcd, task_netprioidx(current));
}

#else /* !CONFIG_CGROUP_NET_PRIO */

static inline u32 task_netprioidx(struct task_struct *p)
{
	return 0;
}

static inline void sock_update_netprioidx(struct sock_cgroup_data *skcd)
{
}

#endif /* CONFIG_CGROUP_NET_PRIO */
#endif /* _NETPRIO_CGROUP_H */
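/*
 * Hedged sketch (not part of this header): how the priomap above is
 * typically consumed on the transmit path, where the per-netdevice map is
 * indexed by the cgroup prioidx obtained from task_netprioidx() or
 * sock_cgroup_prioidx(). 'example_prio_for' is illustrative only.
 */
static inline u32 example_prio_for(const struct netprio_map *map, u32 prioidx)
{
	/* Indices beyond priomap_len keep the default priority of 0. */
	if (!map || prioidx >= map->priomap_len)
		return 0;
	return map->priomap[prioidx];
}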
// SPDX-License-Identifier: GPL-2.0
/*
 * Helpers for IOMMU drivers implementing SVA
 */
#include <linux/mmu_context.h>
#include <linux/mutex.h>
#include <linux/sched/mm.h>
#include <linux/iommu.h>

#include "iommu-priv.h"

static DEFINE_MUTEX(iommu_sva_lock);
static struct iommu_domain *iommu_sva_domain_alloc(struct device *dev,
						   struct mm_struct *mm);

/* Allocate a PASID for the mm within range (inclusive) */
static struct iommu_mm_data *iommu_alloc_mm_data(struct mm_struct *mm, struct device *dev)
{
	struct iommu_mm_data *iommu_mm;
	ioasid_t pasid;

	lockdep_assert_held(&iommu_sva_lock);

	if (!arch_pgtable_dma_compat(mm))
		return ERR_PTR(-EBUSY);

	iommu_mm = mm->iommu_mm;
	/* Is a PASID already associated with this mm? */
	if (iommu_mm) {
		if (iommu_mm->pasid >= dev->iommu->max_pasids)
			return ERR_PTR(-EOVERFLOW);
		return iommu_mm;
	}

	iommu_mm = kzalloc(sizeof(struct iommu_mm_data), GFP_KERNEL);
	if (!iommu_mm)
		return ERR_PTR(-ENOMEM);

	pasid = iommu_alloc_global_pasid(dev);
	if (pasid == IOMMU_PASID_INVALID) {
		kfree(iommu_mm);
		return ERR_PTR(-ENOSPC);
	}
	iommu_mm->pasid = pasid;
	INIT_LIST_HEAD(&iommu_mm->sva_domains);
	/*
	 * Make sure the write to mm->iommu_mm is not reordered in front of
	 * initialization to iommu_mm fields. If it does, readers may see a
	 * valid iommu_mm with uninitialized values.
	 */
	smp_store_release(&mm->iommu_mm, iommu_mm);
	return iommu_mm;
}

/**
 * iommu_sva_bind_device() - Bind a process address space to a device
 * @dev: the device
 * @mm: the mm to bind, caller must hold a reference to mm_users
 *
 * Create a bond between device and address space, allowing the device to
 * access the mm using the PASID returned by iommu_sva_get_pasid(). If a
 * bond already exists between @dev and @mm, an additional internal
 * reference is taken. Caller must call iommu_sva_unbind_device()
 * to release each reference.
 *
 * iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_SVA) must be called first, to
 * initialize the required SVA features.
 *
 * On error, returns an ERR_PTR value.
 */
struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm)
{
	struct iommu_group *group = dev->iommu_group;
	struct iommu_attach_handle *attach_handle;
	struct iommu_mm_data *iommu_mm;
	struct iommu_domain *domain;
	struct iommu_sva *handle;
	int ret;

	if (!group)
		return ERR_PTR(-ENODEV);

	mutex_lock(&iommu_sva_lock);

	/* Allocate mm->pasid if necessary.
*/ iommu_mm = iommu_alloc_mm_data(mm, dev); if (IS_ERR(iommu_mm)) { ret = PTR_ERR(iommu_mm); goto out_unlock; } /* A bond already exists, just take a reference`. */ attach_handle = iommu_attach_handle_get(group, iommu_mm->pasid, IOMMU_DOMAIN_SVA); if (!IS_ERR(attach_handle)) { handle = container_of(attach_handle, struct iommu_sva, handle); if (attach_handle->domain->mm != mm) { ret = -EBUSY; goto out_unlock; } refcount_inc(&handle->users); mutex_unlock(&iommu_sva_lock); return handle; } if (PTR_ERR(attach_handle) != -ENOENT) { ret = PTR_ERR(attach_handle); goto out_unlock; } handle = kzalloc(sizeof(*handle), GFP_KERNEL); if (!handle) { ret = -ENOMEM; goto out_unlock; } /* Search for an existing domain. */ list_for_each_entry(domain, &mm->iommu_mm->sva_domains, next) { ret = iommu_attach_device_pasid(domain, dev, iommu_mm->pasid, &handle->handle); if (!ret) { domain->users++; goto out; } } /* Allocate a new domain and set it on device pasid. */ domain = iommu_sva_domain_alloc(dev, mm); if (IS_ERR(domain)) { ret = PTR_ERR(domain); goto out_free_handle; } ret = iommu_attach_device_pasid(domain, dev, iommu_mm->pasid, &handle->handle); if (ret) goto out_free_domain; domain->users = 1; list_add(&domain->next, &mm->iommu_mm->sva_domains); out: refcount_set(&handle->users, 1); mutex_unlock(&iommu_sva_lock); handle->dev = dev; return handle; out_free_domain: iommu_domain_free(domain); out_free_handle: kfree(handle); out_unlock: mutex_unlock(&iommu_sva_lock); return ERR_PTR(ret); } EXPORT_SYMBOL_GPL(iommu_sva_bind_device); /** * iommu_sva_unbind_device() - Remove a bond created with iommu_sva_bind_device * @handle: the handle returned by iommu_sva_bind_device() * * Put reference to a bond between device and address space. The device should * not be issuing any more transaction for this PASID. All outstanding page * requests for this PASID must have been flushed to the IOMMU. 
*/ void iommu_sva_unbind_device(struct iommu_sva *handle) { struct iommu_domain *domain = handle->handle.domain; struct iommu_mm_data *iommu_mm = domain->mm->iommu_mm; struct device *dev = handle->dev; mutex_lock(&iommu_sva_lock); if (!refcount_dec_and_test(&handle->users)) { mutex_unlock(&iommu_sva_lock); return; } iommu_detach_device_pasid(domain, dev, iommu_mm->pasid); if (--domain->users == 0) { list_del(&domain->next); iommu_domain_free(domain); } mutex_unlock(&iommu_sva_lock); kfree(handle); } EXPORT_SYMBOL_GPL(iommu_sva_unbind_device); u32 iommu_sva_get_pasid(struct iommu_sva *handle) { struct iommu_domain *domain = handle->handle.domain; return mm_get_enqcmd_pasid(domain->mm); } EXPORT_SYMBOL_GPL(iommu_sva_get_pasid); void mm_pasid_drop(struct mm_struct *mm) { struct iommu_mm_data *iommu_mm = mm->iommu_mm; if (!iommu_mm) return; iommu_free_global_pasid(iommu_mm->pasid); kfree(iommu_mm); } /* * I/O page fault handler for SVA */ static enum iommu_page_response_code iommu_sva_handle_mm(struct iommu_fault *fault, struct mm_struct *mm) { vm_fault_t ret; struct vm_area_struct *vma; unsigned int access_flags = 0; unsigned int fault_flags = FAULT_FLAG_REMOTE; struct iommu_fault_page_request *prm = &fault->prm; enum iommu_page_response_code status = IOMMU_PAGE_RESP_INVALID; if (!(prm->flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID)) return status; if (!mmget_not_zero(mm)) return status; mmap_read_lock(mm); vma = vma_lookup(mm, prm->addr); if (!vma) /* Unmapped area */ goto out_put_mm; if (prm->perm & IOMMU_FAULT_PERM_READ) access_flags |= VM_READ; if (prm->perm & IOMMU_FAULT_PERM_WRITE) { access_flags |= VM_WRITE; fault_flags |= FAULT_FLAG_WRITE; } if (prm->perm & IOMMU_FAULT_PERM_EXEC) { access_flags |= VM_EXEC; fault_flags |= FAULT_FLAG_INSTRUCTION; } if (!(prm->perm & IOMMU_FAULT_PERM_PRIV)) fault_flags |= FAULT_FLAG_USER; if (access_flags & ~vma->vm_flags) /* Access fault */ goto out_put_mm; ret = handle_mm_fault(vma, prm->addr, fault_flags, NULL); status = ret & VM_FAULT_ERROR ? IOMMU_PAGE_RESP_INVALID : IOMMU_PAGE_RESP_SUCCESS; out_put_mm: mmap_read_unlock(mm); mmput(mm); return status; } static void iommu_sva_handle_iopf(struct work_struct *work) { struct iopf_fault *iopf; struct iopf_group *group; enum iommu_page_response_code status = IOMMU_PAGE_RESP_SUCCESS; group = container_of(work, struct iopf_group, work); list_for_each_entry(iopf, &group->faults, list) { /* * For the moment, errors are sticky: don't handle subsequent * faults in the group if there is an error. */ if (status != IOMMU_PAGE_RESP_SUCCESS) break; status = iommu_sva_handle_mm(&iopf->fault, group->attach_handle->domain->mm); } iopf_group_response(group, status); iopf_free_group(group); } static int iommu_sva_iopf_handler(struct iopf_group *group) { struct iommu_fault_param *fault_param = group->fault_param; INIT_WORK(&group->work, iommu_sva_handle_iopf); if (!queue_work(fault_param->queue->wq, &group->work)) return -EBUSY; return 0; } static struct iommu_domain *iommu_sva_domain_alloc(struct device *dev, struct mm_struct *mm) { const struct iommu_ops *ops = dev_iommu_ops(dev); struct iommu_domain *domain; if (ops->domain_alloc_sva) { domain = ops->domain_alloc_sva(dev, mm); if (IS_ERR(domain)) return domain; } else { domain = ops->domain_alloc(IOMMU_DOMAIN_SVA); if (!domain) return ERR_PTR(-ENOMEM); } domain->type = IOMMU_DOMAIN_SVA; mmgrab(mm); domain->mm = mm; domain->owner = ops; domain->iopf_handler = iommu_sva_iopf_handler; return domain; }
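/*
 * Hedged usage sketch (not part of iommu-sva.c): how a PASID-capable driver
 * might consume the SVA API above, assuming IOMMU_DEV_FEAT_SVA has already
 * been enabled as the iommu_sva_bind_device() kerneldoc requires. The
 * foo_* names are illustrative only.
 */
struct foo_sva_ctx {
	struct iommu_sva *handle;
	u32 pasid;
};

static int foo_sva_enable(struct device *dev, struct foo_sva_ctx *ctx)
{
	/* Bind the calling process' address space; adds a reference. */
	ctx->handle = iommu_sva_bind_device(dev, current->mm);
	if (IS_ERR(ctx->handle))
		return PTR_ERR(ctx->handle);

	/* Program this PASID into the device's hardware context. */
	ctx->pasid = iommu_sva_get_pasid(ctx->handle);
	return 0;
}

static void foo_sva_disable(struct foo_sva_ctx *ctx)
{
	/*
	 * The device must have stopped issuing transactions and flushed
	 * outstanding page requests for this PASID before this point.
	 */
	iommu_sva_unbind_device(ctx->handle);
}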
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/etherdevice.h>
#include <linux/if_tap.h>
#include <linux/if_vlan.h>
#include <linux/interrupt.h>
#include <linux/nsproxy.h>
#include <linux/compat.h>
#include <linux/if_tun.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/cache.h>
#include <linux/sched/signal.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/wait.h>
#include <linux/cdev.h>
#include <linux/idr.h>
#include <linux/fs.h>
#include <linux/uio.h>

#include <net/gso.h>
#include <net/net_namespace.h>
#include <net/rtnetlink.h>
#include <net/sock.h>
#include <net/xdp.h>
#include <linux/virtio_net.h>
#include <linux/skb_array.h>

#define TAP_IFFEATURES (IFF_VNET_HDR | IFF_MULTI_QUEUE)

#define TAP_VNET_LE 0x80000000
#define TAP_VNET_BE 0x40000000

#ifdef CONFIG_TUN_VNET_CROSS_LE
static inline bool tap_legacy_is_little_endian(struct tap_queue *q)
{
	return q->flags & TAP_VNET_BE ?
false : virtio_legacy_is_little_endian(); } static long tap_get_vnet_be(struct tap_queue *q, int __user *sp) { int s = !!(q->flags & TAP_VNET_BE); if (put_user(s, sp)) return -EFAULT; return 0; } static long tap_set_vnet_be(struct tap_queue *q, int __user *sp) { int s; if (get_user(s, sp)) return -EFAULT; if (s) q->flags |= TAP_VNET_BE; else q->flags &= ~TAP_VNET_BE; return 0; } #else static inline bool tap_legacy_is_little_endian(struct tap_queue *q) { return virtio_legacy_is_little_endian(); } static long tap_get_vnet_be(struct tap_queue *q, int __user *argp) { return -EINVAL; } static long tap_set_vnet_be(struct tap_queue *q, int __user *argp) { return -EINVAL; } #endif /* CONFIG_TUN_VNET_CROSS_LE */ static inline bool tap_is_little_endian(struct tap_queue *q) { return q->flags & TAP_VNET_LE || tap_legacy_is_little_endian(q); } static inline u16 tap16_to_cpu(struct tap_queue *q, __virtio16 val) { return __virtio16_to_cpu(tap_is_little_endian(q), val); } static inline __virtio16 cpu_to_tap16(struct tap_queue *q, u16 val) { return __cpu_to_virtio16(tap_is_little_endian(q), val); } static struct proto tap_proto = { .name = "tap", .owner = THIS_MODULE, .obj_size = sizeof(struct tap_queue), }; #define TAP_NUM_DEVS (1U << MINORBITS) static LIST_HEAD(major_list); struct major_info { struct rcu_head rcu; dev_t major; struct idr minor_idr; spinlock_t minor_lock; const char *device_name; struct list_head next; }; #define GOODCOPY_LEN 128 static const struct proto_ops tap_socket_ops; #define RX_OFFLOADS (NETIF_F_GRO | NETIF_F_LRO) #define TAP_FEATURES (NETIF_F_GSO | NETIF_F_SG | NETIF_F_FRAGLIST) static struct tap_dev *tap_dev_get_rcu(const struct net_device *dev) { return rcu_dereference(dev->rx_handler_data); } /* * RCU usage: * The tap_queue and the macvlan_dev are loosely coupled, the * pointers from one to the other can only be read while rcu_read_lock * or rtnl is held. * * Both the file and the macvlan_dev hold a reference on the tap_queue * through sock_hold(&q->sk). When the macvlan_dev goes away first, * q->vlan becomes inaccessible. When the files gets closed, * tap_get_queue() fails. * * There may still be references to the struct sock inside of the * queue from outbound SKBs, but these never reference back to the * file or the dev. The data structure is freed through __sk_free * when both our references and any pending SKBs are gone. 
*/ static int tap_enable_queue(struct tap_dev *tap, struct file *file, struct tap_queue *q) { int err = -EINVAL; ASSERT_RTNL(); if (q->enabled) goto out; err = 0; rcu_assign_pointer(tap->taps[tap->numvtaps], q); q->queue_index = tap->numvtaps; q->enabled = true; tap->numvtaps++; out: return err; } /* Requires RTNL */ static int tap_set_queue(struct tap_dev *tap, struct file *file, struct tap_queue *q) { if (tap->numqueues == MAX_TAP_QUEUES) return -EBUSY; rcu_assign_pointer(q->tap, tap); rcu_assign_pointer(tap->taps[tap->numvtaps], q); sock_hold(&q->sk); q->file = file; q->queue_index = tap->numvtaps; q->enabled = true; file->private_data = q; list_add_tail(&q->next, &tap->queue_list); tap->numvtaps++; tap->numqueues++; return 0; } static int tap_disable_queue(struct tap_queue *q) { struct tap_dev *tap; struct tap_queue *nq; ASSERT_RTNL(); if (!q->enabled) return -EINVAL; tap = rtnl_dereference(q->tap); if (tap) { int index = q->queue_index; BUG_ON(index >= tap->numvtaps); nq = rtnl_dereference(tap->taps[tap->numvtaps - 1]); nq->queue_index = index; rcu_assign_pointer(tap->taps[index], nq); RCU_INIT_POINTER(tap->taps[tap->numvtaps - 1], NULL); q->enabled = false; tap->numvtaps--; } return 0; } /* * The file owning the queue got closed, give up both * the reference that the files holds as well as the * one from the macvlan_dev if that still exists. * * Using the spinlock makes sure that we don't get * to the queue again after destroying it. */ static void tap_put_queue(struct tap_queue *q) { struct tap_dev *tap; rtnl_lock(); tap = rtnl_dereference(q->tap); if (tap) { if (q->enabled) BUG_ON(tap_disable_queue(q)); tap->numqueues--; RCU_INIT_POINTER(q->tap, NULL); sock_put(&q->sk); list_del_init(&q->next); } rtnl_unlock(); synchronize_rcu(); sock_put(&q->sk); } /* * Select a queue based on the rxq of the device on which this packet * arrived. If the incoming device is not mq, calculate a flow hash * to select a queue. If all fails, find the first available queue. * Cache vlan->numvtaps since it can become zero during the execution * of this function. */ static struct tap_queue *tap_get_queue(struct tap_dev *tap, struct sk_buff *skb) { struct tap_queue *queue = NULL; /* Access to taps array is protected by rcu, but access to numvtaps * isn't. Below we use it to lookup a queue, but treat it as a hint * and validate that the result isn't NULL - in case we are * racing against queue removal. */ int numvtaps = READ_ONCE(tap->numvtaps); __u32 rxq; if (!numvtaps) goto out; if (numvtaps == 1) goto single; /* Check if we can use flow to select a queue */ rxq = skb_get_hash(skb); if (rxq) { queue = rcu_dereference(tap->taps[rxq % numvtaps]); goto out; } if (likely(skb_rx_queue_recorded(skb))) { rxq = skb_get_rx_queue(skb); while (unlikely(rxq >= numvtaps)) rxq -= numvtaps; queue = rcu_dereference(tap->taps[rxq]); goto out; } single: queue = rcu_dereference(tap->taps[0]); out: return queue; } /* * The net_device is going away, give up the reference * that it holds on all queues and safely set the pointer * from the queues to NULL. 
*/ void tap_del_queues(struct tap_dev *tap) { struct tap_queue *q, *tmp; ASSERT_RTNL(); list_for_each_entry_safe(q, tmp, &tap->queue_list, next) { list_del_init(&q->next); RCU_INIT_POINTER(q->tap, NULL); if (q->enabled) tap->numvtaps--; tap->numqueues--; sock_put(&q->sk); } BUG_ON(tap->numvtaps); BUG_ON(tap->numqueues); /* guarantee that any future tap_set_queue will fail */ tap->numvtaps = MAX_TAP_QUEUES; } EXPORT_SYMBOL_GPL(tap_del_queues); rx_handler_result_t tap_handle_frame(struct sk_buff **pskb) { struct sk_buff *skb = *pskb; struct net_device *dev = skb->dev; struct tap_dev *tap; struct tap_queue *q; netdev_features_t features = TAP_FEATURES; enum skb_drop_reason drop_reason; tap = tap_dev_get_rcu(dev); if (!tap) return RX_HANDLER_PASS; q = tap_get_queue(tap, skb); if (!q) return RX_HANDLER_PASS; skb_push(skb, ETH_HLEN); /* Apply the forward feature mask so that we perform segmentation * according to users wishes. This only works if VNET_HDR is * enabled. */ if (q->flags & IFF_VNET_HDR) features |= tap->tap_features; if (netif_needs_gso(skb, features)) { struct sk_buff *segs = __skb_gso_segment(skb, features, false); struct sk_buff *next; if (IS_ERR(segs)) { drop_reason = SKB_DROP_REASON_SKB_GSO_SEG; goto drop; } if (!segs) { if (ptr_ring_produce(&q->ring, skb)) { drop_reason = SKB_DROP_REASON_FULL_RING; goto drop; } goto wake_up; } consume_skb(skb); skb_list_walk_safe(segs, skb, next) { skb_mark_not_on_list(skb); if (ptr_ring_produce(&q->ring, skb)) { drop_reason = SKB_DROP_REASON_FULL_RING; kfree_skb_reason(skb, drop_reason); kfree_skb_list_reason(next, drop_reason); break; } } } else { /* If we receive a partial checksum and the tap side * doesn't support checksum offload, compute the checksum. * Note: it doesn't matter which checksum feature to * check, we either support them all or none. */ if (skb->ip_summed == CHECKSUM_PARTIAL && !(features & NETIF_F_CSUM_MASK) && skb_checksum_help(skb)) { drop_reason = SKB_DROP_REASON_SKB_CSUM; goto drop; } if (ptr_ring_produce(&q->ring, skb)) { drop_reason = SKB_DROP_REASON_FULL_RING; goto drop; } } wake_up: wake_up_interruptible_poll(sk_sleep(&q->sk), EPOLLIN | EPOLLRDNORM | EPOLLRDBAND); return RX_HANDLER_CONSUMED; drop: /* Count errors/drops only here, thus don't care about args. */ if (tap->count_rx_dropped) tap->count_rx_dropped(tap); kfree_skb_reason(skb, drop_reason); return RX_HANDLER_CONSUMED; } EXPORT_SYMBOL_GPL(tap_handle_frame); static struct major_info *tap_get_major(int major) { struct major_info *tap_major; list_for_each_entry_rcu(tap_major, &major_list, next) { if (tap_major->major == major) return tap_major; } return NULL; } int tap_get_minor(dev_t major, struct tap_dev *tap) { int retval = -ENOMEM; struct major_info *tap_major; rcu_read_lock(); tap_major = tap_get_major(MAJOR(major)); if (!tap_major) { retval = -EINVAL; goto unlock; } spin_lock(&tap_major->minor_lock); retval = idr_alloc(&tap_major->minor_idr, tap, 1, TAP_NUM_DEVS, GFP_ATOMIC); if (retval >= 0) { tap->minor = retval; } else if (retval == -ENOSPC) { netdev_err(tap->dev, "Too many tap devices\n"); retval = -EINVAL; } spin_unlock(&tap_major->minor_lock); unlock: rcu_read_unlock(); return retval < 0 ? 
retval : 0; } EXPORT_SYMBOL_GPL(tap_get_minor); void tap_free_minor(dev_t major, struct tap_dev *tap) { struct major_info *tap_major; rcu_read_lock(); tap_major = tap_get_major(MAJOR(major)); if (!tap_major) { goto unlock; } spin_lock(&tap_major->minor_lock); if (tap->minor) { idr_remove(&tap_major->minor_idr, tap->minor); tap->minor = 0; } spin_unlock(&tap_major->minor_lock); unlock: rcu_read_unlock(); } EXPORT_SYMBOL_GPL(tap_free_minor); static struct tap_dev *dev_get_by_tap_file(int major, int minor) { struct net_device *dev = NULL; struct tap_dev *tap; struct major_info *tap_major; rcu_read_lock(); tap_major = tap_get_major(major); if (!tap_major) { tap = NULL; goto unlock; } spin_lock(&tap_major->minor_lock); tap = idr_find(&tap_major->minor_idr, minor); if (tap) { dev = tap->dev; dev_hold(dev); } spin_unlock(&tap_major->minor_lock); unlock: rcu_read_unlock(); return tap; } static void tap_sock_write_space(struct sock *sk) { wait_queue_head_t *wqueue; if (!sock_writeable(sk) || !test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &sk->sk_socket->flags)) return; wqueue = sk_sleep(sk); if (wqueue && waitqueue_active(wqueue)) wake_up_interruptible_poll(wqueue, EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND); } static void tap_sock_destruct(struct sock *sk) { struct tap_queue *q = container_of(sk, struct tap_queue, sk); ptr_ring_cleanup(&q->ring, __skb_array_destroy_skb); } static int tap_open(struct inode *inode, struct file *file) { struct net *net = current->nsproxy->net_ns; struct tap_dev *tap; struct tap_queue *q; int err = -ENODEV; rtnl_lock(); tap = dev_get_by_tap_file(imajor(inode), iminor(inode)); if (!tap) goto err; err = -ENOMEM; q = (struct tap_queue *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL, &tap_proto, 0); if (!q) goto err; if (ptr_ring_init(&q->ring, tap->dev->tx_queue_len, GFP_KERNEL)) { sk_free(&q->sk); goto err; } init_waitqueue_head(&q->sock.wq.wait); q->sock.type = SOCK_RAW; q->sock.state = SS_CONNECTED; q->sock.file = file; q->sock.ops = &tap_socket_ops; sock_init_data_uid(&q->sock, &q->sk, current_fsuid()); q->sk.sk_write_space = tap_sock_write_space; q->sk.sk_destruct = tap_sock_destruct; q->flags = IFF_VNET_HDR | IFF_NO_PI | IFF_TAP; q->vnet_hdr_sz = sizeof(struct virtio_net_hdr); /* * so far only KVM virtio_net uses tap, enable zero copy between * guest kernel and host kernel when lower device supports zerocopy * * The macvlan supports zerocopy iff the lower device supports zero * copy so we don't have to look at the lower device directly. 
*/ if ((tap->dev->features & NETIF_F_HIGHDMA) && (tap->dev->features & NETIF_F_SG)) sock_set_flag(&q->sk, SOCK_ZEROCOPY); err = tap_set_queue(tap, file, q); if (err) { /* tap_sock_destruct() will take care of freeing ptr_ring */ goto err_put; } /* tap groks IOCB_NOWAIT just fine, mark it as such */ file->f_mode |= FMODE_NOWAIT; dev_put(tap->dev); rtnl_unlock(); return err; err_put: sock_put(&q->sk); err: if (tap) dev_put(tap->dev); rtnl_unlock(); return err; } static int tap_release(struct inode *inode, struct file *file) { struct tap_queue *q = file->private_data; tap_put_queue(q); return 0; } static __poll_t tap_poll(struct file *file, poll_table *wait) { struct tap_queue *q = file->private_data; __poll_t mask = EPOLLERR; if (!q) goto out; mask = 0; poll_wait(file, &q->sock.wq.wait, wait); if (!ptr_ring_empty(&q->ring)) mask |= EPOLLIN | EPOLLRDNORM; if (sock_writeable(&q->sk) || (!test_and_set_bit(SOCKWQ_ASYNC_NOSPACE, &q->sock.flags) && sock_writeable(&q->sk))) mask |= EPOLLOUT | EPOLLWRNORM; out: return mask; } static inline struct sk_buff *tap_alloc_skb(struct sock *sk, size_t prepad, size_t len, size_t linear, int noblock, int *err) { struct sk_buff *skb; /* Under a page? Don't bother with paged skb. */ if (prepad + len < PAGE_SIZE || !linear) linear = len; if (len - linear > MAX_SKB_FRAGS * (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) linear = len - MAX_SKB_FRAGS * (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER); skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock, err, PAGE_ALLOC_COSTLY_ORDER); if (!skb) return NULL; skb_reserve(skb, prepad); skb_put(skb, linear); skb->data_len = len - linear; skb->len += len - linear; return skb; } /* Neighbour code has some assumptions on HH_DATA_MOD alignment */ #define TAP_RESERVE HH_DATA_OFF(ETH_HLEN) /* Get packet from user space buffer */ static ssize_t tap_get_user(struct tap_queue *q, void *msg_control, struct iov_iter *from, int noblock) { int good_linear = SKB_MAX_HEAD(TAP_RESERVE); struct sk_buff *skb; struct tap_dev *tap; unsigned long total_len = iov_iter_count(from); unsigned long len = total_len; int err; struct virtio_net_hdr vnet_hdr = { 0 }; int vnet_hdr_len = 0; int copylen = 0; int depth; bool zerocopy = false; size_t linear; enum skb_drop_reason drop_reason; if (q->flags & IFF_VNET_HDR) { vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz); err = -EINVAL; if (len < vnet_hdr_len) goto err; len -= vnet_hdr_len; err = -EFAULT; if (!copy_from_iter_full(&vnet_hdr, sizeof(vnet_hdr), from)) goto err; iov_iter_advance(from, vnet_hdr_len - sizeof(vnet_hdr)); if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) && tap16_to_cpu(q, vnet_hdr.csum_start) + tap16_to_cpu(q, vnet_hdr.csum_offset) + 2 > tap16_to_cpu(q, vnet_hdr.hdr_len)) vnet_hdr.hdr_len = cpu_to_tap16(q, tap16_to_cpu(q, vnet_hdr.csum_start) + tap16_to_cpu(q, vnet_hdr.csum_offset) + 2); err = -EINVAL; if (tap16_to_cpu(q, vnet_hdr.hdr_len) > len) goto err; } err = -EINVAL; if (unlikely(len < ETH_HLEN)) goto err; if (msg_control && sock_flag(&q->sk, SOCK_ZEROCOPY)) { struct iov_iter i; copylen = vnet_hdr.hdr_len ? 
tap16_to_cpu(q, vnet_hdr.hdr_len) : GOODCOPY_LEN; if (copylen > good_linear) copylen = good_linear; else if (copylen < ETH_HLEN) copylen = ETH_HLEN; linear = copylen; i = *from; iov_iter_advance(&i, copylen); if (iov_iter_npages(&i, INT_MAX) <= MAX_SKB_FRAGS) zerocopy = true; } if (!zerocopy) { copylen = len; linear = tap16_to_cpu(q, vnet_hdr.hdr_len); if (linear > good_linear) linear = good_linear; else if (linear < ETH_HLEN) linear = ETH_HLEN; } skb = tap_alloc_skb(&q->sk, TAP_RESERVE, copylen, linear, noblock, &err); if (!skb) goto err; if (zerocopy) err = zerocopy_sg_from_iter(skb, from); else err = skb_copy_datagram_from_iter(skb, 0, from, len); if (err) { drop_reason = SKB_DROP_REASON_SKB_UCOPY_FAULT; goto err_kfree; } skb_set_network_header(skb, ETH_HLEN); skb_reset_mac_header(skb); skb->protocol = eth_hdr(skb)->h_proto; rcu_read_lock(); tap = rcu_dereference(q->tap); if (!tap) { kfree_skb(skb); rcu_read_unlock(); return total_len; } skb->dev = tap->dev; if (vnet_hdr_len) { err = virtio_net_hdr_to_skb(skb, &vnet_hdr, tap_is_little_endian(q)); if (err) { rcu_read_unlock(); drop_reason = SKB_DROP_REASON_DEV_HDR; goto err_kfree; } } skb_probe_transport_header(skb); /* Move network header to the right position for VLAN tagged packets */ if (eth_type_vlan(skb->protocol) && vlan_get_protocol_and_depth(skb, skb->protocol, &depth) != 0) skb_set_network_header(skb, depth); /* copy skb_ubuf_info for callback when skb has no error */ if (zerocopy) { skb_zcopy_init(skb, msg_control); } else if (msg_control) { struct ubuf_info *uarg = msg_control; uarg->ops->complete(NULL, uarg, false); } dev_queue_xmit(skb); rcu_read_unlock(); return total_len; err_kfree: kfree_skb_reason(skb, drop_reason); err: rcu_read_lock(); tap = rcu_dereference(q->tap); if (tap && tap->count_tx_dropped) tap->count_tx_dropped(tap); rcu_read_unlock(); return err; } static ssize_t tap_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct tap_queue *q = file->private_data; int noblock = 0; if ((file->f_flags & O_NONBLOCK) || (iocb->ki_flags & IOCB_NOWAIT)) noblock = 1; return tap_get_user(q, NULL, from, noblock); } /* Put packet to the user space buffer */ static ssize_t tap_put_user(struct tap_queue *q, const struct sk_buff *skb, struct iov_iter *iter) { int ret; int vnet_hdr_len = 0; int vlan_offset = 0; int total; if (q->flags & IFF_VNET_HDR) { int vlan_hlen = skb_vlan_tag_present(skb) ? VLAN_HLEN : 0; struct virtio_net_hdr vnet_hdr; vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz); if (iov_iter_count(iter) < vnet_hdr_len) return -EINVAL; if (virtio_net_hdr_from_skb(skb, &vnet_hdr, tap_is_little_endian(q), true, vlan_hlen)) BUG(); if (copy_to_iter(&vnet_hdr, sizeof(vnet_hdr), iter) != sizeof(vnet_hdr)) return -EFAULT; iov_iter_advance(iter, vnet_hdr_len - sizeof(vnet_hdr)); } total = vnet_hdr_len; total += skb->len; if (skb_vlan_tag_present(skb)) { struct { __be16 h_vlan_proto; __be16 h_vlan_TCI; } veth; veth.h_vlan_proto = skb->vlan_proto; veth.h_vlan_TCI = htons(skb_vlan_tag_get(skb)); vlan_offset = offsetof(struct vlan_ethhdr, h_vlan_proto); total += VLAN_HLEN; ret = skb_copy_datagram_iter(skb, 0, iter, vlan_offset); if (ret || !iov_iter_count(iter)) goto done; ret = copy_to_iter(&veth, sizeof(veth), iter); if (ret != sizeof(veth) || !iov_iter_count(iter)) goto done; } ret = skb_copy_datagram_iter(skb, vlan_offset, iter, skb->len - vlan_offset); done: return ret ? 
ret : total; } static ssize_t tap_do_read(struct tap_queue *q, struct iov_iter *to, int noblock, struct sk_buff *skb) { DEFINE_WAIT(wait); ssize_t ret = 0; if (!iov_iter_count(to)) { kfree_skb(skb); return 0; } if (skb) goto put; while (1) { if (!noblock) prepare_to_wait(sk_sleep(&q->sk), &wait, TASK_INTERRUPTIBLE); /* Read frames from the queue */ skb = ptr_ring_consume(&q->ring); if (skb) break; if (noblock) { ret = -EAGAIN; break; } if (signal_pending(current)) { ret = -ERESTARTSYS; break; } /* Nothing to read, let's sleep */ schedule(); } if (!noblock) finish_wait(sk_sleep(&q->sk), &wait); put: if (skb) { ret = tap_put_user(q, skb, to); if (unlikely(ret < 0)) kfree_skb(skb); else consume_skb(skb); } return ret; } static ssize_t tap_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; struct tap_queue *q = file->private_data; ssize_t len = iov_iter_count(to), ret; int noblock = 0; if ((file->f_flags & O_NONBLOCK) || (iocb->ki_flags & IOCB_NOWAIT)) noblock = 1; ret = tap_do_read(q, to, noblock, NULL); ret = min_t(ssize_t, ret, len); if (ret > 0) iocb->ki_pos = ret; return ret; } static struct tap_dev *tap_get_tap_dev(struct tap_queue *q) { struct tap_dev *tap; ASSERT_RTNL(); tap = rtnl_dereference(q->tap); if (tap) dev_hold(tap->dev); return tap; } static void tap_put_tap_dev(struct tap_dev *tap) { dev_put(tap->dev); } static int tap_ioctl_set_queue(struct file *file, unsigned int flags) { struct tap_queue *q = file->private_data; struct tap_dev *tap; int ret; tap = tap_get_tap_dev(q); if (!tap) return -EINVAL; if (flags & IFF_ATTACH_QUEUE) ret = tap_enable_queue(tap, file, q); else if (flags & IFF_DETACH_QUEUE) ret = tap_disable_queue(q); else ret = -EINVAL; tap_put_tap_dev(tap); return ret; } static int set_offload(struct tap_queue *q, unsigned long arg) { struct tap_dev *tap; netdev_features_t features; netdev_features_t feature_mask = 0; tap = rtnl_dereference(q->tap); if (!tap) return -ENOLINK; features = tap->dev->features; if (arg & TUN_F_CSUM) { feature_mask = NETIF_F_HW_CSUM; if (arg & (TUN_F_TSO4 | TUN_F_TSO6)) { if (arg & TUN_F_TSO_ECN) feature_mask |= NETIF_F_TSO_ECN; if (arg & TUN_F_TSO4) feature_mask |= NETIF_F_TSO; if (arg & TUN_F_TSO6) feature_mask |= NETIF_F_TSO6; } /* TODO: for now USO4 and USO6 should work simultaneously */ if ((arg & (TUN_F_USO4 | TUN_F_USO6)) == (TUN_F_USO4 | TUN_F_USO6)) features |= NETIF_F_GSO_UDP_L4; } /* tun/tap driver inverts the usage for TSO offloads, where * setting the TSO bit means that the userspace wants to * accept TSO frames and turning it off means that user space * does not support TSO. * For tap, we have to invert it to mean the same thing. * When user space turns off TSO, we turn off GSO/LRO so that * user-space will not receive TSO frames. */ if (feature_mask & (NETIF_F_TSO | NETIF_F_TSO6) || (feature_mask & (TUN_F_USO4 | TUN_F_USO6)) == (TUN_F_USO4 | TUN_F_USO6)) features |= RX_OFFLOADS; else features &= ~RX_OFFLOADS; /* tap_features are the same as features on tun/tap and * reflect user expectations. 
*/ tap->tap_features = feature_mask; if (tap->update_features) tap->update_features(tap, features); return 0; } /* * provide compatibility with generic tun/tap interface */ static long tap_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct tap_queue *q = file->private_data; struct tap_dev *tap; void __user *argp = (void __user *)arg; struct ifreq __user *ifr = argp; unsigned int __user *up = argp; unsigned short u; int __user *sp = argp; struct sockaddr sa; int s; int ret; switch (cmd) { case TUNSETIFF: /* ignore the name, just look at flags */ if (get_user(u, &ifr->ifr_flags)) return -EFAULT; ret = 0; if ((u & ~TAP_IFFEATURES) != (IFF_NO_PI | IFF_TAP)) ret = -EINVAL; else q->flags = (q->flags & ~TAP_IFFEATURES) | u; return ret; case TUNGETIFF: rtnl_lock(); tap = tap_get_tap_dev(q); if (!tap) { rtnl_unlock(); return -ENOLINK; } ret = 0; u = q->flags; if (copy_to_user(&ifr->ifr_name, tap->dev->name, IFNAMSIZ) || put_user(u, &ifr->ifr_flags)) ret = -EFAULT; tap_put_tap_dev(tap); rtnl_unlock(); return ret; case TUNSETQUEUE: if (get_user(u, &ifr->ifr_flags)) return -EFAULT; rtnl_lock(); ret = tap_ioctl_set_queue(file, u); rtnl_unlock(); return ret; case TUNGETFEATURES: if (put_user(IFF_TAP | IFF_NO_PI | TAP_IFFEATURES, up)) return -EFAULT; return 0; case TUNSETSNDBUF: if (get_user(s, sp)) return -EFAULT; if (s <= 0) return -EINVAL; q->sk.sk_sndbuf = s; return 0; case TUNGETVNETHDRSZ: s = q->vnet_hdr_sz; if (put_user(s, sp)) return -EFAULT; return 0; case TUNSETVNETHDRSZ: if (get_user(s, sp)) return -EFAULT; if (s < (int)sizeof(struct virtio_net_hdr)) return -EINVAL; q->vnet_hdr_sz = s; return 0; case TUNGETVNETLE: s = !!(q->flags & TAP_VNET_LE); if (put_user(s, sp)) return -EFAULT; return 0; case TUNSETVNETLE: if (get_user(s, sp)) return -EFAULT; if (s) q->flags |= TAP_VNET_LE; else q->flags &= ~TAP_VNET_LE; return 0; case TUNGETVNETBE: return tap_get_vnet_be(q, sp); case TUNSETVNETBE: return tap_set_vnet_be(q, sp); case TUNSETOFFLOAD: /* let the user check for future flags */ if (arg & ~(TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 | TUN_F_TSO_ECN | TUN_F_UFO | TUN_F_USO4 | TUN_F_USO6)) return -EINVAL; rtnl_lock(); ret = set_offload(q, arg); rtnl_unlock(); return ret; case SIOCGIFHWADDR: rtnl_lock(); tap = tap_get_tap_dev(q); if (!tap) { rtnl_unlock(); return -ENOLINK; } ret = 0; dev_get_mac_address(&sa, dev_net(tap->dev), tap->dev->name); if (copy_to_user(&ifr->ifr_name, tap->dev->name, IFNAMSIZ) || copy_to_user(&ifr->ifr_hwaddr, &sa, sizeof(sa))) ret = -EFAULT; tap_put_tap_dev(tap); rtnl_unlock(); return ret; case SIOCSIFHWADDR: if (copy_from_user(&sa, &ifr->ifr_hwaddr, sizeof(sa))) return -EFAULT; rtnl_lock(); tap = tap_get_tap_dev(q); if (!tap) { rtnl_unlock(); return -ENOLINK; } ret = dev_set_mac_address_user(tap->dev, &sa, NULL); tap_put_tap_dev(tap); rtnl_unlock(); return ret; default: return -EINVAL; } } static const struct file_operations tap_fops = { .owner = THIS_MODULE, .open = tap_open, .release = tap_release, .read_iter = tap_read_iter, .write_iter = tap_write_iter, .poll = tap_poll, .unlocked_ioctl = tap_ioctl, .compat_ioctl = compat_ptr_ioctl, }; static int tap_get_user_xdp(struct tap_queue *q, struct xdp_buff *xdp) { struct tun_xdp_hdr *hdr = xdp->data_hard_start; struct virtio_net_hdr *gso = &hdr->gso; int buflen = hdr->buflen; int vnet_hdr_len = 0; struct tap_dev *tap; struct sk_buff *skb; int err, depth; if (unlikely(xdp->data_end - xdp->data < ETH_HLEN)) { err = -EINVAL; goto err; } if (q->flags & IFF_VNET_HDR) vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz); skb = 
build_skb(xdp->data_hard_start, buflen); if (!skb) { err = -ENOMEM; goto err; } skb_reserve(skb, xdp->data - xdp->data_hard_start); skb_put(skb, xdp->data_end - xdp->data); skb_set_network_header(skb, ETH_HLEN); skb_reset_mac_header(skb); skb->protocol = eth_hdr(skb)->h_proto; if (vnet_hdr_len) { err = virtio_net_hdr_to_skb(skb, gso, tap_is_little_endian(q)); if (err) goto err_kfree; } /* Move network header to the right position for VLAN tagged packets */ if (eth_type_vlan(skb->protocol) && vlan_get_protocol_and_depth(skb, skb->protocol, &depth) != 0) skb_set_network_header(skb, depth); rcu_read_lock(); tap = rcu_dereference(q->tap); if (tap) { skb->dev = tap->dev; skb_probe_transport_header(skb); dev_queue_xmit(skb); } else { kfree_skb(skb); } rcu_read_unlock(); return 0; err_kfree: kfree_skb(skb); err: rcu_read_lock(); tap = rcu_dereference(q->tap); if (tap && tap->count_tx_dropped) tap->count_tx_dropped(tap); rcu_read_unlock(); return err; } static int tap_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) { struct tap_queue *q = container_of(sock, struct tap_queue, sock); struct tun_msg_ctl *ctl = m->msg_control; struct xdp_buff *xdp; int i; if (m->msg_controllen == sizeof(struct tun_msg_ctl) && ctl && ctl->type == TUN_MSG_PTR) { for (i = 0; i < ctl->num; i++) { xdp = &((struct xdp_buff *)ctl->ptr)[i]; tap_get_user_xdp(q, xdp); } return 0; } return tap_get_user(q, ctl ? ctl->ptr : NULL, &m->msg_iter, m->msg_flags & MSG_DONTWAIT); } static int tap_recvmsg(struct socket *sock, struct msghdr *m, size_t total_len, int flags) { struct tap_queue *q = container_of(sock, struct tap_queue, sock); struct sk_buff *skb = m->msg_control; int ret; if (flags & ~(MSG_DONTWAIT|MSG_TRUNC)) { kfree_skb(skb); return -EINVAL; } ret = tap_do_read(q, &m->msg_iter, flags & MSG_DONTWAIT, skb); if (ret > total_len) { m->msg_flags |= MSG_TRUNC; ret = flags & MSG_TRUNC ? ret : total_len; } return ret; } static int tap_peek_len(struct socket *sock) { struct tap_queue *q = container_of(sock, struct tap_queue, sock); return PTR_RING_PEEK_CALL(&q->ring, __skb_array_len_with_tag); } /* Ops structure to mimic raw sockets with tun */ static const struct proto_ops tap_socket_ops = { .sendmsg = tap_sendmsg, .recvmsg = tap_recvmsg, .peek_len = tap_peek_len, }; /* Get an underlying socket object from tun file. Returns error unless file is * attached to a device. The returned object works like a packet socket, it * can be used for sock_sendmsg/sock_recvmsg. The caller is responsible for * holding a reference to the file for as long as the socket is in use. 
*/ struct socket *tap_get_socket(struct file *file) { struct tap_queue *q; if (file->f_op != &tap_fops) return ERR_PTR(-EINVAL); q = file->private_data; if (!q) return ERR_PTR(-EBADFD); return &q->sock; } EXPORT_SYMBOL_GPL(tap_get_socket); struct ptr_ring *tap_get_ptr_ring(struct file *file) { struct tap_queue *q; if (file->f_op != &tap_fops) return ERR_PTR(-EINVAL); q = file->private_data; if (!q) return ERR_PTR(-EBADFD); return &q->ring; } EXPORT_SYMBOL_GPL(tap_get_ptr_ring); int tap_queue_resize(struct tap_dev *tap) { struct net_device *dev = tap->dev; struct tap_queue *q; struct ptr_ring **rings; int n = tap->numqueues; int ret, i = 0; rings = kmalloc_array(n, sizeof(*rings), GFP_KERNEL); if (!rings) return -ENOMEM; list_for_each_entry(q, &tap->queue_list, next) rings[i++] = &q->ring; ret = ptr_ring_resize_multiple(rings, n, dev->tx_queue_len, GFP_KERNEL, __skb_array_destroy_skb); kfree(rings); return ret; } EXPORT_SYMBOL_GPL(tap_queue_resize); static int tap_list_add(dev_t major, const char *device_name) { struct major_info *tap_major; tap_major = kzalloc(sizeof(*tap_major), GFP_ATOMIC); if (!tap_major) return -ENOMEM; tap_major->major = MAJOR(major); idr_init(&tap_major->minor_idr); spin_lock_init(&tap_major->minor_lock); tap_major->device_name = device_name; list_add_tail_rcu(&tap_major->next, &major_list); return 0; } int tap_create_cdev(struct cdev *tap_cdev, dev_t *tap_major, const char *device_name, struct module *module) { int err; err = alloc_chrdev_region(tap_major, 0, TAP_NUM_DEVS, device_name); if (err) goto out1; cdev_init(tap_cdev, &tap_fops); tap_cdev->owner = module; err = cdev_add(tap_cdev, *tap_major, TAP_NUM_DEVS); if (err) goto out2; err = tap_list_add(*tap_major, device_name); if (err) goto out3; return 0; out3: cdev_del(tap_cdev); out2: unregister_chrdev_region(*tap_major, TAP_NUM_DEVS); out1: return err; } EXPORT_SYMBOL_GPL(tap_create_cdev); void tap_destroy_cdev(dev_t major, struct cdev *tap_cdev) { struct major_info *tap_major, *tmp; cdev_del(tap_cdev); unregister_chrdev_region(major, TAP_NUM_DEVS); list_for_each_entry_safe(tap_major, tmp, &major_list, next) { if (tap_major->major == MAJOR(major)) { idr_destroy(&tap_major->minor_idr); list_del_rcu(&tap_major->next); kfree_rcu(tap_major, rcu); } } } EXPORT_SYMBOL_GPL(tap_destroy_cdev); MODULE_DESCRIPTION("Common library for drivers implementing the TAP interface"); MODULE_AUTHOR("Arnd Bergmann <arnd@arndb.de>"); MODULE_AUTHOR("Sainath Grandhi <sainath.grandhi@intel.com>"); MODULE_LICENSE("GPL");
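The comment above tap_get_socket() states the usage contract: a consumer resolves a tap file into its socket and must keep the file reference alive for as long as the socket is in use. Below is a minimal, illustrative sketch of a consumer honouring that contract, modeled on how vhost-net style backends use this API; the helper name example_get_tap_socket() is hypothetical and not part of the driver.

/* Illustrative sketch only: example_get_tap_socket() is a hypothetical helper. */
#include <linux/err.h>
#include <linux/file.h>
#include <linux/if_tap.h>

static struct socket *example_get_tap_socket(int fd, struct file **filp)
{
	struct file *file = fget(fd);
	struct socket *sock;

	if (!file)
		return ERR_PTR(-EBADF);

	sock = tap_get_socket(file);
	if (IS_ERR(sock)) {
		fput(file);
		return sock;
	}

	/* Per the contract above: hold the file reference while the socket is in use. */
	*filp = file;
	return sock;
}

The same pattern applies to tap_get_ptr_ring() when a consumer wants to pull skbs directly from the queue's ptr_ring rather than going through sock_recvmsg().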
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __SHMEM_FS_H #define __SHMEM_FS_H #include <linux/file.h> #include <linux/swap.h> #include <linux/mempolicy.h> #include <linux/pagemap.h> #include <linux/percpu_counter.h> #include <linux/xattr.h> #include <linux/fs_parser.h> #include <linux/userfaultfd_k.h> /* inode in-kernel data */ #ifdef CONFIG_TMPFS_QUOTA #define SHMEM_MAXQUOTAS 2 #endif struct shmem_inode_info { spinlock_t lock; unsigned int seals; /* shmem seals */ unsigned long flags; unsigned long alloced; /* data pages alloced to file */ unsigned long swapped; /* subtotal assigned to swap */ union { struct offset_ctx dir_offsets; /* stable directory offsets */ struct { struct list_head shrinklist; /* shrinkable hpage inodes */ struct list_head swaplist; /* chain of maybes on swap */ }; }; struct timespec64 i_crtime; /* file creation time */ struct shared_policy policy; /* NUMA memory alloc policy */ struct simple_xattrs xattrs; /* list of xattrs */ pgoff_t fallocend; /* highest fallocate endindex */ unsigned int fsflags; /* for FS_IOC_[SG]ETFLAGS */ atomic_t stop_eviction; /* hold when working on inode */ #ifdef CONFIG_TMPFS_QUOTA struct dquot __rcu *i_dquot[MAXQUOTAS]; #endif struct inode vfs_inode; }; #define SHMEM_FL_USER_VISIBLE (FS_FL_USER_VISIBLE | FS_CASEFOLD_FL) #define SHMEM_FL_USER_MODIFIABLE \ (FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NODUMP_FL | FS_NOATIME_FL | FS_CASEFOLD_FL) #define SHMEM_FL_INHERITED (FS_NODUMP_FL | FS_NOATIME_FL | FS_CASEFOLD_FL) struct shmem_quota_limits { qsize_t usrquota_bhardlimit; /* Default user quota block hard limit */ qsize_t usrquota_ihardlimit; /* Default user quota inode hard limit */ qsize_t grpquota_bhardlimit; /* Default group quota block hard limit */ qsize_t grpquota_ihardlimit; /* Default group quota inode hard limit */ }; struct shmem_sb_info { unsigned long max_blocks; /* How many blocks are allowed */ struct percpu_counter used_blocks; /* How many are allocated */ unsigned long max_inodes; /* How many inodes are allowed */ unsigned long free_ispace; /* How much ispace left for allocation */ raw_spinlock_t stat_lock; /* Serialize shmem_sb_info changes */ umode_t mode; /* Mount mode for root directory */ unsigned char huge; /* Whether to try for hugepages */ kuid_t uid; /* Mount uid for root directory */ kgid_t gid; /* Mount gid for root directory */ bool full_inums; /* If i_ino should be uint or ino_t */ bool noswap; /* ignores VM reclaim / swap requests */ ino_t next_ino; /* The next per-sb inode number to use */ ino_t __percpu *ino_batch; /* The next per-cpu inode number to use */ struct mempolicy *mpol; /* default memory policy for mappings */ spinlock_t shrinklist_lock; /* Protects shrinklist */ struct list_head shrinklist; /* List of shrinkable inodes */ unsigned long
shrinklist_len; /* Length of shrinklist */ struct shmem_quota_limits qlimits; /* Default quota limits */ }; static inline struct shmem_inode_info *SHMEM_I(struct inode *inode) { return container_of(inode, struct shmem_inode_info, vfs_inode); } /* * Functions in mm/shmem.c called directly from elsewhere: */ extern const struct fs_parameter_spec shmem_fs_parameters[]; extern void shmem_init(void); extern int shmem_init_fs_context(struct fs_context *fc); extern struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags); extern struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags); extern struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt, const char *name, loff_t size, unsigned long flags); extern int shmem_zero_setup(struct vm_area_struct *); extern unsigned long shmem_get_unmapped_area(struct file *, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); extern int shmem_lock(struct file *file, int lock, struct ucounts *ucounts); #ifdef CONFIG_SHMEM bool shmem_mapping(struct address_space *mapping); #else static inline bool shmem_mapping(struct address_space *mapping) { return false; } #endif /* CONFIG_SHMEM */ extern void shmem_unlock_mapping(struct address_space *mapping); extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, pgoff_t index, gfp_t gfp_mask); extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end); int shmem_unuse(unsigned int type); #ifdef CONFIG_TRANSPARENT_HUGEPAGE unsigned long shmem_allowable_huge_orders(struct inode *inode, struct vm_area_struct *vma, pgoff_t index, loff_t write_end, bool shmem_huge_force); bool shmem_hpage_pmd_enabled(void); #else static inline unsigned long shmem_allowable_huge_orders(struct inode *inode, struct vm_area_struct *vma, pgoff_t index, loff_t write_end, bool shmem_huge_force) { return 0; } static inline bool shmem_hpage_pmd_enabled(void) { return false; } #endif #ifdef CONFIG_SHMEM extern unsigned long shmem_swap_usage(struct vm_area_struct *vma); #else static inline unsigned long shmem_swap_usage(struct vm_area_struct *vma) { return 0; } #endif extern unsigned long shmem_partial_swap_usage(struct address_space *mapping, pgoff_t start, pgoff_t end); /* Flag allocation requirements to shmem_get_folio */ enum sgp_type { SGP_READ, /* don't exceed i_size, don't allocate page */ SGP_NOALLOC, /* similar, but fail on hole or use fallocated page */ SGP_CACHE, /* don't exceed i_size, may allocate page */ SGP_WRITE, /* may exceed i_size, may allocate !Uptodate page */ SGP_FALLOC, /* like SGP_WRITE, but make existing page Uptodate */ }; int shmem_get_folio(struct inode *inode, pgoff_t index, loff_t write_end, struct folio **foliop, enum sgp_type sgp); struct folio *shmem_read_folio_gfp(struct address_space *mapping, pgoff_t index, gfp_t gfp); static inline struct folio *shmem_read_folio(struct address_space *mapping, pgoff_t index) { return shmem_read_folio_gfp(mapping, index, mapping_gfp_mask(mapping)); } static inline struct page *shmem_read_mapping_page( struct address_space *mapping, pgoff_t index) { return shmem_read_mapping_page_gfp(mapping, index, mapping_gfp_mask(mapping)); } static inline bool shmem_file(struct file *file) { if (!IS_ENABLED(CONFIG_SHMEM)) return false; if (!file || !file->f_mapping) return false; return shmem_mapping(file->f_mapping); } /* * If fallocate(FALLOC_FL_KEEP_SIZE) has been used, there may be pages * beyond i_size's notion of EOF, which fallocate has committed to 
reserving: * which split_huge_page() must therefore not delete. This use of a single * "fallocend" per inode errs on the side of not deleting a reservation when * in doubt: there are plenty of cases when it preserves unreserved pages. */ static inline pgoff_t shmem_fallocend(struct inode *inode, pgoff_t eof) { return max(eof, SHMEM_I(inode)->fallocend); } extern bool shmem_charge(struct inode *inode, long pages); extern void shmem_uncharge(struct inode *inode, long pages); #ifdef CONFIG_USERFAULTFD #ifdef CONFIG_SHMEM extern int shmem_mfill_atomic_pte(pmd_t *dst_pmd, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, uffd_flags_t flags, struct folio **foliop); #else /* !CONFIG_SHMEM */ #define shmem_mfill_atomic_pte(dst_pmd, dst_vma, dst_addr, \ src_addr, flags, foliop) ({ BUG(); 0; }) #endif /* CONFIG_SHMEM */ #endif /* CONFIG_USERFAULTFD */ /* * Used space is stored as unsigned 64-bit value in bytes but * quota core supports only signed 64-bit values so use that * as a limit */ #define SHMEM_QUOTA_MAX_SPC_LIMIT 0x7fffffffffffffffLL /* 2^63-1 */ #define SHMEM_QUOTA_MAX_INO_LIMIT 0x7fffffffffffffffLL #ifdef CONFIG_TMPFS_QUOTA extern const struct dquot_operations shmem_quota_operations; extern struct quota_format_type shmem_quota_format; #endif /* CONFIG_TMPFS_QUOTA */ #endif
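The declarations above are the entry points into mm/shmem.c for other kernel code; a common pattern is to back an object with an unlinked tmpfs file via shmem_file_setup() and then pull pages from its mapping with the shmem_read_mapping_page() helper defined in this header. A minimal sketch of that pattern follows; it is illustrative only, the function name example_shmem_backing() is hypothetical, and the VM_NORESERVE flag mirrors what several in-tree callers pass.

/* Illustrative sketch only: example_shmem_backing() is a hypothetical function. */
#include <linux/err.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/shmem_fs.h>

static int example_shmem_backing(loff_t size)
{
	struct file *filp;
	struct page *page;

	/* Create an unlinked tmpfs-backed file of the requested size. */
	filp = shmem_file_setup("example-shmem", size, VM_NORESERVE);
	if (IS_ERR(filp))
		return PTR_ERR(filp);

	/* Allocate (or look up) page 0 of the file's page cache. */
	page = shmem_read_mapping_page(filp->f_mapping, 0);
	if (IS_ERR(page)) {
		fput(filp);
		return PTR_ERR(page);
	}

	put_page(page);
	fput(filp);
	return 0;
}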
/* SPDX-License-Identifier: GPL-2.0 */ /* * DES & Triple DES EDE key verification helpers */ #ifndef __CRYPTO_INTERNAL_DES_H #define __CRYPTO_INTERNAL_DES_H #include <linux/crypto.h> #include <linux/fips.h> #include <crypto/des.h> #include <crypto/aead.h> #include <crypto/skcipher.h> /** * crypto_des_verify_key - Check whether a DES key is weak * @tfm: the crypto algo * @key: the key buffer * * Returns -EINVAL if the key is weak and the crypto TFM does not permit weak * keys. Otherwise, 0 is returned. * * It is the job of the caller to ensure that the size of the key equals * DES_KEY_SIZE. */ static inline int crypto_des_verify_key(struct crypto_tfm *tfm, const u8 *key) { struct des_ctx tmp; int err; err = des_expand_key(&tmp, key, DES_KEY_SIZE); if (err == -ENOKEY) { if (crypto_tfm_get_flags(tfm) & CRYPTO_TFM_REQ_FORBID_WEAK_KEYS) err = -EINVAL; else err = 0; } memzero_explicit(&tmp, sizeof(tmp)); return err; } /* * RFC2451: * * For DES-EDE3, there is no known need to reject weak or * complementation keys. Any weakness is obviated by the use of * multiple keys. * * However, if the first two or last two independent 64-bit keys are * equal (k1 == k2 or k2 == k3), then the DES3 operation is simply the * same as DES. Implementers MUST reject keys that exhibit this * property. * */ static inline int des3_ede_verify_key(const u8 *key, unsigned int key_len, bool check_weak) { int ret = fips_enabled ? -EINVAL : -ENOKEY; u32 K[6]; memcpy(K, key, DES3_EDE_KEY_SIZE); if ((!((K[0] ^ K[2]) | (K[1] ^ K[3])) || !((K[2] ^ K[4]) | (K[3] ^ K[5]))) && (fips_enabled || check_weak)) goto bad; if ((!((K[0] ^ K[4]) | (K[1] ^ K[5]))) && fips_enabled) goto bad; ret = 0; bad: memzero_explicit(K, DES3_EDE_KEY_SIZE); return ret; } /** * crypto_des3_ede_verify_key - Check whether a DES3-EDE key is weak * @tfm: the crypto algo * @key: the key buffer * * Returns -EINVAL if the key is weak and the crypto TFM does not permit weak * keys or when running in FIPS mode. Otherwise, 0 is returned. Note that some * keys are rejected in FIPS mode even if weak keys are permitted by the TFM * flags. * * It is the job of the caller to ensure that the size of the key equals * DES3_EDE_KEY_SIZE. */ static inline int crypto_des3_ede_verify_key(struct crypto_tfm *tfm, const u8 *key) { return des3_ede_verify_key(key, DES3_EDE_KEY_SIZE, crypto_tfm_get_flags(tfm) & CRYPTO_TFM_REQ_FORBID_WEAK_KEYS); } static inline int verify_skcipher_des_key(struct crypto_skcipher *tfm, const u8 *key) { return crypto_des_verify_key(crypto_skcipher_tfm(tfm), key); } static inline int verify_skcipher_des3_key(struct crypto_skcipher *tfm, const u8 *key) { return crypto_des3_ede_verify_key(crypto_skcipher_tfm(tfm), key); } static inline int verify_aead_des_key(struct crypto_aead *tfm, const u8 *key, int keylen) { if (keylen != DES_KEY_SIZE) return -EINVAL; return crypto_des_verify_key(crypto_aead_tfm(tfm), key); } static inline int verify_aead_des3_key(struct crypto_aead *tfm, const u8 *key, int keylen) { if (keylen != DES3_EDE_KEY_SIZE) return -EINVAL; return crypto_des3_ede_verify_key(crypto_aead_tfm(tfm), key); } #endif /* __CRYPTO_INTERNAL_DES_H */
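The RFC2451 note above is why des3_ede_verify_key() rejects k1 == k2 and k2 == k3; a transform normally applies that policy from its ->setkey() handler via the wrapper matching its type. A minimal sketch of that pattern follows; the context type and function names (example_des3_*) are hypothetical and only illustrate how the helpers above are meant to be called.

/* Illustrative sketch only: the example_des3_* names are hypothetical. */
#include <crypto/internal/des.h>
#include <crypto/internal/skcipher.h>

struct example_des3_ctx {
	struct des3_ede_ctx expanded;
};

static int example_des3_setkey(struct crypto_skcipher *tfm, const u8 *key,
			       unsigned int keylen)
{
	struct example_des3_ctx *ctx = crypto_skcipher_ctx(tfm);
	int err;

	/* Enforce the weak-key and FIPS policy implemented by the helpers above. */
	err = verify_skcipher_des3_key(tfm, key);
	if (err)
		return err;

	/* Expand the verified key; returns -EINVAL on a bad key length. */
	return des3_ede_expand_key(&ctx->expanded, key, keylen);
}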
// SPDX-License-Identifier: GPL-2.0-or-later /* * net/key/af_key.c An implementation of PF_KEYv2 sockets. * * Authors: Maxim Giryaev <gem@asplinux.ru> * David S. Miller <davem@redhat.com> * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> * Kunihiro Ishiguro <kunihiro@ipinfusion.com> * Kazunori MIYAZAWA / USAGI Project <miyazawa@linux-ipv6.org> * Derek Atkins <derek@ihtfp.com> */ #include <linux/capability.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/socket.h> #include <linux/pfkeyv2.h> #include <linux/ipsec.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <linux/in.h> #include <linux/in6.h> #include <linux/proc_fs.h> #include <linux/init.h> #include <linux/slab.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/xfrm.h> #include <net/sock.h> #define _X2KEY(x) ((x) == XFRM_INF ? 0 : (x)) #define _KEY2X(x) ((x) == 0 ? XFRM_INF : (x)) static unsigned int pfkey_net_id __read_mostly; struct netns_pfkey { /* List of all pfkey sockets.
*/ struct hlist_head table; atomic_t socks_nr; }; static DEFINE_MUTEX(pfkey_mutex); #define DUMMY_MARK 0 static const struct xfrm_mark dummy_mark = {0, 0}; struct pfkey_sock { /* struct sock must be the first member of struct pfkey_sock */ struct sock sk; int registered; int promisc; struct { uint8_t msg_version; uint32_t msg_portid; int (*dump)(struct pfkey_sock *sk); void (*done)(struct pfkey_sock *sk); union { struct xfrm_policy_walk policy; struct xfrm_state_walk state; } u; struct sk_buff *skb; } dump; struct mutex dump_lock; }; static int parse_sockaddr_pair(struct sockaddr *sa, int ext_len, xfrm_address_t *saddr, xfrm_address_t *daddr, u16 *family); static inline struct pfkey_sock *pfkey_sk(struct sock *sk) { return (struct pfkey_sock *)sk; } static int pfkey_can_dump(const struct sock *sk) { if (3 * atomic_read(&sk->sk_rmem_alloc) <= 2 * sk->sk_rcvbuf) return 1; return 0; } static void pfkey_terminate_dump(struct pfkey_sock *pfk) { if (pfk->dump.dump) { if (pfk->dump.skb) { kfree_skb(pfk->dump.skb); pfk->dump.skb = NULL; } pfk->dump.done(pfk); pfk->dump.dump = NULL; pfk->dump.done = NULL; } } static void pfkey_sock_destruct(struct sock *sk) { struct net *net = sock_net(sk); struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id); pfkey_terminate_dump(pfkey_sk(sk)); skb_queue_purge(&sk->sk_receive_queue); if (!sock_flag(sk, SOCK_DEAD)) { pr_err("Attempt to release alive pfkey socket: %p\n", sk); return; } WARN_ON(atomic_read(&sk->sk_rmem_alloc)); WARN_ON(refcount_read(&sk->sk_wmem_alloc)); atomic_dec(&net_pfkey->socks_nr); } static const struct proto_ops pfkey_ops; static void pfkey_insert(struct sock *sk) { struct net *net = sock_net(sk); struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id); mutex_lock(&pfkey_mutex); sk_add_node_rcu(sk, &net_pfkey->table); mutex_unlock(&pfkey_mutex); } static void pfkey_remove(struct sock *sk) { mutex_lock(&pfkey_mutex); sk_del_node_init_rcu(sk); mutex_unlock(&pfkey_mutex); } static struct proto key_proto = { .name = "KEY", .owner = THIS_MODULE, .obj_size = sizeof(struct pfkey_sock), }; static int pfkey_create(struct net *net, struct socket *sock, int protocol, int kern) { struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id); struct sock *sk; struct pfkey_sock *pfk; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; if (sock->type != SOCK_RAW) return -ESOCKTNOSUPPORT; if (protocol != PF_KEY_V2) return -EPROTONOSUPPORT; sk = sk_alloc(net, PF_KEY, GFP_KERNEL, &key_proto, kern); if (sk == NULL) return -ENOMEM; pfk = pfkey_sk(sk); mutex_init(&pfk->dump_lock); sock->ops = &pfkey_ops; sock_init_data(sock, sk); sk->sk_family = PF_KEY; sk->sk_destruct = pfkey_sock_destruct; atomic_inc(&net_pfkey->socks_nr); pfkey_insert(sk); return 0; } static int pfkey_release(struct socket *sock) { struct sock *sk = sock->sk; if (!sk) return 0; pfkey_remove(sk); sock_orphan(sk); sock->sk = NULL; skb_queue_purge(&sk->sk_write_queue); synchronize_rcu(); sock_put(sk); return 0; } static int pfkey_broadcast_one(struct sk_buff *skb, gfp_t allocation, struct sock *sk) { int err = -ENOBUFS; if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) return err; skb = skb_clone(skb, allocation); if (skb) { skb_set_owner_r(skb, sk); skb_queue_tail(&sk->sk_receive_queue, skb); sk->sk_data_ready(sk); err = 0; } return err; } /* Send SKB to all pfkey sockets matching selected criteria. 
*/ #define BROADCAST_ALL 0 #define BROADCAST_ONE 1 #define BROADCAST_REGISTERED 2 #define BROADCAST_PROMISC_ONLY 4 static int pfkey_broadcast(struct sk_buff *skb, gfp_t allocation, int broadcast_flags, struct sock *one_sk, struct net *net) { struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id); struct sock *sk; int err = -ESRCH; /* XXX Do we need something like netlink_overrun? I think * XXX PF_KEY socket apps will not mind current behavior. */ if (!skb) return -ENOMEM; rcu_read_lock(); sk_for_each_rcu(sk, &net_pfkey->table) { struct pfkey_sock *pfk = pfkey_sk(sk); int err2; /* Yes, it means that if you are meant to receive this * pfkey message you receive it twice as promiscuous * socket. */ if (pfk->promisc) pfkey_broadcast_one(skb, GFP_ATOMIC, sk); /* the exact target will be processed later */ if (sk == one_sk) continue; if (broadcast_flags != BROADCAST_ALL) { if (broadcast_flags & BROADCAST_PROMISC_ONLY) continue; if ((broadcast_flags & BROADCAST_REGISTERED) && !pfk->registered) continue; if (broadcast_flags & BROADCAST_ONE) continue; } err2 = pfkey_broadcast_one(skb, GFP_ATOMIC, sk); /* Error is cleared after successful sending to at least one * registered KM */ if ((broadcast_flags & BROADCAST_REGISTERED) && err) err = err2; } rcu_read_unlock(); if (one_sk != NULL) err = pfkey_broadcast_one(skb, allocation, one_sk); kfree_skb(skb); return err; } static int pfkey_do_dump(struct pfkey_sock *pfk) { struct sadb_msg *hdr; int rc; mutex_lock(&pfk->dump_lock); if (!pfk->dump.dump) { rc = 0; goto out; } rc = pfk->dump.dump(pfk); if (rc == -ENOBUFS) { rc = 0; goto out; } if (pfk->dump.skb) { if (!pfkey_can_dump(&pfk->sk)) { rc = 0; goto out; } hdr = (struct sadb_msg *) pfk->dump.skb->data; hdr->sadb_msg_seq = 0; hdr->sadb_msg_errno = rc; pfkey_broadcast(pfk->dump.skb, GFP_ATOMIC, BROADCAST_ONE, &pfk->sk, sock_net(&pfk->sk)); pfk->dump.skb = NULL; } pfkey_terminate_dump(pfk); out: mutex_unlock(&pfk->dump_lock); return rc; } static inline void pfkey_hdr_dup(struct sadb_msg *new, const struct sadb_msg *orig) { *new = *orig; } static int pfkey_error(const struct sadb_msg *orig, int err, struct sock *sk) { struct sk_buff *skb = alloc_skb(sizeof(struct sadb_msg) + 16, GFP_KERNEL); struct sadb_msg *hdr; if (!skb) return -ENOBUFS; /* Woe be to the platform trying to support PFKEY yet * having normal errnos outside the 1-255 range, inclusive. 
*/ err = -err; if (err == ERESTARTSYS || err == ERESTARTNOHAND || err == ERESTARTNOINTR) err = EINTR; if (err >= 512) err = EINVAL; BUG_ON(err <= 0 || err >= 256); hdr = skb_put(skb, sizeof(struct sadb_msg)); pfkey_hdr_dup(hdr, orig); hdr->sadb_msg_errno = (uint8_t) err; hdr->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t)); pfkey_broadcast(skb, GFP_KERNEL, BROADCAST_ONE, sk, sock_net(sk)); return 0; } static const u8 sadb_ext_min_len[] = { [SADB_EXT_RESERVED] = (u8) 0, [SADB_EXT_SA] = (u8) sizeof(struct sadb_sa), [SADB_EXT_LIFETIME_CURRENT] = (u8) sizeof(struct sadb_lifetime), [SADB_EXT_LIFETIME_HARD] = (u8) sizeof(struct sadb_lifetime), [SADB_EXT_LIFETIME_SOFT] = (u8) sizeof(struct sadb_lifetime), [SADB_EXT_ADDRESS_SRC] = (u8) sizeof(struct sadb_address), [SADB_EXT_ADDRESS_DST] = (u8) sizeof(struct sadb_address), [SADB_EXT_ADDRESS_PROXY] = (u8) sizeof(struct sadb_address), [SADB_EXT_KEY_AUTH] = (u8) sizeof(struct sadb_key), [SADB_EXT_KEY_ENCRYPT] = (u8) sizeof(struct sadb_key), [SADB_EXT_IDENTITY_SRC] = (u8) sizeof(struct sadb_ident), [SADB_EXT_IDENTITY_DST] = (u8) sizeof(struct sadb_ident), [SADB_EXT_SENSITIVITY] = (u8) sizeof(struct sadb_sens), [SADB_EXT_PROPOSAL] = (u8) sizeof(struct sadb_prop), [SADB_EXT_SUPPORTED_AUTH] = (u8) sizeof(struct sadb_supported), [SADB_EXT_SUPPORTED_ENCRYPT] = (u8) sizeof(struct sadb_supported), [SADB_EXT_SPIRANGE] = (u8) sizeof(struct sadb_spirange), [SADB_X_EXT_KMPRIVATE] = (u8) sizeof(struct sadb_x_kmprivate), [SADB_X_EXT_POLICY] = (u8) sizeof(struct sadb_x_policy), [SADB_X_EXT_SA2] = (u8) sizeof(struct sadb_x_sa2), [SADB_X_EXT_NAT_T_TYPE] = (u8) sizeof(struct sadb_x_nat_t_type), [SADB_X_EXT_NAT_T_SPORT] = (u8) sizeof(struct sadb_x_nat_t_port), [SADB_X_EXT_NAT_T_DPORT] = (u8) sizeof(struct sadb_x_nat_t_port), [SADB_X_EXT_NAT_T_OA] = (u8) sizeof(struct sadb_address), [SADB_X_EXT_SEC_CTX] = (u8) sizeof(struct sadb_x_sec_ctx), [SADB_X_EXT_KMADDRESS] = (u8) sizeof(struct sadb_x_kmaddress), [SADB_X_EXT_FILTER] = (u8) sizeof(struct sadb_x_filter), }; /* Verify sadb_address_{len,prefixlen} against sa_family. */ static int verify_address_len(const void *p) { const struct sadb_address *sp = p; const struct sockaddr *addr = (const struct sockaddr *)(sp + 1); const struct sockaddr_in *sin; #if IS_ENABLED(CONFIG_IPV6) const struct sockaddr_in6 *sin6; #endif int len; if (sp->sadb_address_len < DIV_ROUND_UP(sizeof(*sp) + offsetofend(typeof(*addr), sa_family), sizeof(uint64_t))) return -EINVAL; switch (addr->sa_family) { case AF_INET: len = DIV_ROUND_UP(sizeof(*sp) + sizeof(*sin), sizeof(uint64_t)); if (sp->sadb_address_len != len || sp->sadb_address_prefixlen > 32) return -EINVAL; break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: len = DIV_ROUND_UP(sizeof(*sp) + sizeof(*sin6), sizeof(uint64_t)); if (sp->sadb_address_len != len || sp->sadb_address_prefixlen > 128) return -EINVAL; break; #endif default: /* It is user using kernel to keep track of security * associations for another protocol, such as * OSPF/RSVP/RIPV2/MIP. It is user's job to verify * lengths. * * XXX Actually, association/policy database is not yet * XXX able to cope with arbitrary sockaddr families. * XXX When it can, remove this -EINVAL. 
-DaveM */ return -EINVAL; } return 0; } static inline int sadb_key_len(const struct sadb_key *key) { int key_bytes = DIV_ROUND_UP(key->sadb_key_bits, 8); return DIV_ROUND_UP(sizeof(struct sadb_key) + key_bytes, sizeof(uint64_t)); } static int verify_key_len(const void *p) { const struct sadb_key *key = p; if (sadb_key_len(key) > key->sadb_key_len) return -EINVAL; return 0; } static inline int pfkey_sec_ctx_len(const struct sadb_x_sec_ctx *sec_ctx) { return DIV_ROUND_UP(sizeof(struct sadb_x_sec_ctx) + sec_ctx->sadb_x_ctx_len, sizeof(uint64_t)); } static inline int verify_sec_ctx_len(const void *p) { const struct sadb_x_sec_ctx *sec_ctx = p; int len = sec_ctx->sadb_x_ctx_len; if (len > PAGE_SIZE) return -EINVAL; len = pfkey_sec_ctx_len(sec_ctx); if (sec_ctx->sadb_x_sec_len != len) return -EINVAL; return 0; } static inline struct xfrm_user_sec_ctx *pfkey_sadb2xfrm_user_sec_ctx(const struct sadb_x_sec_ctx *sec_ctx, gfp_t gfp) { struct xfrm_user_sec_ctx *uctx = NULL; int ctx_size = sec_ctx->sadb_x_ctx_len; uctx = kmalloc((sizeof(*uctx)+ctx_size), gfp); if (!uctx) return NULL; uctx->len = pfkey_sec_ctx_len(sec_ctx); uctx->exttype = sec_ctx->sadb_x_sec_exttype; uctx->ctx_doi = sec_ctx->sadb_x_ctx_doi; uctx->ctx_alg = sec_ctx->sadb_x_ctx_alg; uctx->ctx_len = sec_ctx->sadb_x_ctx_len; memcpy(uctx + 1, sec_ctx + 1, uctx->ctx_len); return uctx; } static int present_and_same_family(const struct sadb_address *src, const struct sadb_address *dst) { const struct sockaddr *s_addr, *d_addr; if (!src || !dst) return 0; s_addr = (const struct sockaddr *)(src + 1); d_addr = (const struct sockaddr *)(dst + 1); if (s_addr->sa_family != d_addr->sa_family) return 0; if (s_addr->sa_family != AF_INET #if IS_ENABLED(CONFIG_IPV6) && s_addr->sa_family != AF_INET6 #endif ) return 0; return 1; } static int parse_exthdrs(struct sk_buff *skb, const struct sadb_msg *hdr, void **ext_hdrs) { const char *p = (char *) hdr; int len = skb->len; len -= sizeof(*hdr); p += sizeof(*hdr); while (len > 0) { const struct sadb_ext *ehdr = (const struct sadb_ext *) p; uint16_t ext_type; int ext_len; if (len < sizeof(*ehdr)) return -EINVAL; ext_len = ehdr->sadb_ext_len; ext_len *= sizeof(uint64_t); ext_type = ehdr->sadb_ext_type; if (ext_len < sizeof(uint64_t) || ext_len > len || ext_type == SADB_EXT_RESERVED) return -EINVAL; if (ext_type <= SADB_EXT_MAX) { int min = (int) sadb_ext_min_len[ext_type]; if (ext_len < min) return -EINVAL; if (ext_hdrs[ext_type-1] != NULL) return -EINVAL; switch (ext_type) { case SADB_EXT_ADDRESS_SRC: case SADB_EXT_ADDRESS_DST: case SADB_EXT_ADDRESS_PROXY: case SADB_X_EXT_NAT_T_OA: if (verify_address_len(p)) return -EINVAL; break; case SADB_X_EXT_SEC_CTX: if (verify_sec_ctx_len(p)) return -EINVAL; break; case SADB_EXT_KEY_AUTH: case SADB_EXT_KEY_ENCRYPT: if (verify_key_len(p)) return -EINVAL; break; default: break; } ext_hdrs[ext_type-1] = (void *) p; } p += ext_len; len -= ext_len; } return 0; } static uint16_t pfkey_satype2proto(uint8_t satype) { switch (satype) { case SADB_SATYPE_UNSPEC: return IPSEC_PROTO_ANY; case SADB_SATYPE_AH: return IPPROTO_AH; case SADB_SATYPE_ESP: return IPPROTO_ESP; case SADB_X_SATYPE_IPCOMP: return IPPROTO_COMP; default: return 0; } /* NOTREACHED */ } static uint8_t pfkey_proto2satype(uint16_t proto) { switch (proto) { case IPPROTO_AH: return SADB_SATYPE_AH; case IPPROTO_ESP: return SADB_SATYPE_ESP; case IPPROTO_COMP: return SADB_X_SATYPE_IPCOMP; default: return 0; } /* NOTREACHED */ } /* BTW, this scheme means that there is no way with PFKEY2 sockets to * say specifically 'just 
raw sockets' as we encode them as 255. */ static uint8_t pfkey_proto_to_xfrm(uint8_t proto) { return proto == IPSEC_PROTO_ANY ? 0 : proto; } static uint8_t pfkey_proto_from_xfrm(uint8_t proto) { return proto ? proto : IPSEC_PROTO_ANY; } static inline int pfkey_sockaddr_len(sa_family_t family) { switch (family) { case AF_INET: return sizeof(struct sockaddr_in); #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: return sizeof(struct sockaddr_in6); #endif } return 0; } static int pfkey_sockaddr_extract(const struct sockaddr *sa, xfrm_address_t *xaddr) { switch (sa->sa_family) { case AF_INET: xaddr->a4 = ((struct sockaddr_in *)sa)->sin_addr.s_addr; return AF_INET; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: memcpy(xaddr->a6, &((struct sockaddr_in6 *)sa)->sin6_addr, sizeof(struct in6_addr)); return AF_INET6; #endif } return 0; } static int pfkey_sadb_addr2xfrm_addr(const struct sadb_address *addr, xfrm_address_t *xaddr) { return pfkey_sockaddr_extract((struct sockaddr *)(addr + 1), xaddr); } static struct xfrm_state *pfkey_xfrm_state_lookup(struct net *net, const struct sadb_msg *hdr, void * const *ext_hdrs) { const struct sadb_sa *sa; const struct sadb_address *addr; uint16_t proto; unsigned short family; xfrm_address_t *xaddr; sa = ext_hdrs[SADB_EXT_SA - 1]; if (sa == NULL) return NULL; proto = pfkey_satype2proto(hdr->sadb_msg_satype); if (proto == 0) return NULL; /* sadb_address_len should be checked by caller */ addr = ext_hdrs[SADB_EXT_ADDRESS_DST - 1]; if (addr == NULL) return NULL; family = ((const struct sockaddr *)(addr + 1))->sa_family; switch (family) { case AF_INET: xaddr = (xfrm_address_t *)&((const struct sockaddr_in *)(addr + 1))->sin_addr; break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: xaddr = (xfrm_address_t *)&((const struct sockaddr_in6 *)(addr + 1))->sin6_addr; break; #endif default: xaddr = NULL; } if (!xaddr) return NULL; return xfrm_state_lookup(net, DUMMY_MARK, xaddr, sa->sadb_sa_spi, proto, family); } #define PFKEY_ALIGN8(a) (1 + (((a) - 1) | (8 - 1))) static int pfkey_sockaddr_size(sa_family_t family) { return PFKEY_ALIGN8(pfkey_sockaddr_len(family)); } static inline int pfkey_mode_from_xfrm(int mode) { switch(mode) { case XFRM_MODE_TRANSPORT: return IPSEC_MODE_TRANSPORT; case XFRM_MODE_TUNNEL: return IPSEC_MODE_TUNNEL; case XFRM_MODE_BEET: return IPSEC_MODE_BEET; default: return -1; } } static inline int pfkey_mode_to_xfrm(int mode) { switch(mode) { case IPSEC_MODE_ANY: /*XXX*/ case IPSEC_MODE_TRANSPORT: return XFRM_MODE_TRANSPORT; case IPSEC_MODE_TUNNEL: return XFRM_MODE_TUNNEL; case IPSEC_MODE_BEET: return XFRM_MODE_BEET; default: return -1; } } static unsigned int pfkey_sockaddr_fill(const xfrm_address_t *xaddr, __be16 port, struct sockaddr *sa, unsigned short family) { switch (family) { case AF_INET: { struct sockaddr_in *sin = (struct sockaddr_in *)sa; sin->sin_family = AF_INET; sin->sin_port = port; sin->sin_addr.s_addr = xaddr->a4; memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); return 32; } #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: { struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sa; sin6->sin6_family = AF_INET6; sin6->sin6_port = port; sin6->sin6_flowinfo = 0; sin6->sin6_addr = xaddr->in6; sin6->sin6_scope_id = 0; return 128; } #endif } return 0; } static struct sk_buff *__pfkey_xfrm_state2msg(const struct xfrm_state *x, int add_keys, int hsc) { struct sk_buff *skb; struct sadb_msg *hdr; struct sadb_sa *sa; struct sadb_lifetime *lifetime; struct sadb_address *addr; struct sadb_key *key; struct sadb_x_sa2 *sa2; struct sadb_x_sec_ctx *sec_ctx; struct 
xfrm_sec_ctx *xfrm_ctx; int ctx_size = 0; int size; int auth_key_size = 0; int encrypt_key_size = 0; int sockaddr_size; struct xfrm_encap_tmpl *natt = NULL; int mode; /* address family check */ sockaddr_size = pfkey_sockaddr_size(x->props.family); if (!sockaddr_size) return ERR_PTR(-EINVAL); /* base, SA, (lifetime (HSC),) address(SD), (address(P),) key(AE), (identity(SD),) (sensitivity)> */ size = sizeof(struct sadb_msg) +sizeof(struct sadb_sa) + sizeof(struct sadb_lifetime) + ((hsc & 1) ? sizeof(struct sadb_lifetime) : 0) + ((hsc & 2) ? sizeof(struct sadb_lifetime) : 0) + sizeof(struct sadb_address)*2 + sockaddr_size*2 + sizeof(struct sadb_x_sa2); if ((xfrm_ctx = x->security)) { ctx_size = PFKEY_ALIGN8(xfrm_ctx->ctx_len); size += sizeof(struct sadb_x_sec_ctx) + ctx_size; } /* identity & sensitivity */ if (!xfrm_addr_equal(&x->sel.saddr, &x->props.saddr, x->props.family)) size += sizeof(struct sadb_address) + sockaddr_size; if (add_keys) { if (x->aalg && x->aalg->alg_key_len) { auth_key_size = PFKEY_ALIGN8((x->aalg->alg_key_len + 7) / 8); size += sizeof(struct sadb_key) + auth_key_size; } if (x->ealg && x->ealg->alg_key_len) { encrypt_key_size = PFKEY_ALIGN8((x->ealg->alg_key_len+7) / 8); size += sizeof(struct sadb_key) + encrypt_key_size; } } if (x->encap) natt = x->encap; if (natt && natt->encap_type) { size += sizeof(struct sadb_x_nat_t_type); size += sizeof(struct sadb_x_nat_t_port); size += sizeof(struct sadb_x_nat_t_port); } skb = alloc_skb(size + 16, GFP_ATOMIC); if (skb == NULL) return ERR_PTR(-ENOBUFS); /* call should fill header later */ hdr = skb_put(skb, sizeof(struct sadb_msg)); memset(hdr, 0, size); /* XXX do we need this ? */ hdr->sadb_msg_len = size / sizeof(uint64_t); /* sa */ sa = skb_put(skb, sizeof(struct sadb_sa)); sa->sadb_sa_len = sizeof(struct sadb_sa)/sizeof(uint64_t); sa->sadb_sa_exttype = SADB_EXT_SA; sa->sadb_sa_spi = x->id.spi; sa->sadb_sa_replay = x->props.replay_window; switch (x->km.state) { case XFRM_STATE_VALID: sa->sadb_sa_state = x->km.dying ? SADB_SASTATE_DYING : SADB_SASTATE_MATURE; break; case XFRM_STATE_ACQ: sa->sadb_sa_state = SADB_SASTATE_LARVAL; break; default: sa->sadb_sa_state = SADB_SASTATE_DEAD; break; } sa->sadb_sa_auth = 0; if (x->aalg) { struct xfrm_algo_desc *a = xfrm_aalg_get_byname(x->aalg->alg_name, 0); sa->sadb_sa_auth = (a && a->pfkey_supported) ? a->desc.sadb_alg_id : 0; } sa->sadb_sa_encrypt = 0; BUG_ON(x->ealg && x->calg); if (x->ealg) { struct xfrm_algo_desc *a = xfrm_ealg_get_byname(x->ealg->alg_name, 0); sa->sadb_sa_encrypt = (a && a->pfkey_supported) ? a->desc.sadb_alg_id : 0; } /* KAME compatible: sadb_sa_encrypt is overloaded with calg id */ if (x->calg) { struct xfrm_algo_desc *a = xfrm_calg_get_byname(x->calg->alg_name, 0); sa->sadb_sa_encrypt = (a && a->pfkey_supported) ? 
a->desc.sadb_alg_id : 0; } sa->sadb_sa_flags = 0; if (x->props.flags & XFRM_STATE_NOECN) sa->sadb_sa_flags |= SADB_SAFLAGS_NOECN; if (x->props.flags & XFRM_STATE_DECAP_DSCP) sa->sadb_sa_flags |= SADB_SAFLAGS_DECAP_DSCP; if (x->props.flags & XFRM_STATE_NOPMTUDISC) sa->sadb_sa_flags |= SADB_SAFLAGS_NOPMTUDISC; /* hard time */ if (hsc & 2) { lifetime = skb_put(skb, sizeof(struct sadb_lifetime)); lifetime->sadb_lifetime_len = sizeof(struct sadb_lifetime)/sizeof(uint64_t); lifetime->sadb_lifetime_exttype = SADB_EXT_LIFETIME_HARD; lifetime->sadb_lifetime_allocations = _X2KEY(x->lft.hard_packet_limit); lifetime->sadb_lifetime_bytes = _X2KEY(x->lft.hard_byte_limit); lifetime->sadb_lifetime_addtime = x->lft.hard_add_expires_seconds; lifetime->sadb_lifetime_usetime = x->lft.hard_use_expires_seconds; } /* soft time */ if (hsc & 1) { lifetime = skb_put(skb, sizeof(struct sadb_lifetime)); lifetime->sadb_lifetime_len = sizeof(struct sadb_lifetime)/sizeof(uint64_t); lifetime->sadb_lifetime_exttype = SADB_EXT_LIFETIME_SOFT; lifetime->sadb_lifetime_allocations = _X2KEY(x->lft.soft_packet_limit); lifetime->sadb_lifetime_bytes = _X2KEY(x->lft.soft_byte_limit); lifetime->sadb_lifetime_addtime = x->lft.soft_add_expires_seconds; lifetime->sadb_lifetime_usetime = x->lft.soft_use_expires_seconds; } /* current time */ lifetime = skb_put(skb, sizeof(struct sadb_lifetime)); lifetime->sadb_lifetime_len = sizeof(struct sadb_lifetime)/sizeof(uint64_t); lifetime->sadb_lifetime_exttype = SADB_EXT_LIFETIME_CURRENT; lifetime->sadb_lifetime_allocations = x->curlft.packets; lifetime->sadb_lifetime_bytes = x->curlft.bytes; lifetime->sadb_lifetime_addtime = x->curlft.add_time; lifetime->sadb_lifetime_usetime = x->curlft.use_time; /* src address */ addr = skb_put(skb, sizeof(struct sadb_address) + sockaddr_size); addr->sadb_address_len = (sizeof(struct sadb_address)+sockaddr_size)/ sizeof(uint64_t); addr->sadb_address_exttype = SADB_EXT_ADDRESS_SRC; /* "if the ports are non-zero, then the sadb_address_proto field, normally zero, MUST be filled in with the transport protocol's number." 
- RFC2367 */ addr->sadb_address_proto = 0; addr->sadb_address_reserved = 0; addr->sadb_address_prefixlen = pfkey_sockaddr_fill(&x->props.saddr, 0, (struct sockaddr *) (addr + 1), x->props.family); BUG_ON(!addr->sadb_address_prefixlen); /* dst address */ addr = skb_put(skb, sizeof(struct sadb_address) + sockaddr_size); addr->sadb_address_len = (sizeof(struct sadb_address)+sockaddr_size)/ sizeof(uint64_t); addr->sadb_address_exttype = SADB_EXT_ADDRESS_DST; addr->sadb_address_proto = 0; addr->sadb_address_reserved = 0; addr->sadb_address_prefixlen = pfkey_sockaddr_fill(&x->id.daddr, 0, (struct sockaddr *) (addr + 1), x->props.family); BUG_ON(!addr->sadb_address_prefixlen); if (!xfrm_addr_equal(&x->sel.saddr, &x->props.saddr, x->props.family)) { addr = skb_put(skb, sizeof(struct sadb_address) + sockaddr_size); addr->sadb_address_len = (sizeof(struct sadb_address)+sockaddr_size)/ sizeof(uint64_t); addr->sadb_address_exttype = SADB_EXT_ADDRESS_PROXY; addr->sadb_address_proto = pfkey_proto_from_xfrm(x->sel.proto); addr->sadb_address_prefixlen = x->sel.prefixlen_s; addr->sadb_address_reserved = 0; pfkey_sockaddr_fill(&x->sel.saddr, x->sel.sport, (struct sockaddr *) (addr + 1), x->props.family); } /* auth key */ if (add_keys && auth_key_size) { key = skb_put(skb, sizeof(struct sadb_key) + auth_key_size); key->sadb_key_len = (sizeof(struct sadb_key) + auth_key_size) / sizeof(uint64_t); key->sadb_key_exttype = SADB_EXT_KEY_AUTH; key->sadb_key_bits = x->aalg->alg_key_len; key->sadb_key_reserved = 0; memcpy(key + 1, x->aalg->alg_key, (x->aalg->alg_key_len+7)/8); } /* encrypt key */ if (add_keys && encrypt_key_size) { key = skb_put(skb, sizeof(struct sadb_key) + encrypt_key_size); key->sadb_key_len = (sizeof(struct sadb_key) + encrypt_key_size) / sizeof(uint64_t); key->sadb_key_exttype = SADB_EXT_KEY_ENCRYPT; key->sadb_key_bits = x->ealg->alg_key_len; key->sadb_key_reserved = 0; memcpy(key + 1, x->ealg->alg_key, (x->ealg->alg_key_len+7)/8); } /* sa */ sa2 = skb_put(skb, sizeof(struct sadb_x_sa2)); sa2->sadb_x_sa2_len = sizeof(struct sadb_x_sa2)/sizeof(uint64_t); sa2->sadb_x_sa2_exttype = SADB_X_EXT_SA2; if ((mode = pfkey_mode_from_xfrm(x->props.mode)) < 0) { kfree_skb(skb); return ERR_PTR(-EINVAL); } sa2->sadb_x_sa2_mode = mode; sa2->sadb_x_sa2_reserved1 = 0; sa2->sadb_x_sa2_reserved2 = 0; sa2->sadb_x_sa2_sequence = 0; sa2->sadb_x_sa2_reqid = x->props.reqid; if (natt && natt->encap_type) { struct sadb_x_nat_t_type *n_type; struct sadb_x_nat_t_port *n_port; /* type */ n_type = skb_put(skb, sizeof(*n_type)); n_type->sadb_x_nat_t_type_len = sizeof(*n_type)/sizeof(uint64_t); n_type->sadb_x_nat_t_type_exttype = SADB_X_EXT_NAT_T_TYPE; n_type->sadb_x_nat_t_type_type = natt->encap_type; n_type->sadb_x_nat_t_type_reserved[0] = 0; n_type->sadb_x_nat_t_type_reserved[1] = 0; n_type->sadb_x_nat_t_type_reserved[2] = 0; /* source port */ n_port = skb_put(skb, sizeof(*n_port)); n_port->sadb_x_nat_t_port_len = sizeof(*n_port)/sizeof(uint64_t); n_port->sadb_x_nat_t_port_exttype = SADB_X_EXT_NAT_T_SPORT; n_port->sadb_x_nat_t_port_port = natt->encap_sport; n_port->sadb_x_nat_t_port_reserved = 0; /* dest port */ n_port = skb_put(skb, sizeof(*n_port)); n_port->sadb_x_nat_t_port_len = sizeof(*n_port)/sizeof(uint64_t); n_port->sadb_x_nat_t_port_exttype = SADB_X_EXT_NAT_T_DPORT; n_port->sadb_x_nat_t_port_port = natt->encap_dport; n_port->sadb_x_nat_t_port_reserved = 0; } /* security context */ if (xfrm_ctx) { sec_ctx = skb_put(skb, sizeof(struct sadb_x_sec_ctx) + ctx_size); sec_ctx->sadb_x_sec_len = (sizeof(struct 
sadb_x_sec_ctx) + ctx_size) / sizeof(uint64_t); sec_ctx->sadb_x_sec_exttype = SADB_X_EXT_SEC_CTX; sec_ctx->sadb_x_ctx_doi = xfrm_ctx->ctx_doi; sec_ctx->sadb_x_ctx_alg = xfrm_ctx->ctx_alg; sec_ctx->sadb_x_ctx_len = xfrm_ctx->ctx_len; memcpy(sec_ctx + 1, xfrm_ctx->ctx_str, xfrm_ctx->ctx_len); } return skb; } static inline struct sk_buff *pfkey_xfrm_state2msg(const struct xfrm_state *x) { struct sk_buff *skb; skb = __pfkey_xfrm_state2msg(x, 1, 3); return skb; } static inline struct sk_buff *pfkey_xfrm_state2msg_expire(const struct xfrm_state *x, int hsc) { return __pfkey_xfrm_state2msg(x, 0, hsc); } static struct xfrm_state * pfkey_msg2xfrm_state(struct net *net, const struct sadb_msg *hdr, void * const *ext_hdrs) { struct xfrm_state *x; const struct sadb_lifetime *lifetime; const struct sadb_sa *sa; const struct sadb_key *key; const struct sadb_x_sec_ctx *sec_ctx; uint16_t proto; int err; sa = ext_hdrs[SADB_EXT_SA - 1]; if (!sa || !present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1], ext_hdrs[SADB_EXT_ADDRESS_DST-1])) return ERR_PTR(-EINVAL); if (hdr->sadb_msg_satype == SADB_SATYPE_ESP && !ext_hdrs[SADB_EXT_KEY_ENCRYPT-1]) return ERR_PTR(-EINVAL); if (hdr->sadb_msg_satype == SADB_SATYPE_AH && !ext_hdrs[SADB_EXT_KEY_AUTH-1]) return ERR_PTR(-EINVAL); if (!!ext_hdrs[SADB_EXT_LIFETIME_HARD-1] != !!ext_hdrs[SADB_EXT_LIFETIME_SOFT-1]) return ERR_PTR(-EINVAL); proto = pfkey_satype2proto(hdr->sadb_msg_satype); if (proto == 0) return ERR_PTR(-EINVAL); /* default error is no buffer space */ err = -ENOBUFS; /* RFC2367: Only SADB_SASTATE_MATURE SAs may be submitted in an SADB_ADD message. SADB_SASTATE_LARVAL SAs are created by SADB_GETSPI and it is not sensible to add a new SA in the DYING or SADB_SASTATE_DEAD state. Therefore, the sadb_sa_state field of all submitted SAs MUST be SADB_SASTATE_MATURE and the kernel MUST return an error if this is not true. However, KAME setkey always uses SADB_SASTATE_LARVAL. Hence, we have to _ignore_ sadb_sa_state, which is also reasonable. 
*/ if (sa->sadb_sa_auth > SADB_AALG_MAX || (hdr->sadb_msg_satype == SADB_X_SATYPE_IPCOMP && sa->sadb_sa_encrypt > SADB_X_CALG_MAX) || sa->sadb_sa_encrypt > SADB_EALG_MAX) return ERR_PTR(-EINVAL); key = ext_hdrs[SADB_EXT_KEY_AUTH - 1]; if (key != NULL && sa->sadb_sa_auth != SADB_X_AALG_NULL && key->sadb_key_bits == 0) return ERR_PTR(-EINVAL); key = ext_hdrs[SADB_EXT_KEY_ENCRYPT-1]; if (key != NULL && sa->sadb_sa_encrypt != SADB_EALG_NULL && key->sadb_key_bits == 0) return ERR_PTR(-EINVAL); x = xfrm_state_alloc(net); if (x == NULL) return ERR_PTR(-ENOBUFS); x->id.proto = proto; x->id.spi = sa->sadb_sa_spi; x->props.replay_window = min_t(unsigned int, sa->sadb_sa_replay, (sizeof(x->replay.bitmap) * 8)); if (sa->sadb_sa_flags & SADB_SAFLAGS_NOECN) x->props.flags |= XFRM_STATE_NOECN; if (sa->sadb_sa_flags & SADB_SAFLAGS_DECAP_DSCP) x->props.flags |= XFRM_STATE_DECAP_DSCP; if (sa->sadb_sa_flags & SADB_SAFLAGS_NOPMTUDISC) x->props.flags |= XFRM_STATE_NOPMTUDISC; lifetime = ext_hdrs[SADB_EXT_LIFETIME_HARD - 1]; if (lifetime != NULL) { x->lft.hard_packet_limit = _KEY2X(lifetime->sadb_lifetime_allocations); x->lft.hard_byte_limit = _KEY2X(lifetime->sadb_lifetime_bytes); x->lft.hard_add_expires_seconds = lifetime->sadb_lifetime_addtime; x->lft.hard_use_expires_seconds = lifetime->sadb_lifetime_usetime; } lifetime = ext_hdrs[SADB_EXT_LIFETIME_SOFT - 1]; if (lifetime != NULL) { x->lft.soft_packet_limit = _KEY2X(lifetime->sadb_lifetime_allocations); x->lft.soft_byte_limit = _KEY2X(lifetime->sadb_lifetime_bytes); x->lft.soft_add_expires_seconds = lifetime->sadb_lifetime_addtime; x->lft.soft_use_expires_seconds = lifetime->sadb_lifetime_usetime; } sec_ctx = ext_hdrs[SADB_X_EXT_SEC_CTX - 1]; if (sec_ctx != NULL) { struct xfrm_user_sec_ctx *uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx, GFP_KERNEL); if (!uctx) goto out; err = security_xfrm_state_alloc(x, uctx); kfree(uctx); if (err) goto out; } err = -ENOBUFS; key = ext_hdrs[SADB_EXT_KEY_AUTH - 1]; if (sa->sadb_sa_auth) { int keysize = 0; struct xfrm_algo_desc *a = xfrm_aalg_get_byid(sa->sadb_sa_auth); if (!a || !a->pfkey_supported) { err = -ENOSYS; goto out; } if (key) keysize = (key->sadb_key_bits + 7) / 8; x->aalg = kmalloc(sizeof(*x->aalg) + keysize, GFP_KERNEL); if (!x->aalg) { err = -ENOMEM; goto out; } strcpy(x->aalg->alg_name, a->name); x->aalg->alg_key_len = 0; if (key) { x->aalg->alg_key_len = key->sadb_key_bits; memcpy(x->aalg->alg_key, key+1, keysize); } x->aalg->alg_trunc_len = a->uinfo.auth.icv_truncbits; x->props.aalgo = sa->sadb_sa_auth; /* x->algo.flags = sa->sadb_sa_flags; */ } if (sa->sadb_sa_encrypt) { if (hdr->sadb_msg_satype == SADB_X_SATYPE_IPCOMP) { struct xfrm_algo_desc *a = xfrm_calg_get_byid(sa->sadb_sa_encrypt); if (!a || !a->pfkey_supported) { err = -ENOSYS; goto out; } x->calg = kmalloc(sizeof(*x->calg), GFP_KERNEL); if (!x->calg) { err = -ENOMEM; goto out; } strcpy(x->calg->alg_name, a->name); x->props.calgo = sa->sadb_sa_encrypt; } else { int keysize = 0; struct xfrm_algo_desc *a = xfrm_ealg_get_byid(sa->sadb_sa_encrypt); if (!a || !a->pfkey_supported) { err = -ENOSYS; goto out; } key = (struct sadb_key*) ext_hdrs[SADB_EXT_KEY_ENCRYPT-1]; if (key) keysize = (key->sadb_key_bits + 7) / 8; x->ealg = kmalloc(sizeof(*x->ealg) + keysize, GFP_KERNEL); if (!x->ealg) { err = -ENOMEM; goto out; } strcpy(x->ealg->alg_name, a->name); x->ealg->alg_key_len = 0; if (key) { x->ealg->alg_key_len = key->sadb_key_bits; memcpy(x->ealg->alg_key, key+1, keysize); } x->props.ealgo = sa->sadb_sa_encrypt; x->geniv = a->uinfo.encr.geniv; } } /* 
x->algo.flags = sa->sadb_sa_flags; */ x->props.family = pfkey_sadb_addr2xfrm_addr((struct sadb_address *) ext_hdrs[SADB_EXT_ADDRESS_SRC-1], &x->props.saddr); pfkey_sadb_addr2xfrm_addr((struct sadb_address *) ext_hdrs[SADB_EXT_ADDRESS_DST-1], &x->id.daddr); if (ext_hdrs[SADB_X_EXT_SA2-1]) { const struct sadb_x_sa2 *sa2 = ext_hdrs[SADB_X_EXT_SA2-1]; int mode = pfkey_mode_to_xfrm(sa2->sadb_x_sa2_mode); if (mode < 0) { err = -EINVAL; goto out; } x->props.mode = mode; x->props.reqid = sa2->sadb_x_sa2_reqid; } if (ext_hdrs[SADB_EXT_ADDRESS_PROXY-1]) { const struct sadb_address *addr = ext_hdrs[SADB_EXT_ADDRESS_PROXY-1]; /* Nobody uses this, but we try. */ x->sel.family = pfkey_sadb_addr2xfrm_addr(addr, &x->sel.saddr); x->sel.prefixlen_s = addr->sadb_address_prefixlen; } if (!x->sel.family) x->sel.family = x->props.family; if (ext_hdrs[SADB_X_EXT_NAT_T_TYPE-1]) { const struct sadb_x_nat_t_type* n_type; struct xfrm_encap_tmpl *natt; x->encap = kzalloc(sizeof(*x->encap), GFP_KERNEL); if (!x->encap) { err = -ENOMEM; goto out; } natt = x->encap; n_type = ext_hdrs[SADB_X_EXT_NAT_T_TYPE-1]; natt->encap_type = n_type->sadb_x_nat_t_type_type; if (ext_hdrs[SADB_X_EXT_NAT_T_SPORT-1]) { const struct sadb_x_nat_t_port *n_port = ext_hdrs[SADB_X_EXT_NAT_T_SPORT-1]; natt->encap_sport = n_port->sadb_x_nat_t_port_port; } if (ext_hdrs[SADB_X_EXT_NAT_T_DPORT-1]) { const struct sadb_x_nat_t_port *n_port = ext_hdrs[SADB_X_EXT_NAT_T_DPORT-1]; natt->encap_dport = n_port->sadb_x_nat_t_port_port; } } err = xfrm_init_state(x); if (err) goto out; x->km.seq = hdr->sadb_msg_seq; return x; out: x->km.state = XFRM_STATE_DEAD; xfrm_state_put(x); return ERR_PTR(err); } static int pfkey_reserved(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { return -EOPNOTSUPP; } static int pfkey_getspi(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { struct net *net = sock_net(sk); struct sk_buff *resp_skb; struct sadb_x_sa2 *sa2; struct sadb_address *saddr, *daddr; struct sadb_msg *out_hdr; struct sadb_spirange *range; struct xfrm_state *x = NULL; int mode; int err; u32 min_spi, max_spi; u32 reqid; u8 proto; unsigned short family; xfrm_address_t *xsaddr = NULL, *xdaddr = NULL; if (!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1], ext_hdrs[SADB_EXT_ADDRESS_DST-1])) return -EINVAL; proto = pfkey_satype2proto(hdr->sadb_msg_satype); if (proto == 0) return -EINVAL; if ((sa2 = ext_hdrs[SADB_X_EXT_SA2-1]) != NULL) { mode = pfkey_mode_to_xfrm(sa2->sadb_x_sa2_mode); if (mode < 0) return -EINVAL; reqid = sa2->sadb_x_sa2_reqid; } else { mode = 0; reqid = 0; } saddr = ext_hdrs[SADB_EXT_ADDRESS_SRC-1]; daddr = ext_hdrs[SADB_EXT_ADDRESS_DST-1]; family = ((struct sockaddr *)(saddr + 1))->sa_family; switch (family) { case AF_INET: xdaddr = (xfrm_address_t *)&((struct sockaddr_in *)(daddr + 1))->sin_addr.s_addr; xsaddr = (xfrm_address_t *)&((struct sockaddr_in *)(saddr + 1))->sin_addr.s_addr; break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: xdaddr = (xfrm_address_t *)&((struct sockaddr_in6 *)(daddr + 1))->sin6_addr; xsaddr = (xfrm_address_t *)&((struct sockaddr_in6 *)(saddr + 1))->sin6_addr; break; #endif } if (hdr->sadb_msg_seq) { x = xfrm_find_acq_byseq(net, DUMMY_MARK, hdr->sadb_msg_seq, UINT_MAX); if (x && !xfrm_addr_equal(&x->id.daddr, xdaddr, family)) { xfrm_state_put(x); x = NULL; } } if (!x) x = xfrm_find_acq(net, &dummy_mark, mode, reqid, 0, UINT_MAX, proto, xdaddr, xsaddr, 1, family); if (x == NULL) return -ENOENT; min_spi = 0x100; max_spi = 0x0fffffff; 
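/*
 * Illustrative sketch, not part of af_key.c: a userspace key manager that
 * wants SADB_GETSPI to pick from a narrower window than the defaults above
 * (0x100..0x0fffffff) appends an SADB_EXT_SPIRANGE extension. Assuming the
 * usual <linux/pfkeyv2.h> layout (sizeof(struct sadb_spirange) == 16, i.e.
 * two 64-bit units), the extension could look like:
 *
 *	struct sadb_spirange spirange = {
 *		.sadb_spirange_len     = sizeof(struct sadb_spirange) / 8,
 *		.sadb_spirange_exttype = SADB_EXT_SPIRANGE,
 *		.sadb_spirange_min     = 0x1000,	// hypothetical bounds
 *		.sadb_spirange_max     = 0x1fff,
 *	};
 *
 * The bounds are consumed as plain numeric limits by verify_spi_info() and
 * xfrm_alloc_spi() below.
 */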
range = ext_hdrs[SADB_EXT_SPIRANGE-1]; if (range) { min_spi = range->sadb_spirange_min; max_spi = range->sadb_spirange_max; } err = verify_spi_info(x->id.proto, min_spi, max_spi, NULL); if (err) { xfrm_state_put(x); return err; } err = xfrm_alloc_spi(x, min_spi, max_spi, NULL); resp_skb = err ? ERR_PTR(err) : pfkey_xfrm_state2msg(x); if (IS_ERR(resp_skb)) { xfrm_state_put(x); return PTR_ERR(resp_skb); } out_hdr = (struct sadb_msg *) resp_skb->data; out_hdr->sadb_msg_version = hdr->sadb_msg_version; out_hdr->sadb_msg_type = SADB_GETSPI; out_hdr->sadb_msg_satype = pfkey_proto2satype(proto); out_hdr->sadb_msg_errno = 0; out_hdr->sadb_msg_reserved = 0; out_hdr->sadb_msg_seq = hdr->sadb_msg_seq; out_hdr->sadb_msg_pid = hdr->sadb_msg_pid; xfrm_state_put(x); pfkey_broadcast(resp_skb, GFP_KERNEL, BROADCAST_ONE, sk, net); return 0; } static int pfkey_acquire(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { struct net *net = sock_net(sk); struct xfrm_state *x; if (hdr->sadb_msg_len != sizeof(struct sadb_msg)/8) return -EOPNOTSUPP; if (hdr->sadb_msg_seq == 0 || hdr->sadb_msg_errno == 0) return 0; x = xfrm_find_acq_byseq(net, DUMMY_MARK, hdr->sadb_msg_seq, UINT_MAX); if (x == NULL) return 0; spin_lock_bh(&x->lock); if (x->km.state == XFRM_STATE_ACQ) x->km.state = XFRM_STATE_ERROR; spin_unlock_bh(&x->lock); xfrm_state_put(x); return 0; } static inline int event2poltype(int event) { switch (event) { case XFRM_MSG_DELPOLICY: return SADB_X_SPDDELETE; case XFRM_MSG_NEWPOLICY: return SADB_X_SPDADD; case XFRM_MSG_UPDPOLICY: return SADB_X_SPDUPDATE; case XFRM_MSG_POLEXPIRE: // return SADB_X_SPDEXPIRE; default: pr_err("pfkey: Unknown policy event %d\n", event); break; } return 0; } static inline int event2keytype(int event) { switch (event) { case XFRM_MSG_DELSA: return SADB_DELETE; case XFRM_MSG_NEWSA: return SADB_ADD; case XFRM_MSG_UPDSA: return SADB_UPDATE; case XFRM_MSG_EXPIRE: return SADB_EXPIRE; default: pr_err("pfkey: Unknown SA event %d\n", event); break; } return 0; } /* ADD/UPD/DEL */ static int key_notify_sa(struct xfrm_state *x, const struct km_event *c) { struct sk_buff *skb; struct sadb_msg *hdr; skb = pfkey_xfrm_state2msg(x); if (IS_ERR(skb)) return PTR_ERR(skb); hdr = (struct sadb_msg *) skb->data; hdr->sadb_msg_version = PF_KEY_V2; hdr->sadb_msg_type = event2keytype(c->event); hdr->sadb_msg_satype = pfkey_proto2satype(x->id.proto); hdr->sadb_msg_errno = 0; hdr->sadb_msg_reserved = 0; hdr->sadb_msg_seq = c->seq; hdr->sadb_msg_pid = c->portid; pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ALL, NULL, xs_net(x)); return 0; } static int pfkey_add(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { struct net *net = sock_net(sk); struct xfrm_state *x; int err; struct km_event c; x = pfkey_msg2xfrm_state(net, hdr, ext_hdrs); if (IS_ERR(x)) return PTR_ERR(x); xfrm_state_hold(x); if (hdr->sadb_msg_type == SADB_ADD) err = xfrm_state_add(x); else err = xfrm_state_update(x); xfrm_audit_state_add(x, err ? 
0 : 1, true); if (err < 0) { x->km.state = XFRM_STATE_DEAD; __xfrm_state_put(x); goto out; } if (hdr->sadb_msg_type == SADB_ADD) c.event = XFRM_MSG_NEWSA; else c.event = XFRM_MSG_UPDSA; c.seq = hdr->sadb_msg_seq; c.portid = hdr->sadb_msg_pid; km_state_notify(x, &c); out: xfrm_state_put(x); return err; } static int pfkey_delete(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { struct net *net = sock_net(sk); struct xfrm_state *x; struct km_event c; int err; if (!ext_hdrs[SADB_EXT_SA-1] || !present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1], ext_hdrs[SADB_EXT_ADDRESS_DST-1])) return -EINVAL; x = pfkey_xfrm_state_lookup(net, hdr, ext_hdrs); if (x == NULL) return -ESRCH; if ((err = security_xfrm_state_delete(x))) goto out; if (xfrm_state_kern(x)) { err = -EPERM; goto out; } err = xfrm_state_delete(x); if (err < 0) goto out; c.seq = hdr->sadb_msg_seq; c.portid = hdr->sadb_msg_pid; c.event = XFRM_MSG_DELSA; km_state_notify(x, &c); out: xfrm_audit_state_delete(x, err ? 0 : 1, true); xfrm_state_put(x); return err; } static int pfkey_get(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { struct net *net = sock_net(sk); __u8 proto; struct sk_buff *out_skb; struct sadb_msg *out_hdr; struct xfrm_state *x; if (!ext_hdrs[SADB_EXT_SA-1] || !present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1], ext_hdrs[SADB_EXT_ADDRESS_DST-1])) return -EINVAL; x = pfkey_xfrm_state_lookup(net, hdr, ext_hdrs); if (x == NULL) return -ESRCH; out_skb = pfkey_xfrm_state2msg(x); proto = x->id.proto; xfrm_state_put(x); if (IS_ERR(out_skb)) return PTR_ERR(out_skb); out_hdr = (struct sadb_msg *) out_skb->data; out_hdr->sadb_msg_version = hdr->sadb_msg_version; out_hdr->sadb_msg_type = SADB_GET; out_hdr->sadb_msg_satype = pfkey_proto2satype(proto); out_hdr->sadb_msg_errno = 0; out_hdr->sadb_msg_reserved = 0; out_hdr->sadb_msg_seq = hdr->sadb_msg_seq; out_hdr->sadb_msg_pid = hdr->sadb_msg_pid; pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ONE, sk, sock_net(sk)); return 0; } static struct sk_buff *compose_sadb_supported(const struct sadb_msg *orig, gfp_t allocation) { struct sk_buff *skb; struct sadb_msg *hdr; int len, auth_len, enc_len, i; auth_len = xfrm_count_pfkey_auth_supported(); if (auth_len) { auth_len *= sizeof(struct sadb_alg); auth_len += sizeof(struct sadb_supported); } enc_len = xfrm_count_pfkey_enc_supported(); if (enc_len) { enc_len *= sizeof(struct sadb_alg); enc_len += sizeof(struct sadb_supported); } len = enc_len + auth_len + sizeof(struct sadb_msg); skb = alloc_skb(len + 16, allocation); if (!skb) goto out_put_algs; hdr = skb_put(skb, sizeof(*hdr)); pfkey_hdr_dup(hdr, orig); hdr->sadb_msg_errno = 0; hdr->sadb_msg_len = len / sizeof(uint64_t); if (auth_len) { struct sadb_supported *sp; struct sadb_alg *ap; sp = skb_put(skb, auth_len); ap = (struct sadb_alg *) (sp + 1); sp->sadb_supported_len = auth_len / sizeof(uint64_t); sp->sadb_supported_exttype = SADB_EXT_SUPPORTED_AUTH; for (i = 0; ; i++) { struct xfrm_algo_desc *aalg = xfrm_aalg_get_byidx(i); if (!aalg) break; if (!aalg->pfkey_supported) continue; if (aalg->available) *ap++ = aalg->desc; } } if (enc_len) { struct sadb_supported *sp; struct sadb_alg *ap; sp = skb_put(skb, enc_len); ap = (struct sadb_alg *) (sp + 1); sp->sadb_supported_len = enc_len / sizeof(uint64_t); sp->sadb_supported_exttype = SADB_EXT_SUPPORTED_ENCRYPT; for (i = 0; ; i++) { struct xfrm_algo_desc *ealg = xfrm_ealg_get_byidx(i); if (!ealg) break; if (!ealg->pfkey_supported) continue; if 
(ealg->available) *ap++ = ealg->desc; } } out_put_algs: return skb; } static int pfkey_register(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { struct pfkey_sock *pfk = pfkey_sk(sk); struct sk_buff *supp_skb; if (hdr->sadb_msg_satype > SADB_SATYPE_MAX) return -EINVAL; if (hdr->sadb_msg_satype != SADB_SATYPE_UNSPEC) { if (pfk->registered&(1<<hdr->sadb_msg_satype)) return -EEXIST; pfk->registered |= (1<<hdr->sadb_msg_satype); } mutex_lock(&pfkey_mutex); xfrm_probe_algs(); supp_skb = compose_sadb_supported(hdr, GFP_KERNEL | __GFP_ZERO); mutex_unlock(&pfkey_mutex); if (!supp_skb) { if (hdr->sadb_msg_satype != SADB_SATYPE_UNSPEC) pfk->registered &= ~(1<<hdr->sadb_msg_satype); return -ENOBUFS; } pfkey_broadcast(supp_skb, GFP_KERNEL, BROADCAST_REGISTERED, sk, sock_net(sk)); return 0; } static int unicast_flush_resp(struct sock *sk, const struct sadb_msg *ihdr) { struct sk_buff *skb; struct sadb_msg *hdr; skb = alloc_skb(sizeof(struct sadb_msg) + 16, GFP_ATOMIC); if (!skb) return -ENOBUFS; hdr = skb_put_data(skb, ihdr, sizeof(struct sadb_msg)); hdr->sadb_msg_errno = (uint8_t) 0; hdr->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t)); return pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ONE, sk, sock_net(sk)); } static int key_notify_sa_flush(const struct km_event *c) { struct sk_buff *skb; struct sadb_msg *hdr; skb = alloc_skb(sizeof(struct sadb_msg) + 16, GFP_ATOMIC); if (!skb) return -ENOBUFS; hdr = skb_put(skb, sizeof(struct sadb_msg)); hdr->sadb_msg_satype = pfkey_proto2satype(c->data.proto); hdr->sadb_msg_type = SADB_FLUSH; hdr->sadb_msg_seq = c->seq; hdr->sadb_msg_pid = c->portid; hdr->sadb_msg_version = PF_KEY_V2; hdr->sadb_msg_errno = (uint8_t) 0; hdr->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t)); hdr->sadb_msg_reserved = 0; pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ALL, NULL, c->net); return 0; } static int pfkey_flush(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { struct net *net = sock_net(sk); unsigned int proto; struct km_event c; int err, err2; proto = pfkey_satype2proto(hdr->sadb_msg_satype); if (proto == 0) return -EINVAL; err = xfrm_state_flush(net, proto, true, false); err2 = unicast_flush_resp(sk, hdr); if (err || err2) { if (err == -ESRCH) /* empty table - go quietly */ err = 0; return err ? 
err : err2; } c.data.proto = proto; c.seq = hdr->sadb_msg_seq; c.portid = hdr->sadb_msg_pid; c.event = XFRM_MSG_FLUSHSA; c.net = net; km_state_notify(NULL, &c); return 0; } static int dump_sa(struct xfrm_state *x, int count, void *ptr) { struct pfkey_sock *pfk = ptr; struct sk_buff *out_skb; struct sadb_msg *out_hdr; if (!pfkey_can_dump(&pfk->sk)) return -ENOBUFS; out_skb = pfkey_xfrm_state2msg(x); if (IS_ERR(out_skb)) return PTR_ERR(out_skb); out_hdr = (struct sadb_msg *) out_skb->data; out_hdr->sadb_msg_version = pfk->dump.msg_version; out_hdr->sadb_msg_type = SADB_DUMP; out_hdr->sadb_msg_satype = pfkey_proto2satype(x->id.proto); out_hdr->sadb_msg_errno = 0; out_hdr->sadb_msg_reserved = 0; out_hdr->sadb_msg_seq = count + 1; out_hdr->sadb_msg_pid = pfk->dump.msg_portid; if (pfk->dump.skb) pfkey_broadcast(pfk->dump.skb, GFP_ATOMIC, BROADCAST_ONE, &pfk->sk, sock_net(&pfk->sk)); pfk->dump.skb = out_skb; return 0; } static int pfkey_dump_sa(struct pfkey_sock *pfk) { struct net *net = sock_net(&pfk->sk); return xfrm_state_walk(net, &pfk->dump.u.state, dump_sa, (void *) pfk); } static void pfkey_dump_sa_done(struct pfkey_sock *pfk) { struct net *net = sock_net(&pfk->sk); xfrm_state_walk_done(&pfk->dump.u.state, net); } static int pfkey_dump(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { u8 proto; struct xfrm_address_filter *filter = NULL; struct pfkey_sock *pfk = pfkey_sk(sk); mutex_lock(&pfk->dump_lock); if (pfk->dump.dump != NULL) { mutex_unlock(&pfk->dump_lock); return -EBUSY; } proto = pfkey_satype2proto(hdr->sadb_msg_satype); if (proto == 0) { mutex_unlock(&pfk->dump_lock); return -EINVAL; } if (ext_hdrs[SADB_X_EXT_FILTER - 1]) { struct sadb_x_filter *xfilter = ext_hdrs[SADB_X_EXT_FILTER - 1]; if ((xfilter->sadb_x_filter_splen > (sizeof(xfrm_address_t) << 3)) || (xfilter->sadb_x_filter_dplen > (sizeof(xfrm_address_t) << 3))) { mutex_unlock(&pfk->dump_lock); return -EINVAL; } filter = kmalloc(sizeof(*filter), GFP_KERNEL); if (filter == NULL) { mutex_unlock(&pfk->dump_lock); return -ENOMEM; } memcpy(&filter->saddr, &xfilter->sadb_x_filter_saddr, sizeof(xfrm_address_t)); memcpy(&filter->daddr, &xfilter->sadb_x_filter_daddr, sizeof(xfrm_address_t)); filter->family = xfilter->sadb_x_filter_family; filter->splen = xfilter->sadb_x_filter_splen; filter->dplen = xfilter->sadb_x_filter_dplen; } pfk->dump.msg_version = hdr->sadb_msg_version; pfk->dump.msg_portid = hdr->sadb_msg_pid; pfk->dump.dump = pfkey_dump_sa; pfk->dump.done = pfkey_dump_sa_done; xfrm_state_walk_init(&pfk->dump.u.state, proto, filter); mutex_unlock(&pfk->dump_lock); return pfkey_do_dump(pfk); } static int pfkey_promisc(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { struct pfkey_sock *pfk = pfkey_sk(sk); int satype = hdr->sadb_msg_satype; bool reset_errno = false; if (hdr->sadb_msg_len == (sizeof(*hdr) / sizeof(uint64_t))) { reset_errno = true; if (satype != 0 && satype != 1) return -EINVAL; pfk->promisc = satype; } if (reset_errno && skb_cloned(skb)) skb = skb_copy(skb, GFP_KERNEL); else skb = skb_clone(skb, GFP_KERNEL); if (reset_errno && skb) { struct sadb_msg *new_hdr = (struct sadb_msg *) skb->data; new_hdr->sadb_msg_errno = 0; } pfkey_broadcast(skb, GFP_KERNEL, BROADCAST_ALL, NULL, sock_net(sk)); return 0; } static int check_reqid(struct xfrm_policy *xp, int dir, int count, void *ptr) { int i; u32 reqid = *(u32*)ptr; for (i=0; i<xp->xfrm_nr; i++) { if (xp->xfrm_vec[i].reqid == reqid) return -EEXIST; } return 0; } static u32 
gen_reqid(struct net *net) { struct xfrm_policy_walk walk; u32 start; int rc; static u32 reqid = IPSEC_MANUAL_REQID_MAX; start = reqid; do { ++reqid; if (reqid == 0) reqid = IPSEC_MANUAL_REQID_MAX+1; xfrm_policy_walk_init(&walk, XFRM_POLICY_TYPE_MAIN); rc = xfrm_policy_walk(net, &walk, check_reqid, (void*)&reqid); xfrm_policy_walk_done(&walk, net); if (rc != -EEXIST) return reqid; } while (reqid != start); return 0; } static int parse_ipsecrequest(struct xfrm_policy *xp, struct sadb_x_policy *pol, struct sadb_x_ipsecrequest *rq) { struct net *net = xp_net(xp); struct xfrm_tmpl *t = xp->xfrm_vec + xp->xfrm_nr; int mode; if (xp->xfrm_nr >= XFRM_MAX_DEPTH) return -ELOOP; if (rq->sadb_x_ipsecrequest_mode == 0) return -EINVAL; if (!xfrm_id_proto_valid(rq->sadb_x_ipsecrequest_proto)) return -EINVAL; t->id.proto = rq->sadb_x_ipsecrequest_proto; if ((mode = pfkey_mode_to_xfrm(rq->sadb_x_ipsecrequest_mode)) < 0) return -EINVAL; t->mode = mode; if (rq->sadb_x_ipsecrequest_level == IPSEC_LEVEL_USE) { if ((mode == XFRM_MODE_TUNNEL || mode == XFRM_MODE_BEET) && pol->sadb_x_policy_dir == IPSEC_DIR_OUTBOUND) return -EINVAL; t->optional = 1; } else if (rq->sadb_x_ipsecrequest_level == IPSEC_LEVEL_UNIQUE) { t->reqid = rq->sadb_x_ipsecrequest_reqid; if (t->reqid > IPSEC_MANUAL_REQID_MAX) t->reqid = 0; if (!t->reqid && !(t->reqid = gen_reqid(net))) return -ENOBUFS; } /* addresses present only in tunnel mode */ if (t->mode == XFRM_MODE_TUNNEL) { int err; err = parse_sockaddr_pair( (struct sockaddr *)(rq + 1), rq->sadb_x_ipsecrequest_len - sizeof(*rq), &t->saddr, &t->id.daddr, &t->encap_family); if (err) return err; } else t->encap_family = xp->family; /* No way to set this via kame pfkey */ t->allalgs = 1; xp->xfrm_nr++; return 0; } static int parse_ipsecrequests(struct xfrm_policy *xp, struct sadb_x_policy *pol) { int err; int len = pol->sadb_x_policy_len*8 - sizeof(struct sadb_x_policy); struct sadb_x_ipsecrequest *rq = (void*)(pol+1); if (pol->sadb_x_policy_len * 8 < sizeof(struct sadb_x_policy)) return -EINVAL; while (len >= sizeof(*rq)) { if (len < rq->sadb_x_ipsecrequest_len || rq->sadb_x_ipsecrequest_len < sizeof(*rq)) return -EINVAL; if ((err = parse_ipsecrequest(xp, pol, rq)) < 0) return err; len -= rq->sadb_x_ipsecrequest_len; rq = (void*)((u8*)rq + rq->sadb_x_ipsecrequest_len); } return 0; } static inline int pfkey_xfrm_policy2sec_ctx_size(const struct xfrm_policy *xp) { struct xfrm_sec_ctx *xfrm_ctx = xp->security; if (xfrm_ctx) { int len = sizeof(struct sadb_x_sec_ctx); len += xfrm_ctx->ctx_len; return PFKEY_ALIGN8(len); } return 0; } static int pfkey_xfrm_policy2msg_size(const struct xfrm_policy *xp) { const struct xfrm_tmpl *t; int sockaddr_size = pfkey_sockaddr_size(xp->family); int socklen = 0; int i; for (i=0; i<xp->xfrm_nr; i++) { t = xp->xfrm_vec + i; socklen += pfkey_sockaddr_len(t->encap_family); } return sizeof(struct sadb_msg) + (sizeof(struct sadb_lifetime) * 3) + (sizeof(struct sadb_address) * 2) + (sockaddr_size * 2) + sizeof(struct sadb_x_policy) + (xp->xfrm_nr * sizeof(struct sadb_x_ipsecrequest)) + (socklen * 2) + pfkey_xfrm_policy2sec_ctx_size(xp); } static struct sk_buff * pfkey_xfrm_policy2msg_prep(const struct xfrm_policy *xp) { struct sk_buff *skb; int size; size = pfkey_xfrm_policy2msg_size(xp); skb = alloc_skb(size + 16, GFP_ATOMIC); if (skb == NULL) return ERR_PTR(-ENOBUFS); return skb; } static int pfkey_xfrm_policy2msg(struct sk_buff *skb, const struct xfrm_policy *xp, int dir) { struct sadb_msg *hdr; struct sadb_address *addr; struct sadb_lifetime *lifetime; struct 
sadb_x_policy *pol; struct sadb_x_sec_ctx *sec_ctx; struct xfrm_sec_ctx *xfrm_ctx; int i; int size; int sockaddr_size = pfkey_sockaddr_size(xp->family); int socklen = pfkey_sockaddr_len(xp->family); size = pfkey_xfrm_policy2msg_size(xp); /* call should fill header later */ hdr = skb_put(skb, sizeof(struct sadb_msg)); memset(hdr, 0, size); /* XXX do we need this ? */ /* src address */ addr = skb_put(skb, sizeof(struct sadb_address) + sockaddr_size); addr->sadb_address_len = (sizeof(struct sadb_address)+sockaddr_size)/ sizeof(uint64_t); addr->sadb_address_exttype = SADB_EXT_ADDRESS_SRC; addr->sadb_address_proto = pfkey_proto_from_xfrm(xp->selector.proto); addr->sadb_address_prefixlen = xp->selector.prefixlen_s; addr->sadb_address_reserved = 0; if (!pfkey_sockaddr_fill(&xp->selector.saddr, xp->selector.sport, (struct sockaddr *) (addr + 1), xp->family)) BUG(); /* dst address */ addr = skb_put(skb, sizeof(struct sadb_address) + sockaddr_size); addr->sadb_address_len = (sizeof(struct sadb_address)+sockaddr_size)/ sizeof(uint64_t); addr->sadb_address_exttype = SADB_EXT_ADDRESS_DST; addr->sadb_address_proto = pfkey_proto_from_xfrm(xp->selector.proto); addr->sadb_address_prefixlen = xp->selector.prefixlen_d; addr->sadb_address_reserved = 0; pfkey_sockaddr_fill(&xp->selector.daddr, xp->selector.dport, (struct sockaddr *) (addr + 1), xp->family); /* hard time */ lifetime = skb_put(skb, sizeof(struct sadb_lifetime)); lifetime->sadb_lifetime_len = sizeof(struct sadb_lifetime)/sizeof(uint64_t); lifetime->sadb_lifetime_exttype = SADB_EXT_LIFETIME_HARD; lifetime->sadb_lifetime_allocations = _X2KEY(xp->lft.hard_packet_limit); lifetime->sadb_lifetime_bytes = _X2KEY(xp->lft.hard_byte_limit); lifetime->sadb_lifetime_addtime = xp->lft.hard_add_expires_seconds; lifetime->sadb_lifetime_usetime = xp->lft.hard_use_expires_seconds; /* soft time */ lifetime = skb_put(skb, sizeof(struct sadb_lifetime)); lifetime->sadb_lifetime_len = sizeof(struct sadb_lifetime)/sizeof(uint64_t); lifetime->sadb_lifetime_exttype = SADB_EXT_LIFETIME_SOFT; lifetime->sadb_lifetime_allocations = _X2KEY(xp->lft.soft_packet_limit); lifetime->sadb_lifetime_bytes = _X2KEY(xp->lft.soft_byte_limit); lifetime->sadb_lifetime_addtime = xp->lft.soft_add_expires_seconds; lifetime->sadb_lifetime_usetime = xp->lft.soft_use_expires_seconds; /* current time */ lifetime = skb_put(skb, sizeof(struct sadb_lifetime)); lifetime->sadb_lifetime_len = sizeof(struct sadb_lifetime)/sizeof(uint64_t); lifetime->sadb_lifetime_exttype = SADB_EXT_LIFETIME_CURRENT; lifetime->sadb_lifetime_allocations = xp->curlft.packets; lifetime->sadb_lifetime_bytes = xp->curlft.bytes; lifetime->sadb_lifetime_addtime = xp->curlft.add_time; lifetime->sadb_lifetime_usetime = xp->curlft.use_time; pol = skb_put(skb, sizeof(struct sadb_x_policy)); pol->sadb_x_policy_len = sizeof(struct sadb_x_policy)/sizeof(uint64_t); pol->sadb_x_policy_exttype = SADB_X_EXT_POLICY; pol->sadb_x_policy_type = IPSEC_POLICY_DISCARD; if (xp->action == XFRM_POLICY_ALLOW) { if (xp->xfrm_nr) pol->sadb_x_policy_type = IPSEC_POLICY_IPSEC; else pol->sadb_x_policy_type = IPSEC_POLICY_NONE; } pol->sadb_x_policy_dir = dir+1; pol->sadb_x_policy_reserved = 0; pol->sadb_x_policy_id = xp->index; pol->sadb_x_policy_priority = xp->priority; for (i=0; i<xp->xfrm_nr; i++) { const struct xfrm_tmpl *t = xp->xfrm_vec + i; struct sadb_x_ipsecrequest *rq; int req_size; int mode; req_size = sizeof(struct sadb_x_ipsecrequest); if (t->mode == XFRM_MODE_TUNNEL) { socklen = pfkey_sockaddr_len(t->encap_family); req_size += socklen * 2; } 
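/*
 * Worked example (illustrative only, assuming the standard uapi layouts):
 * sizeof(struct sadb_x_ipsecrequest) is 16 bytes, and a tunnel-mode IPv4
 * template carries two struct sockaddr_in (16 bytes each) right after it,
 * so req_size = 16 + 2*16 = 48 bytes, i.e. 6 of the 64-bit units that
 * sadb_x_policy_len is counted in. Transport-mode templates carry no
 * addresses, which is why the else branch below subtracts the two sockaddrs
 * that pfkey_xfrm_policy2msg_size() had pessimistically reserved.
 */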
else { size -= 2*socklen; } rq = skb_put(skb, req_size); pol->sadb_x_policy_len += req_size/8; memset(rq, 0, sizeof(*rq)); rq->sadb_x_ipsecrequest_len = req_size; rq->sadb_x_ipsecrequest_proto = t->id.proto; if ((mode = pfkey_mode_from_xfrm(t->mode)) < 0) return -EINVAL; rq->sadb_x_ipsecrequest_mode = mode; rq->sadb_x_ipsecrequest_level = IPSEC_LEVEL_REQUIRE; if (t->reqid) rq->sadb_x_ipsecrequest_level = IPSEC_LEVEL_UNIQUE; if (t->optional) rq->sadb_x_ipsecrequest_level = IPSEC_LEVEL_USE; rq->sadb_x_ipsecrequest_reqid = t->reqid; if (t->mode == XFRM_MODE_TUNNEL) { u8 *sa = (void *)(rq + 1); pfkey_sockaddr_fill(&t->saddr, 0, (struct sockaddr *)sa, t->encap_family); pfkey_sockaddr_fill(&t->id.daddr, 0, (struct sockaddr *) (sa + socklen), t->encap_family); } } /* security context */ if ((xfrm_ctx = xp->security)) { int ctx_size = pfkey_xfrm_policy2sec_ctx_size(xp); sec_ctx = skb_put(skb, ctx_size); sec_ctx->sadb_x_sec_len = ctx_size / sizeof(uint64_t); sec_ctx->sadb_x_sec_exttype = SADB_X_EXT_SEC_CTX; sec_ctx->sadb_x_ctx_doi = xfrm_ctx->ctx_doi; sec_ctx->sadb_x_ctx_alg = xfrm_ctx->ctx_alg; sec_ctx->sadb_x_ctx_len = xfrm_ctx->ctx_len; memcpy(sec_ctx + 1, xfrm_ctx->ctx_str, xfrm_ctx->ctx_len); } hdr->sadb_msg_len = size / sizeof(uint64_t); hdr->sadb_msg_reserved = refcount_read(&xp->refcnt); return 0; } static int key_notify_policy(struct xfrm_policy *xp, int dir, const struct km_event *c) { struct sk_buff *out_skb; struct sadb_msg *out_hdr; int err; out_skb = pfkey_xfrm_policy2msg_prep(xp); if (IS_ERR(out_skb)) return PTR_ERR(out_skb); err = pfkey_xfrm_policy2msg(out_skb, xp, dir); if (err < 0) { kfree_skb(out_skb); return err; } out_hdr = (struct sadb_msg *) out_skb->data; out_hdr->sadb_msg_version = PF_KEY_V2; if (c->data.byid && c->event == XFRM_MSG_DELPOLICY) out_hdr->sadb_msg_type = SADB_X_SPDDELETE2; else out_hdr->sadb_msg_type = event2poltype(c->event); out_hdr->sadb_msg_errno = 0; out_hdr->sadb_msg_seq = c->seq; out_hdr->sadb_msg_pid = c->portid; pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ALL, NULL, xp_net(xp)); return 0; } static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { struct net *net = sock_net(sk); int err = 0; struct sadb_lifetime *lifetime; struct sadb_address *sa; struct sadb_x_policy *pol; struct xfrm_policy *xp; struct km_event c; struct sadb_x_sec_ctx *sec_ctx; if (!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1], ext_hdrs[SADB_EXT_ADDRESS_DST-1]) || !ext_hdrs[SADB_X_EXT_POLICY-1]) return -EINVAL; pol = ext_hdrs[SADB_X_EXT_POLICY-1]; if (pol->sadb_x_policy_type > IPSEC_POLICY_IPSEC) return -EINVAL; if (!pol->sadb_x_policy_dir || pol->sadb_x_policy_dir >= IPSEC_DIR_MAX) return -EINVAL; xp = xfrm_policy_alloc(net, GFP_KERNEL); if (xp == NULL) return -ENOBUFS; xp->action = (pol->sadb_x_policy_type == IPSEC_POLICY_DISCARD ? XFRM_POLICY_BLOCK : XFRM_POLICY_ALLOW); xp->priority = pol->sadb_x_policy_priority; sa = ext_hdrs[SADB_EXT_ADDRESS_SRC-1]; xp->family = pfkey_sadb_addr2xfrm_addr(sa, &xp->selector.saddr); xp->selector.family = xp->family; xp->selector.prefixlen_s = sa->sadb_address_prefixlen; xp->selector.proto = pfkey_proto_to_xfrm(sa->sadb_address_proto); xp->selector.sport = ((struct sockaddr_in *)(sa+1))->sin_port; if (xp->selector.sport) xp->selector.sport_mask = htons(0xffff); sa = ext_hdrs[SADB_EXT_ADDRESS_DST-1]; pfkey_sadb_addr2xfrm_addr(sa, &xp->selector.daddr); xp->selector.prefixlen_d = sa->sadb_address_prefixlen; /* Amusing, we set this twice. 
KAME apps appear to set same value * in both addresses. */ xp->selector.proto = pfkey_proto_to_xfrm(sa->sadb_address_proto); xp->selector.dport = ((struct sockaddr_in *)(sa+1))->sin_port; if (xp->selector.dport) xp->selector.dport_mask = htons(0xffff); sec_ctx = ext_hdrs[SADB_X_EXT_SEC_CTX - 1]; if (sec_ctx != NULL) { struct xfrm_user_sec_ctx *uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx, GFP_KERNEL); if (!uctx) { err = -ENOBUFS; goto out; } err = security_xfrm_policy_alloc(&xp->security, uctx, GFP_KERNEL); kfree(uctx); if (err) goto out; } xp->lft.soft_byte_limit = XFRM_INF; xp->lft.hard_byte_limit = XFRM_INF; xp->lft.soft_packet_limit = XFRM_INF; xp->lft.hard_packet_limit = XFRM_INF; if ((lifetime = ext_hdrs[SADB_EXT_LIFETIME_HARD-1]) != NULL) { xp->lft.hard_packet_limit = _KEY2X(lifetime->sadb_lifetime_allocations); xp->lft.hard_byte_limit = _KEY2X(lifetime->sadb_lifetime_bytes); xp->lft.hard_add_expires_seconds = lifetime->sadb_lifetime_addtime; xp->lft.hard_use_expires_seconds = lifetime->sadb_lifetime_usetime; } if ((lifetime = ext_hdrs[SADB_EXT_LIFETIME_SOFT-1]) != NULL) { xp->lft.soft_packet_limit = _KEY2X(lifetime->sadb_lifetime_allocations); xp->lft.soft_byte_limit = _KEY2X(lifetime->sadb_lifetime_bytes); xp->lft.soft_add_expires_seconds = lifetime->sadb_lifetime_addtime; xp->lft.soft_use_expires_seconds = lifetime->sadb_lifetime_usetime; } xp->xfrm_nr = 0; if (pol->sadb_x_policy_type == IPSEC_POLICY_IPSEC && (err = parse_ipsecrequests(xp, pol)) < 0) goto out; err = xfrm_policy_insert(pol->sadb_x_policy_dir-1, xp, hdr->sadb_msg_type != SADB_X_SPDUPDATE); xfrm_audit_policy_add(xp, err ? 0 : 1, true); if (err) goto out; if (hdr->sadb_msg_type == SADB_X_SPDUPDATE) c.event = XFRM_MSG_UPDPOLICY; else c.event = XFRM_MSG_NEWPOLICY; c.seq = hdr->sadb_msg_seq; c.portid = hdr->sadb_msg_pid; km_policy_notify(xp, pol->sadb_x_policy_dir-1, &c); xfrm_pol_put(xp); return 0; out: xp->walk.dead = 1; xfrm_policy_destroy(xp); return err; } static int pfkey_spddelete(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { struct net *net = sock_net(sk); int err; struct sadb_address *sa; struct sadb_x_policy *pol; struct xfrm_policy *xp; struct xfrm_selector sel; struct km_event c; struct sadb_x_sec_ctx *sec_ctx; struct xfrm_sec_ctx *pol_ctx = NULL; if (!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1], ext_hdrs[SADB_EXT_ADDRESS_DST-1]) || !ext_hdrs[SADB_X_EXT_POLICY-1]) return -EINVAL; pol = ext_hdrs[SADB_X_EXT_POLICY-1]; if (!pol->sadb_x_policy_dir || pol->sadb_x_policy_dir >= IPSEC_DIR_MAX) return -EINVAL; memset(&sel, 0, sizeof(sel)); sa = ext_hdrs[SADB_EXT_ADDRESS_SRC-1]; sel.family = pfkey_sadb_addr2xfrm_addr(sa, &sel.saddr); sel.prefixlen_s = sa->sadb_address_prefixlen; sel.proto = pfkey_proto_to_xfrm(sa->sadb_address_proto); sel.sport = ((struct sockaddr_in *)(sa+1))->sin_port; if (sel.sport) sel.sport_mask = htons(0xffff); sa = ext_hdrs[SADB_EXT_ADDRESS_DST-1]; pfkey_sadb_addr2xfrm_addr(sa, &sel.daddr); sel.prefixlen_d = sa->sadb_address_prefixlen; sel.proto = pfkey_proto_to_xfrm(sa->sadb_address_proto); sel.dport = ((struct sockaddr_in *)(sa+1))->sin_port; if (sel.dport) sel.dport_mask = htons(0xffff); sec_ctx = ext_hdrs[SADB_X_EXT_SEC_CTX - 1]; if (sec_ctx != NULL) { struct xfrm_user_sec_ctx *uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx, GFP_KERNEL); if (!uctx) return -ENOMEM; err = security_xfrm_policy_alloc(&pol_ctx, uctx, GFP_KERNEL); kfree(uctx); if (err) return err; } xp = xfrm_policy_bysel_ctx(net, &dummy_mark, 0, XFRM_POLICY_TYPE_MAIN, 
pol->sadb_x_policy_dir - 1, &sel, pol_ctx, 1, &err); security_xfrm_policy_free(pol_ctx); if (xp == NULL) return -ENOENT; xfrm_audit_policy_delete(xp, err ? 0 : 1, true); if (err) goto out; c.seq = hdr->sadb_msg_seq; c.portid = hdr->sadb_msg_pid; c.data.byid = 0; c.event = XFRM_MSG_DELPOLICY; km_policy_notify(xp, pol->sadb_x_policy_dir-1, &c); out: xfrm_pol_put(xp); return err; } static int key_pol_get_resp(struct sock *sk, struct xfrm_policy *xp, const struct sadb_msg *hdr, int dir) { int err; struct sk_buff *out_skb; struct sadb_msg *out_hdr; err = 0; out_skb = pfkey_xfrm_policy2msg_prep(xp); if (IS_ERR(out_skb)) { err = PTR_ERR(out_skb); goto out; } err = pfkey_xfrm_policy2msg(out_skb, xp, dir); if (err < 0) { kfree_skb(out_skb); goto out; } out_hdr = (struct sadb_msg *) out_skb->data; out_hdr->sadb_msg_version = hdr->sadb_msg_version; out_hdr->sadb_msg_type = hdr->sadb_msg_type; out_hdr->sadb_msg_satype = 0; out_hdr->sadb_msg_errno = 0; out_hdr->sadb_msg_seq = hdr->sadb_msg_seq; out_hdr->sadb_msg_pid = hdr->sadb_msg_pid; pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ONE, sk, xp_net(xp)); err = 0; out: return err; } static int pfkey_sockaddr_pair_size(sa_family_t family) { return PFKEY_ALIGN8(pfkey_sockaddr_len(family) * 2); } static int parse_sockaddr_pair(struct sockaddr *sa, int ext_len, xfrm_address_t *saddr, xfrm_address_t *daddr, u16 *family) { int af, socklen; if (ext_len < 2 || ext_len < pfkey_sockaddr_pair_size(sa->sa_family)) return -EINVAL; af = pfkey_sockaddr_extract(sa, saddr); if (!af) return -EINVAL; socklen = pfkey_sockaddr_len(af); if (pfkey_sockaddr_extract((struct sockaddr *) (((u8 *)sa) + socklen), daddr) != af) return -EINVAL; *family = af; return 0; } #ifdef CONFIG_NET_KEY_MIGRATE static int ipsecrequests_to_migrate(struct sadb_x_ipsecrequest *rq1, int len, struct xfrm_migrate *m) { int err; struct sadb_x_ipsecrequest *rq2; int mode; if (len < sizeof(*rq1) || len < rq1->sadb_x_ipsecrequest_len || rq1->sadb_x_ipsecrequest_len < sizeof(*rq1)) return -EINVAL; /* old endpoints */ err = parse_sockaddr_pair((struct sockaddr *)(rq1 + 1), rq1->sadb_x_ipsecrequest_len - sizeof(*rq1), &m->old_saddr, &m->old_daddr, &m->old_family); if (err) return err; rq2 = (struct sadb_x_ipsecrequest *)((u8 *)rq1 + rq1->sadb_x_ipsecrequest_len); len -= rq1->sadb_x_ipsecrequest_len; if (len <= sizeof(*rq2) || len < rq2->sadb_x_ipsecrequest_len || rq2->sadb_x_ipsecrequest_len < sizeof(*rq2)) return -EINVAL; /* new endpoints */ err = parse_sockaddr_pair((struct sockaddr *)(rq2 + 1), rq2->sadb_x_ipsecrequest_len - sizeof(*rq2), &m->new_saddr, &m->new_daddr, &m->new_family); if (err) return err; if (rq1->sadb_x_ipsecrequest_proto != rq2->sadb_x_ipsecrequest_proto || rq1->sadb_x_ipsecrequest_mode != rq2->sadb_x_ipsecrequest_mode || rq1->sadb_x_ipsecrequest_reqid != rq2->sadb_x_ipsecrequest_reqid) return -EINVAL; m->proto = rq1->sadb_x_ipsecrequest_proto; if ((mode = pfkey_mode_to_xfrm(rq1->sadb_x_ipsecrequest_mode)) < 0) return -EINVAL; m->mode = mode; m->reqid = rq1->sadb_x_ipsecrequest_reqid; return ((int)(rq1->sadb_x_ipsecrequest_len + rq2->sadb_x_ipsecrequest_len)); } static int pfkey_migrate(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { int i, len, ret, err = -EINVAL; u8 dir; struct sadb_address *sa; struct sadb_x_kmaddress *kma; struct sadb_x_policy *pol; struct sadb_x_ipsecrequest *rq; struct xfrm_selector sel; struct xfrm_migrate m[XFRM_MAX_DEPTH]; struct xfrm_kmaddress k; struct net *net = sock_net(sk); if
(!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC - 1], ext_hdrs[SADB_EXT_ADDRESS_DST - 1]) || !ext_hdrs[SADB_X_EXT_POLICY - 1]) { err = -EINVAL; goto out; } kma = ext_hdrs[SADB_X_EXT_KMADDRESS - 1]; pol = ext_hdrs[SADB_X_EXT_POLICY - 1]; if (pol->sadb_x_policy_dir >= IPSEC_DIR_MAX) { err = -EINVAL; goto out; } if (kma) { /* convert sadb_x_kmaddress to xfrm_kmaddress */ k.reserved = kma->sadb_x_kmaddress_reserved; ret = parse_sockaddr_pair((struct sockaddr *)(kma + 1), 8*(kma->sadb_x_kmaddress_len) - sizeof(*kma), &k.local, &k.remote, &k.family); if (ret < 0) { err = ret; goto out; } } dir = pol->sadb_x_policy_dir - 1; memset(&sel, 0, sizeof(sel)); /* set source address info of selector */ sa = ext_hdrs[SADB_EXT_ADDRESS_SRC - 1]; sel.family = pfkey_sadb_addr2xfrm_addr(sa, &sel.saddr); sel.prefixlen_s = sa->sadb_address_prefixlen; sel.proto = pfkey_proto_to_xfrm(sa->sadb_address_proto); sel.sport = ((struct sockaddr_in *)(sa + 1))->sin_port; if (sel.sport) sel.sport_mask = htons(0xffff); /* set destination address info of selector */ sa = ext_hdrs[SADB_EXT_ADDRESS_DST - 1]; pfkey_sadb_addr2xfrm_addr(sa, &sel.daddr); sel.prefixlen_d = sa->sadb_address_prefixlen; sel.proto = pfkey_proto_to_xfrm(sa->sadb_address_proto); sel.dport = ((struct sockaddr_in *)(sa + 1))->sin_port; if (sel.dport) sel.dport_mask = htons(0xffff); rq = (struct sadb_x_ipsecrequest *)(pol + 1); /* extract ipsecrequests */ i = 0; len = pol->sadb_x_policy_len * 8 - sizeof(struct sadb_x_policy); while (len > 0 && i < XFRM_MAX_DEPTH) { ret = ipsecrequests_to_migrate(rq, len, &m[i]); if (ret < 0) { err = ret; goto out; } else { rq = (struct sadb_x_ipsecrequest *)((u8 *)rq + ret); len -= ret; i++; } } if (!i || len > 0) { err = -EINVAL; goto out; } return xfrm_migrate(&sel, dir, XFRM_POLICY_TYPE_MAIN, m, i, kma ? &k : NULL, net, NULL, 0, NULL); out: return err; } #else static int pfkey_migrate(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { return -ENOPROTOOPT; } #endif static int pfkey_spdget(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { struct net *net = sock_net(sk); unsigned int dir; int err = 0, delete; struct sadb_x_policy *pol; struct xfrm_policy *xp; struct km_event c; if ((pol = ext_hdrs[SADB_X_EXT_POLICY-1]) == NULL) return -EINVAL; dir = xfrm_policy_id2dir(pol->sadb_x_policy_id); if (dir >= XFRM_POLICY_MAX) return -EINVAL; delete = (hdr->sadb_msg_type == SADB_X_SPDDELETE2); xp = xfrm_policy_byid(net, &dummy_mark, 0, XFRM_POLICY_TYPE_MAIN, dir, pol->sadb_x_policy_id, delete, &err); if (xp == NULL) return -ENOENT; if (delete) { xfrm_audit_policy_delete(xp, err ? 
0 : 1, true); if (err) goto out; c.seq = hdr->sadb_msg_seq; c.portid = hdr->sadb_msg_pid; c.data.byid = 1; c.event = XFRM_MSG_DELPOLICY; km_policy_notify(xp, dir, &c); } else { err = key_pol_get_resp(sk, xp, hdr, dir); } out: xfrm_pol_put(xp); return err; } static int dump_sp(struct xfrm_policy *xp, int dir, int count, void *ptr) { struct pfkey_sock *pfk = ptr; struct sk_buff *out_skb; struct sadb_msg *out_hdr; int err; if (!pfkey_can_dump(&pfk->sk)) return -ENOBUFS; out_skb = pfkey_xfrm_policy2msg_prep(xp); if (IS_ERR(out_skb)) return PTR_ERR(out_skb); err = pfkey_xfrm_policy2msg(out_skb, xp, dir); if (err < 0) { kfree_skb(out_skb); return err; } out_hdr = (struct sadb_msg *) out_skb->data; out_hdr->sadb_msg_version = pfk->dump.msg_version; out_hdr->sadb_msg_type = SADB_X_SPDDUMP; out_hdr->sadb_msg_satype = SADB_SATYPE_UNSPEC; out_hdr->sadb_msg_errno = 0; out_hdr->sadb_msg_seq = count + 1; out_hdr->sadb_msg_pid = pfk->dump.msg_portid; if (pfk->dump.skb) pfkey_broadcast(pfk->dump.skb, GFP_ATOMIC, BROADCAST_ONE, &pfk->sk, sock_net(&pfk->sk)); pfk->dump.skb = out_skb; return 0; } static int pfkey_dump_sp(struct pfkey_sock *pfk) { struct net *net = sock_net(&pfk->sk); return xfrm_policy_walk(net, &pfk->dump.u.policy, dump_sp, (void *) pfk); } static void pfkey_dump_sp_done(struct pfkey_sock *pfk) { struct net *net = sock_net((struct sock *)pfk); xfrm_policy_walk_done(&pfk->dump.u.policy, net); } static int pfkey_spddump(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { struct pfkey_sock *pfk = pfkey_sk(sk); mutex_lock(&pfk->dump_lock); if (pfk->dump.dump != NULL) { mutex_unlock(&pfk->dump_lock); return -EBUSY; } pfk->dump.msg_version = hdr->sadb_msg_version; pfk->dump.msg_portid = hdr->sadb_msg_pid; pfk->dump.dump = pfkey_dump_sp; pfk->dump.done = pfkey_dump_sp_done; xfrm_policy_walk_init(&pfk->dump.u.policy, XFRM_POLICY_TYPE_MAIN); mutex_unlock(&pfk->dump_lock); return pfkey_do_dump(pfk); } static int key_notify_policy_flush(const struct km_event *c) { struct sk_buff *skb_out; struct sadb_msg *hdr; skb_out = alloc_skb(sizeof(struct sadb_msg) + 16, GFP_ATOMIC); if (!skb_out) return -ENOBUFS; hdr = skb_put(skb_out, sizeof(struct sadb_msg)); hdr->sadb_msg_type = SADB_X_SPDFLUSH; hdr->sadb_msg_seq = c->seq; hdr->sadb_msg_pid = c->portid; hdr->sadb_msg_version = PF_KEY_V2; hdr->sadb_msg_errno = (uint8_t) 0; hdr->sadb_msg_satype = SADB_SATYPE_UNSPEC; hdr->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t)); hdr->sadb_msg_reserved = 0; pfkey_broadcast(skb_out, GFP_ATOMIC, BROADCAST_ALL, NULL, c->net); return 0; } static int pfkey_spdflush(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { struct net *net = sock_net(sk); struct km_event c; int err, err2; err = xfrm_policy_flush(net, XFRM_POLICY_TYPE_MAIN, true); err2 = unicast_flush_resp(sk, hdr); if (err || err2) { if (err == -ESRCH) /* empty table - old silent behavior */ return 0; return err; } c.data.type = XFRM_POLICY_TYPE_MAIN; c.event = XFRM_MSG_FLUSHPOLICY; c.portid = hdr->sadb_msg_pid; c.seq = hdr->sadb_msg_seq; c.net = net; km_policy_notify(NULL, 0, &c); return 0; } typedef int (*pfkey_handler)(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs); static const pfkey_handler pfkey_funcs[SADB_MAX + 1] = { [SADB_RESERVED] = pfkey_reserved, [SADB_GETSPI] = pfkey_getspi, [SADB_UPDATE] = pfkey_add, [SADB_ADD] = pfkey_add, [SADB_DELETE] = pfkey_delete, [SADB_GET] = pfkey_get, [SADB_ACQUIRE] = pfkey_acquire, 
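/*
 * Note (annotation, not in the original source): the NULL slots in this
 * table (SADB_EXPIRE, SADB_X_PCHANGE, SADB_X_SPDACQUIRE) are message types
 * the kernel only ever originates, so a userspace message of one of those
 * types falls through to the -EOPNOTSUPP default in pfkey_process().
 * Dispatch for the handled types is a plain indexed call, e.g. an SADB_GET
 * request ends up as pfkey_funcs[SADB_GET](sk, skb, hdr, ext_hdrs), i.e.
 * pfkey_get().
 */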
[SADB_REGISTER] = pfkey_register, [SADB_EXPIRE] = NULL, [SADB_FLUSH] = pfkey_flush, [SADB_DUMP] = pfkey_dump, [SADB_X_PROMISC] = pfkey_promisc, [SADB_X_PCHANGE] = NULL, [SADB_X_SPDUPDATE] = pfkey_spdadd, [SADB_X_SPDADD] = pfkey_spdadd, [SADB_X_SPDDELETE] = pfkey_spddelete, [SADB_X_SPDGET] = pfkey_spdget, [SADB_X_SPDACQUIRE] = NULL, [SADB_X_SPDDUMP] = pfkey_spddump, [SADB_X_SPDFLUSH] = pfkey_spdflush, [SADB_X_SPDSETIDX] = pfkey_spdadd, [SADB_X_SPDDELETE2] = pfkey_spdget, [SADB_X_MIGRATE] = pfkey_migrate, }; static int pfkey_process(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr) { void *ext_hdrs[SADB_EXT_MAX]; int err; /* Non-zero return value of pfkey_broadcast() does not always signal * an error and even on an actual error we may still want to process * the message so rather ignore the return value. */ pfkey_broadcast(skb_clone(skb, GFP_KERNEL), GFP_KERNEL, BROADCAST_PROMISC_ONLY, NULL, sock_net(sk)); memset(ext_hdrs, 0, sizeof(ext_hdrs)); err = parse_exthdrs(skb, hdr, ext_hdrs); if (!err) { err = -EOPNOTSUPP; if (pfkey_funcs[hdr->sadb_msg_type]) err = pfkey_funcs[hdr->sadb_msg_type](sk, skb, hdr, ext_hdrs); } return err; } static struct sadb_msg *pfkey_get_base_msg(struct sk_buff *skb, int *errp) { struct sadb_msg *hdr = NULL; if (skb->len < sizeof(*hdr)) { *errp = -EMSGSIZE; } else { hdr = (struct sadb_msg *) skb->data; if (hdr->sadb_msg_version != PF_KEY_V2 || hdr->sadb_msg_reserved != 0 || (hdr->sadb_msg_type <= SADB_RESERVED || hdr->sadb_msg_type > SADB_MAX)) { hdr = NULL; *errp = -EINVAL; } else if (hdr->sadb_msg_len != (skb->len / sizeof(uint64_t)) || hdr->sadb_msg_len < (sizeof(struct sadb_msg) / sizeof(uint64_t))) { hdr = NULL; *errp = -EMSGSIZE; } else { *errp = 0; } } return hdr; } static inline int aalg_tmpl_set(const struct xfrm_tmpl *t, const struct xfrm_algo_desc *d) { unsigned int id = d->desc.sadb_alg_id; if (id >= sizeof(t->aalgos) * 8) return 0; return (t->aalgos >> id) & 1; } static inline int ealg_tmpl_set(const struct xfrm_tmpl *t, const struct xfrm_algo_desc *d) { unsigned int id = d->desc.sadb_alg_id; if (id >= sizeof(t->ealgos) * 8) return 0; return (t->ealgos >> id) & 1; } static int count_ah_combs(const struct xfrm_tmpl *t) { int i, sz = 0; for (i = 0; ; i++) { const struct xfrm_algo_desc *aalg = xfrm_aalg_get_byidx(i); if (!aalg) break; if (!aalg->pfkey_supported) continue; if (aalg_tmpl_set(t, aalg)) sz += sizeof(struct sadb_comb); } return sz + sizeof(struct sadb_prop); } static int count_esp_combs(const struct xfrm_tmpl *t) { int i, k, sz = 0; for (i = 0; ; i++) { const struct xfrm_algo_desc *ealg = xfrm_ealg_get_byidx(i); if (!ealg) break; if (!ealg->pfkey_supported) continue; if (!(ealg_tmpl_set(t, ealg))) continue; for (k = 1; ; k++) { const struct xfrm_algo_desc *aalg = xfrm_aalg_get_byidx(k); if (!aalg) break; if (!aalg->pfkey_supported) continue; if (aalg_tmpl_set(t, aalg)) sz += sizeof(struct sadb_comb); } } return sz + sizeof(struct sadb_prop); } static int dump_ah_combs(struct sk_buff *skb, const struct xfrm_tmpl *t) { struct sadb_prop *p; int sz = 0; int i; p = skb_put(skb, sizeof(struct sadb_prop)); p->sadb_prop_len = sizeof(struct sadb_prop)/8; p->sadb_prop_exttype = SADB_EXT_PROPOSAL; p->sadb_prop_replay = 32; memset(p->sadb_prop_reserved, 0, sizeof(p->sadb_prop_reserved)); for (i = 0; ; i++) { const struct xfrm_algo_desc *aalg = xfrm_aalg_get_byidx(i); if (!aalg) break; if (!aalg->pfkey_supported) continue; if (aalg_tmpl_set(t, aalg) && aalg->available) { struct sadb_comb *c; c = skb_put_zero(skb, sizeof(struct sadb_comb)); 
p->sadb_prop_len += sizeof(struct sadb_comb)/8; c->sadb_comb_auth = aalg->desc.sadb_alg_id; c->sadb_comb_auth_minbits = aalg->desc.sadb_alg_minbits; c->sadb_comb_auth_maxbits = aalg->desc.sadb_alg_maxbits; c->sadb_comb_hard_addtime = 24*60*60; c->sadb_comb_soft_addtime = 20*60*60; c->sadb_comb_hard_usetime = 8*60*60; c->sadb_comb_soft_usetime = 7*60*60; sz += sizeof(*c); } } return sz + sizeof(*p); } static int dump_esp_combs(struct sk_buff *skb, const struct xfrm_tmpl *t) { struct sadb_prop *p; int sz = 0; int i, k; p = skb_put(skb, sizeof(struct sadb_prop)); p->sadb_prop_len = sizeof(struct sadb_prop)/8; p->sadb_prop_exttype = SADB_EXT_PROPOSAL; p->sadb_prop_replay = 32; memset(p->sadb_prop_reserved, 0, sizeof(p->sadb_prop_reserved)); for (i=0; ; i++) { const struct xfrm_algo_desc *ealg = xfrm_ealg_get_byidx(i); if (!ealg) break; if (!ealg->pfkey_supported) continue; if (!(ealg_tmpl_set(t, ealg) && ealg->available)) continue; for (k = 1; ; k++) { struct sadb_comb *c; const struct xfrm_algo_desc *aalg = xfrm_aalg_get_byidx(k); if (!aalg) break; if (!aalg->pfkey_supported) continue; if (!(aalg_tmpl_set(t, aalg) && aalg->available)) continue; c = skb_put(skb, sizeof(struct sadb_comb)); memset(c, 0, sizeof(*c)); p->sadb_prop_len += sizeof(struct sadb_comb)/8; c->sadb_comb_auth = aalg->desc.sadb_alg_id; c->sadb_comb_auth_minbits = aalg->desc.sadb_alg_minbits; c->sadb_comb_auth_maxbits = aalg->desc.sadb_alg_maxbits; c->sadb_comb_encrypt = ealg->desc.sadb_alg_id; c->sadb_comb_encrypt_minbits = ealg->desc.sadb_alg_minbits; c->sadb_comb_encrypt_maxbits = ealg->desc.sadb_alg_maxbits; c->sadb_comb_hard_addtime = 24*60*60; c->sadb_comb_soft_addtime = 20*60*60; c->sadb_comb_hard_usetime = 8*60*60; c->sadb_comb_soft_usetime = 7*60*60; sz += sizeof(*c); } } return sz + sizeof(*p); } static int key_notify_policy_expire(struct xfrm_policy *xp, const struct km_event *c) { return 0; } static int key_notify_sa_expire(struct xfrm_state *x, const struct km_event *c) { struct sk_buff *out_skb; struct sadb_msg *out_hdr; int hard; int hsc; hard = c->data.hard; if (hard) hsc = 2; else hsc = 1; out_skb = pfkey_xfrm_state2msg_expire(x, hsc); if (IS_ERR(out_skb)) return PTR_ERR(out_skb); out_hdr = (struct sadb_msg *) out_skb->data; out_hdr->sadb_msg_version = PF_KEY_V2; out_hdr->sadb_msg_type = SADB_EXPIRE; out_hdr->sadb_msg_satype = pfkey_proto2satype(x->id.proto); out_hdr->sadb_msg_errno = 0; out_hdr->sadb_msg_reserved = 0; out_hdr->sadb_msg_seq = 0; out_hdr->sadb_msg_pid = 0; pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_REGISTERED, NULL, xs_net(x)); return 0; } static int pfkey_send_notify(struct xfrm_state *x, const struct km_event *c) { struct net *net = x ? 
xs_net(x) : c->net; struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id); if (atomic_read(&net_pfkey->socks_nr) == 0) return 0; switch (c->event) { case XFRM_MSG_EXPIRE: return key_notify_sa_expire(x, c); case XFRM_MSG_DELSA: case XFRM_MSG_NEWSA: case XFRM_MSG_UPDSA: return key_notify_sa(x, c); case XFRM_MSG_FLUSHSA: return key_notify_sa_flush(c); case XFRM_MSG_NEWAE: /* not yet supported */ break; default: pr_err("pfkey: Unknown SA event %d\n", c->event); break; } return 0; } static int pfkey_send_policy_notify(struct xfrm_policy *xp, int dir, const struct km_event *c) { if (xp && xp->type != XFRM_POLICY_TYPE_MAIN) return 0; switch (c->event) { case XFRM_MSG_POLEXPIRE: return key_notify_policy_expire(xp, c); case XFRM_MSG_DELPOLICY: case XFRM_MSG_NEWPOLICY: case XFRM_MSG_UPDPOLICY: return key_notify_policy(xp, dir, c); case XFRM_MSG_FLUSHPOLICY: if (c->data.type != XFRM_POLICY_TYPE_MAIN) break; return key_notify_policy_flush(c); default: pr_err("pfkey: Unknown policy event %d\n", c->event); break; } return 0; } static u32 get_acqseq(void) { u32 res; static atomic_t acqseq; do { res = atomic_inc_return(&acqseq); } while (!res); return res; } static bool pfkey_is_alive(const struct km_event *c) { struct netns_pfkey *net_pfkey = net_generic(c->net, pfkey_net_id); struct sock *sk; bool is_alive = false; rcu_read_lock(); sk_for_each_rcu(sk, &net_pfkey->table) { if (pfkey_sk(sk)->registered) { is_alive = true; break; } } rcu_read_unlock(); return is_alive; } static int pfkey_send_acquire(struct xfrm_state *x, struct xfrm_tmpl *t, struct xfrm_policy *xp) { struct sk_buff *skb; struct sadb_msg *hdr; struct sadb_address *addr; struct sadb_x_policy *pol; int sockaddr_size; int size; struct sadb_x_sec_ctx *sec_ctx; struct xfrm_sec_ctx *xfrm_ctx; int ctx_size = 0; int alg_size = 0; sockaddr_size = pfkey_sockaddr_size(x->props.family); if (!sockaddr_size) return -EINVAL; size = sizeof(struct sadb_msg) + (sizeof(struct sadb_address) * 2) + (sockaddr_size * 2) + sizeof(struct sadb_x_policy); if (x->id.proto == IPPROTO_AH) alg_size = count_ah_combs(t); else if (x->id.proto == IPPROTO_ESP) alg_size = count_esp_combs(t); if ((xfrm_ctx = x->security)) { ctx_size = PFKEY_ALIGN8(xfrm_ctx->ctx_len); size += sizeof(struct sadb_x_sec_ctx) + ctx_size; } skb = alloc_skb(size + alg_size + 16, GFP_ATOMIC); if (skb == NULL) return -ENOMEM; hdr = skb_put(skb, sizeof(struct sadb_msg)); hdr->sadb_msg_version = PF_KEY_V2; hdr->sadb_msg_type = SADB_ACQUIRE; hdr->sadb_msg_satype = pfkey_proto2satype(x->id.proto); hdr->sadb_msg_len = size / sizeof(uint64_t); hdr->sadb_msg_errno = 0; hdr->sadb_msg_reserved = 0; hdr->sadb_msg_seq = x->km.seq = get_acqseq(); hdr->sadb_msg_pid = 0; /* src address */ addr = skb_put(skb, sizeof(struct sadb_address) + sockaddr_size); addr->sadb_address_len = (sizeof(struct sadb_address)+sockaddr_size)/ sizeof(uint64_t); addr->sadb_address_exttype = SADB_EXT_ADDRESS_SRC; addr->sadb_address_proto = 0; addr->sadb_address_reserved = 0; addr->sadb_address_prefixlen = pfkey_sockaddr_fill(&x->props.saddr, 0, (struct sockaddr *) (addr + 1), x->props.family); if (!addr->sadb_address_prefixlen) BUG(); /* dst address */ addr = skb_put(skb, sizeof(struct sadb_address) + sockaddr_size); addr->sadb_address_len = (sizeof(struct sadb_address)+sockaddr_size)/ sizeof(uint64_t); addr->sadb_address_exttype = SADB_EXT_ADDRESS_DST; addr->sadb_address_proto = 0; addr->sadb_address_reserved = 0; addr->sadb_address_prefixlen = pfkey_sockaddr_fill(&x->id.daddr, 0, (struct sockaddr *) (addr + 1), x->props.family); 
if (!addr->sadb_address_prefixlen) BUG(); pol = skb_put(skb, sizeof(struct sadb_x_policy)); pol->sadb_x_policy_len = sizeof(struct sadb_x_policy)/sizeof(uint64_t); pol->sadb_x_policy_exttype = SADB_X_EXT_POLICY; pol->sadb_x_policy_type = IPSEC_POLICY_IPSEC; pol->sadb_x_policy_dir = XFRM_POLICY_OUT + 1; pol->sadb_x_policy_reserved = 0; pol->sadb_x_policy_id = xp->index; pol->sadb_x_policy_priority = xp->priority; /* Set sadb_comb's. */ alg_size = 0; if (x->id.proto == IPPROTO_AH) alg_size = dump_ah_combs(skb, t); else if (x->id.proto == IPPROTO_ESP) alg_size = dump_esp_combs(skb, t); hdr->sadb_msg_len += alg_size / 8; /* security context */ if (xfrm_ctx) { sec_ctx = skb_put(skb, sizeof(struct sadb_x_sec_ctx) + ctx_size); sec_ctx->sadb_x_sec_len = (sizeof(struct sadb_x_sec_ctx) + ctx_size) / sizeof(uint64_t); sec_ctx->sadb_x_sec_exttype = SADB_X_EXT_SEC_CTX; sec_ctx->sadb_x_ctx_doi = xfrm_ctx->ctx_doi; sec_ctx->sadb_x_ctx_alg = xfrm_ctx->ctx_alg; sec_ctx->sadb_x_ctx_len = xfrm_ctx->ctx_len; memcpy(sec_ctx + 1, xfrm_ctx->ctx_str, xfrm_ctx->ctx_len); } return pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_REGISTERED, NULL, xs_net(x)); } static struct xfrm_policy *pfkey_compile_policy(struct sock *sk, int opt, u8 *data, int len, int *dir) { struct net *net = sock_net(sk); struct xfrm_policy *xp; struct sadb_x_policy *pol = (struct sadb_x_policy*)data; struct sadb_x_sec_ctx *sec_ctx; switch (sk->sk_family) { case AF_INET: if (opt != IP_IPSEC_POLICY) { *dir = -EOPNOTSUPP; return NULL; } break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: if (opt != IPV6_IPSEC_POLICY) { *dir = -EOPNOTSUPP; return NULL; } break; #endif default: *dir = -EINVAL; return NULL; } *dir = -EINVAL; if (len < sizeof(struct sadb_x_policy) || pol->sadb_x_policy_len*8 > len || pol->sadb_x_policy_type > IPSEC_POLICY_BYPASS || (!pol->sadb_x_policy_dir || pol->sadb_x_policy_dir > IPSEC_DIR_OUTBOUND)) return NULL; xp = xfrm_policy_alloc(net, GFP_ATOMIC); if (xp == NULL) { *dir = -ENOBUFS; return NULL; } xp->action = (pol->sadb_x_policy_type == IPSEC_POLICY_DISCARD ? XFRM_POLICY_BLOCK : XFRM_POLICY_ALLOW); xp->lft.soft_byte_limit = XFRM_INF; xp->lft.hard_byte_limit = XFRM_INF; xp->lft.soft_packet_limit = XFRM_INF; xp->lft.hard_packet_limit = XFRM_INF; xp->family = sk->sk_family; xp->xfrm_nr = 0; if (pol->sadb_x_policy_type == IPSEC_POLICY_IPSEC && (*dir = parse_ipsecrequests(xp, pol)) < 0) goto out; /* security context too */ if (len >= (pol->sadb_x_policy_len*8 + sizeof(struct sadb_x_sec_ctx))) { char *p = (char *)pol; struct xfrm_user_sec_ctx *uctx; p += pol->sadb_x_policy_len*8; sec_ctx = (struct sadb_x_sec_ctx *)p; if (len < pol->sadb_x_policy_len*8 + sec_ctx->sadb_x_sec_len*8) { *dir = -EINVAL; goto out; } if ((*dir = verify_sec_ctx_len(p))) goto out; uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx, GFP_ATOMIC); *dir = security_xfrm_policy_alloc(&xp->security, uctx, GFP_ATOMIC); kfree(uctx); if (*dir) goto out; } *dir = pol->sadb_x_policy_dir-1; return xp; out: xp->walk.dead = 1; xfrm_policy_destroy(xp); return NULL; } static int pfkey_send_new_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr, __be16 sport) { struct sk_buff *skb; struct sadb_msg *hdr; struct sadb_sa *sa; struct sadb_address *addr; struct sadb_x_nat_t_port *n_port; int sockaddr_size; int size; __u8 satype = (x->id.proto == IPPROTO_ESP ? 
SADB_SATYPE_ESP : 0); struct xfrm_encap_tmpl *natt = NULL; sockaddr_size = pfkey_sockaddr_size(x->props.family); if (!sockaddr_size) return -EINVAL; if (!satype) return -EINVAL; if (!x->encap) return -EINVAL; natt = x->encap; /* Build an SADB_X_NAT_T_NEW_MAPPING message: * * HDR | SA | ADDRESS_SRC (old addr) | NAT_T_SPORT (old port) | * ADDRESS_DST (new addr) | NAT_T_DPORT (new port) */ size = sizeof(struct sadb_msg) + sizeof(struct sadb_sa) + (sizeof(struct sadb_address) * 2) + (sockaddr_size * 2) + (sizeof(struct sadb_x_nat_t_port) * 2); skb = alloc_skb(size + 16, GFP_ATOMIC); if (skb == NULL) return -ENOMEM; hdr = skb_put(skb, sizeof(struct sadb_msg)); hdr->sadb_msg_version = PF_KEY_V2; hdr->sadb_msg_type = SADB_X_NAT_T_NEW_MAPPING; hdr->sadb_msg_satype = satype; hdr->sadb_msg_len = size / sizeof(uint64_t); hdr->sadb_msg_errno = 0; hdr->sadb_msg_reserved = 0; hdr->sadb_msg_seq = x->km.seq; hdr->sadb_msg_pid = 0; /* SA */ sa = skb_put(skb, sizeof(struct sadb_sa)); sa->sadb_sa_len = sizeof(struct sadb_sa)/sizeof(uint64_t); sa->sadb_sa_exttype = SADB_EXT_SA; sa->sadb_sa_spi = x->id.spi; sa->sadb_sa_replay = 0; sa->sadb_sa_state = 0; sa->sadb_sa_auth = 0; sa->sadb_sa_encrypt = 0; sa->sadb_sa_flags = 0; /* ADDRESS_SRC (old addr) */ addr = skb_put(skb, sizeof(struct sadb_address) + sockaddr_size); addr->sadb_address_len = (sizeof(struct sadb_address)+sockaddr_size)/ sizeof(uint64_t); addr->sadb_address_exttype = SADB_EXT_ADDRESS_SRC; addr->sadb_address_proto = 0; addr->sadb_address_reserved = 0; addr->sadb_address_prefixlen = pfkey_sockaddr_fill(&x->props.saddr, 0, (struct sockaddr *) (addr + 1), x->props.family); if (!addr->sadb_address_prefixlen) BUG(); /* NAT_T_SPORT (old port) */ n_port = skb_put(skb, sizeof(*n_port)); n_port->sadb_x_nat_t_port_len = sizeof(*n_port)/sizeof(uint64_t); n_port->sadb_x_nat_t_port_exttype = SADB_X_EXT_NAT_T_SPORT; n_port->sadb_x_nat_t_port_port = natt->encap_sport; n_port->sadb_x_nat_t_port_reserved = 0; /* ADDRESS_DST (new addr) */ addr = skb_put(skb, sizeof(struct sadb_address) + sockaddr_size); addr->sadb_address_len = (sizeof(struct sadb_address)+sockaddr_size)/ sizeof(uint64_t); addr->sadb_address_exttype = SADB_EXT_ADDRESS_DST; addr->sadb_address_proto = 0; addr->sadb_address_reserved = 0; addr->sadb_address_prefixlen = pfkey_sockaddr_fill(ipaddr, 0, (struct sockaddr *) (addr + 1), x->props.family); if (!addr->sadb_address_prefixlen) BUG(); /* NAT_T_DPORT (new port) */ n_port = skb_put(skb, sizeof(*n_port)); n_port->sadb_x_nat_t_port_len = sizeof(*n_port)/sizeof(uint64_t); n_port->sadb_x_nat_t_port_exttype = SADB_X_EXT_NAT_T_DPORT; n_port->sadb_x_nat_t_port_port = sport; n_port->sadb_x_nat_t_port_reserved = 0; return pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_REGISTERED, NULL, xs_net(x)); } #ifdef CONFIG_NET_KEY_MIGRATE static int set_sadb_address(struct sk_buff *skb, int sasize, int type, const struct xfrm_selector *sel) { struct sadb_address *addr; addr = skb_put(skb, sizeof(struct sadb_address) + sasize); addr->sadb_address_len = (sizeof(struct sadb_address) + sasize)/8; addr->sadb_address_exttype = type; addr->sadb_address_proto = sel->proto; addr->sadb_address_reserved = 0; switch (type) { case SADB_EXT_ADDRESS_SRC: addr->sadb_address_prefixlen = sel->prefixlen_s; pfkey_sockaddr_fill(&sel->saddr, 0, (struct sockaddr *)(addr + 1), sel->family); break; case SADB_EXT_ADDRESS_DST: addr->sadb_address_prefixlen = sel->prefixlen_d; pfkey_sockaddr_fill(&sel->daddr, 0, (struct sockaddr *)(addr + 1), sel->family); break; default: return -EINVAL; } return 0; } 
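/*
 * Illustrative sketch, not part of af_key.c: every extension builder in this
 * file (set_sadb_address(), set_sadb_kmaddress(), the sadb_address and
 * sadb_x_policy code earlier) encodes its length in 64-bit units, e.g.
 * "(sizeof(struct sadb_address) + sasize)/8". A hypothetical helper naming
 * that RFC 2367 convention explicitly could look like this:
 */
static inline u16 pfkey_ext_len64(size_t bytes)
{
	/* PF_KEY v2 extension lengths are counted in 8-byte words */
	return bytes / sizeof(uint64_t);
}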
static int set_sadb_kmaddress(struct sk_buff *skb, const struct xfrm_kmaddress *k) { struct sadb_x_kmaddress *kma; u8 *sa; int family = k->family; int socklen = pfkey_sockaddr_len(family); int size_req; size_req = (sizeof(struct sadb_x_kmaddress) + pfkey_sockaddr_pair_size(family)); kma = skb_put_zero(skb, size_req); kma->sadb_x_kmaddress_len = size_req / 8; kma->sadb_x_kmaddress_exttype = SADB_X_EXT_KMADDRESS; kma->sadb_x_kmaddress_reserved = k->reserved; sa = (u8 *)(kma + 1); if (!pfkey_sockaddr_fill(&k->local, 0, (struct sockaddr *)sa, family) || !pfkey_sockaddr_fill(&k->remote, 0, (struct sockaddr *)(sa+socklen), family)) return -EINVAL; return 0; } static int set_ipsecrequest(struct sk_buff *skb, uint8_t proto, uint8_t mode, int level, uint32_t reqid, uint8_t family, const xfrm_address_t *src, const xfrm_address_t *dst) { struct sadb_x_ipsecrequest *rq; u8 *sa; int socklen = pfkey_sockaddr_len(family); int size_req; size_req = sizeof(struct sadb_x_ipsecrequest) + pfkey_sockaddr_pair_size(family); rq = skb_put_zero(skb, size_req); rq->sadb_x_ipsecrequest_len = size_req; rq->sadb_x_ipsecrequest_proto = proto; rq->sadb_x_ipsecrequest_mode = mode; rq->sadb_x_ipsecrequest_level = level; rq->sadb_x_ipsecrequest_reqid = reqid; sa = (u8 *) (rq + 1); if (!pfkey_sockaddr_fill(src, 0, (struct sockaddr *)sa, family) || !pfkey_sockaddr_fill(dst, 0, (struct sockaddr *)(sa + socklen), family)) return -EINVAL; return 0; } #endif #ifdef CONFIG_NET_KEY_MIGRATE static int pfkey_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, const struct xfrm_migrate *m, int num_bundles, const struct xfrm_kmaddress *k, const struct xfrm_encap_tmpl *encap) { int i; int sasize_sel; int size = 0; int size_pol = 0; struct sk_buff *skb; struct sadb_msg *hdr; struct sadb_x_policy *pol; const struct xfrm_migrate *mp; if (type != XFRM_POLICY_TYPE_MAIN) return 0; if (num_bundles <= 0 || num_bundles > XFRM_MAX_DEPTH) return -EINVAL; if (k != NULL) { /* addresses for KM */ size += PFKEY_ALIGN8(sizeof(struct sadb_x_kmaddress) + pfkey_sockaddr_pair_size(k->family)); } /* selector */ sasize_sel = pfkey_sockaddr_size(sel->family); if (!sasize_sel) return -EINVAL; size += (sizeof(struct sadb_address) + sasize_sel) * 2; /* policy info */ size_pol += sizeof(struct sadb_x_policy); /* ipsecrequests */ for (i = 0, mp = m; i < num_bundles; i++, mp++) { /* old locator pair */ size_pol += sizeof(struct sadb_x_ipsecrequest) + pfkey_sockaddr_pair_size(mp->old_family); /* new locator pair */ size_pol += sizeof(struct sadb_x_ipsecrequest) + pfkey_sockaddr_pair_size(mp->new_family); } size += sizeof(struct sadb_msg) + size_pol; /* alloc buffer */ skb = alloc_skb(size, GFP_ATOMIC); if (skb == NULL) return -ENOMEM; hdr = skb_put(skb, sizeof(struct sadb_msg)); hdr->sadb_msg_version = PF_KEY_V2; hdr->sadb_msg_type = SADB_X_MIGRATE; hdr->sadb_msg_satype = pfkey_proto2satype(m->proto); hdr->sadb_msg_len = size / 8; hdr->sadb_msg_errno = 0; hdr->sadb_msg_reserved = 0; hdr->sadb_msg_seq = 0; hdr->sadb_msg_pid = 0; /* Addresses to be used by KM for negotiation, if ext is available */ if (k != NULL && (set_sadb_kmaddress(skb, k) < 0)) goto err; /* selector src */ set_sadb_address(skb, sasize_sel, SADB_EXT_ADDRESS_SRC, sel); /* selector dst */ set_sadb_address(skb, sasize_sel, SADB_EXT_ADDRESS_DST, sel); /* policy information */ pol = skb_put(skb, sizeof(struct sadb_x_policy)); pol->sadb_x_policy_len = size_pol / 8; pol->sadb_x_policy_exttype = SADB_X_EXT_POLICY; pol->sadb_x_policy_type = IPSEC_POLICY_IPSEC; pol->sadb_x_policy_dir = dir + 1; 
pol->sadb_x_policy_reserved = 0; pol->sadb_x_policy_id = 0; pol->sadb_x_policy_priority = 0; for (i = 0, mp = m; i < num_bundles; i++, mp++) { /* old ipsecrequest */ int mode = pfkey_mode_from_xfrm(mp->mode); if (mode < 0) goto err; if (set_ipsecrequest(skb, mp->proto, mode, (mp->reqid ? IPSEC_LEVEL_UNIQUE : IPSEC_LEVEL_REQUIRE), mp->reqid, mp->old_family, &mp->old_saddr, &mp->old_daddr) < 0) goto err; /* new ipsecrequest */ if (set_ipsecrequest(skb, mp->proto, mode, (mp->reqid ? IPSEC_LEVEL_UNIQUE : IPSEC_LEVEL_REQUIRE), mp->reqid, mp->new_family, &mp->new_saddr, &mp->new_daddr) < 0) goto err; } /* broadcast migrate message to sockets */ pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ALL, NULL, &init_net); return 0; err: kfree_skb(skb); return -EINVAL; } #else static int pfkey_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, const struct xfrm_migrate *m, int num_bundles, const struct xfrm_kmaddress *k, const struct xfrm_encap_tmpl *encap) { return -ENOPROTOOPT; } #endif static int pfkey_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct sk_buff *skb = NULL; struct sadb_msg *hdr = NULL; int err; struct net *net = sock_net(sk); err = -EOPNOTSUPP; if (msg->msg_flags & MSG_OOB) goto out; err = -EMSGSIZE; if ((unsigned int)len > sk->sk_sndbuf - 32) goto out; err = -ENOBUFS; skb = alloc_skb(len, GFP_KERNEL); if (skb == NULL) goto out; err = -EFAULT; if (memcpy_from_msg(skb_put(skb,len), msg, len)) goto out; hdr = pfkey_get_base_msg(skb, &err); if (!hdr) goto out; mutex_lock(&net->xfrm.xfrm_cfg_mutex); err = pfkey_process(sk, skb, hdr); mutex_unlock(&net->xfrm.xfrm_cfg_mutex); out: if (err && hdr && pfkey_error(hdr, err, sk) == 0) err = 0; kfree_skb(skb); return err ? : len; } static int pfkey_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { struct sock *sk = sock->sk; struct pfkey_sock *pfk = pfkey_sk(sk); struct sk_buff *skb; int copied, err; err = -EINVAL; if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT)) goto out; skb = skb_recv_datagram(sk, flags, &err); if (skb == NULL) goto out; copied = skb->len; if (copied > len) { msg->msg_flags |= MSG_TRUNC; copied = len; } skb_reset_transport_header(skb); err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto out_free; sock_recv_cmsgs(msg, sk, skb); err = (flags & MSG_TRUNC) ? skb->len : copied; if (pfk->dump.dump != NULL && 3 * atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) pfkey_do_dump(pfk); out_free: skb_free_datagram(sk, skb); out: return err; } static const struct proto_ops pfkey_ops = { .family = PF_KEY, .owner = THIS_MODULE, /* Operations that make no sense on pfkey sockets. */ .bind = sock_no_bind, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .mmap = sock_no_mmap, /* Now the operations that really occur. 
*/ .release = pfkey_release, .poll = datagram_poll, .sendmsg = pfkey_sendmsg, .recvmsg = pfkey_recvmsg, }; static const struct net_proto_family pfkey_family_ops = { .family = PF_KEY, .create = pfkey_create, .owner = THIS_MODULE, }; #ifdef CONFIG_PROC_FS static int pfkey_seq_show(struct seq_file *f, void *v) { struct sock *s = sk_entry(v); if (v == SEQ_START_TOKEN) seq_printf(f ,"sk RefCnt Rmem Wmem User Inode\n"); else seq_printf(f, "%pK %-6d %-6u %-6u %-6u %-6lu\n", s, refcount_read(&s->sk_refcnt), sk_rmem_alloc_get(s), sk_wmem_alloc_get(s), from_kuid_munged(seq_user_ns(f), sock_i_uid(s)), sock_i_ino(s) ); return 0; } static void *pfkey_seq_start(struct seq_file *f, loff_t *ppos) __acquires(rcu) { struct net *net = seq_file_net(f); struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id); rcu_read_lock(); return seq_hlist_start_head_rcu(&net_pfkey->table, *ppos); } static void *pfkey_seq_next(struct seq_file *f, void *v, loff_t *ppos) { struct net *net = seq_file_net(f); struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id); return seq_hlist_next_rcu(v, &net_pfkey->table, ppos); } static void pfkey_seq_stop(struct seq_file *f, void *v) __releases(rcu) { rcu_read_unlock(); } static const struct seq_operations pfkey_seq_ops = { .start = pfkey_seq_start, .next = pfkey_seq_next, .stop = pfkey_seq_stop, .show = pfkey_seq_show, }; static int __net_init pfkey_init_proc(struct net *net) { struct proc_dir_entry *e; e = proc_create_net("pfkey", 0, net->proc_net, &pfkey_seq_ops, sizeof(struct seq_net_private)); if (e == NULL) return -ENOMEM; return 0; } static void __net_exit pfkey_exit_proc(struct net *net) { remove_proc_entry("pfkey", net->proc_net); } #else static inline int pfkey_init_proc(struct net *net) { return 0; } static inline void pfkey_exit_proc(struct net *net) { } #endif static struct xfrm_mgr pfkeyv2_mgr = { .notify = pfkey_send_notify, .acquire = pfkey_send_acquire, .compile_policy = pfkey_compile_policy, .new_mapping = pfkey_send_new_mapping, .notify_policy = pfkey_send_policy_notify, .migrate = pfkey_send_migrate, .is_alive = pfkey_is_alive, }; static int __net_init pfkey_net_init(struct net *net) { struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id); int rv; INIT_HLIST_HEAD(&net_pfkey->table); atomic_set(&net_pfkey->socks_nr, 0); rv = pfkey_init_proc(net); return rv; } static void __net_exit pfkey_net_exit(struct net *net) { struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id); pfkey_exit_proc(net); WARN_ON(!hlist_empty(&net_pfkey->table)); } static struct pernet_operations pfkey_net_ops = { .init = pfkey_net_init, .exit = pfkey_net_exit, .id = &pfkey_net_id, .size = sizeof(struct netns_pfkey), }; static void __exit ipsec_pfkey_exit(void) { xfrm_unregister_km(&pfkeyv2_mgr); sock_unregister(PF_KEY); unregister_pernet_subsys(&pfkey_net_ops); proto_unregister(&key_proto); } static int __init ipsec_pfkey_init(void) { int err = proto_register(&key_proto, 0); if (err != 0) goto out; err = register_pernet_subsys(&pfkey_net_ops); if (err != 0) goto out_unregister_key_proto; err = sock_register(&pfkey_family_ops); if (err != 0) goto out_unregister_pernet; xfrm_register_km(&pfkeyv2_mgr); out: return err; out_unregister_pernet: unregister_pernet_subsys(&pfkey_net_ops); out_unregister_key_proto: proto_unregister(&key_proto); goto out; } module_init(ipsec_pfkey_init); module_exit(ipsec_pfkey_exit); MODULE_DESCRIPTION("PF_KEY socket helpers"); MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(PF_KEY);
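/*
 * Illustrative userspace sketch, not part of the kernel sources in this
 * listing: a minimal PF_KEY v2 client that exercises the pfkey_sendmsg() /
 * pfkey_get_base_msg() / pfkey_dump() path above by requesting an SADB_DUMP.
 * Assumes CAP_NET_ADMIN (PF_KEY sockets are privileged); error handling is
 * trimmed. Lengths are in 64-bit words, as validated by pfkey_get_base_msg().
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/pfkeyv2.h>

int main(void)
{
	struct sadb_msg msg;
	char buf[4096];
	int s = socket(PF_KEY, SOCK_RAW, PF_KEY_V2);

	if (s < 0)
		return 1;

	memset(&msg, 0, sizeof(msg));
	msg.sadb_msg_version = PF_KEY_V2;	/* checked by pfkey_get_base_msg() */
	msg.sadb_msg_type = SADB_DUMP;		/* dispatched through pfkey_funcs[] */
	msg.sadb_msg_satype = SADB_SATYPE_UNSPEC;
	msg.sadb_msg_len = sizeof(msg) / 8;	/* base header only, in 64-bit words */
	msg.sadb_msg_pid = getpid();

	if (write(s, &msg, sizeof(msg)) != sizeof(msg))
		return 1;

	/*
	 * Each SA comes back as one SADB_DUMP reply; Linux conventionally marks
	 * the final reply of a completed dump with seq == 0, and an empty or
	 * failed dump comes back with sadb_msg_errno set.
	 */
	for (;;) {
		ssize_t n = read(s, buf, sizeof(buf));
		struct sadb_msg *r = (struct sadb_msg *)buf;

		if (n < (ssize_t)sizeof(*r))
			break;
		printf("satype %u errno %u seq %u\n",
		       r->sadb_msg_satype, r->sadb_msg_errno, r->sadb_msg_seq);
		if (r->sadb_msg_errno || r->sadb_msg_seq == 0)
			break;
	}
	close(s);
	return 0;
}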
// SPDX-License-Identifier: GPL-2.0 /* * This is a maximally equidistributed combined Tausworthe generator * based on code from GNU Scientific Library 1.5 (30 Jun 2004) * * lfsr113 version: * * x_n = (s1_n ^ s2_n ^ s3_n ^ s4_n) * * s1_{n+1} = (((s1_n & 4294967294) << 18) ^ (((s1_n << 6) ^ s1_n) >> 13)) * s2_{n+1} = (((s2_n & 4294967288) << 2) ^ (((s2_n << 2) ^ s2_n) >> 27)) * s3_{n+1} = (((s3_n & 4294967280) << 7) ^ (((s3_n << 13) ^ s3_n) >> 21)) * s4_{n+1} = (((s4_n & 4294967168) << 13) ^ (((s4_n << 3) ^ s4_n) >> 12)) * * The period of this generator is about 2^113 (see erratum paper). * * From: P. L'Ecuyer, "Maximally Equidistributed Combined Tausworthe * Generators", Mathematics of Computation, 65, 213 (1996), 203--213: * http://www.iro.umontreal.ca/~lecuyer/myftp/papers/tausme.ps * ftp://ftp.iro.umontreal.ca/pub/simulation/lecuyer/papers/tausme.ps * * There is an erratum in the paper "Tables of Maximally Equidistributed * Combined LFSR Generators", Mathematics of Computation, 68, 225 (1999), * 261--269: http://www.iro.umontreal.ca/~lecuyer/myftp/papers/tausme2.ps * * ... the k_j most significant bits of z_j must be non-zero, * for each j. (Note: this restriction also applies to the * computer code given in [4], but was mistakenly not mentioned * in that paper.) * * This affects the seeding procedure by imposing the requirement * s1 > 1, s2 > 7, s3 > 15, s4 > 127. */ #include <linux/types.h> #include <linux/percpu.h> #include <linux/export.h> #include <linux/jiffies.h> #include <linux/prandom.h> #include <linux/sched.h> #include <linux/bitops.h> #include <linux/slab.h> #include <linux/unaligned.h> /** * prandom_u32_state - seeded pseudo-random number generator. * @state: pointer to state structure holding seeded state. * * This is used for pseudo-randomness with no outside seeding. * For more random results, use get_random_u32().
*/ u32 prandom_u32_state(struct rnd_state *state) { #define TAUSWORTHE(s, a, b, c, d) ((s & c) << d) ^ (((s << a) ^ s) >> b) state->s1 = TAUSWORTHE(state->s1, 6U, 13U, 4294967294U, 18U); state->s2 = TAUSWORTHE(state->s2, 2U, 27U, 4294967288U, 2U); state->s3 = TAUSWORTHE(state->s3, 13U, 21U, 4294967280U, 7U); state->s4 = TAUSWORTHE(state->s4, 3U, 12U, 4294967168U, 13U); return (state->s1 ^ state->s2 ^ state->s3 ^ state->s4); } EXPORT_SYMBOL(prandom_u32_state); /** * prandom_bytes_state - get the requested number of pseudo-random bytes * * @state: pointer to state structure holding seeded state. * @buf: where to copy the pseudo-random bytes to * @bytes: the requested number of bytes * * This is used for pseudo-randomness with no outside seeding. * For more random results, use get_random_bytes(). */ void prandom_bytes_state(struct rnd_state *state, void *buf, size_t bytes) { u8 *ptr = buf; while (bytes >= sizeof(u32)) { put_unaligned(prandom_u32_state(state), (u32 *) ptr); ptr += sizeof(u32); bytes -= sizeof(u32); } if (bytes > 0) { u32 rem = prandom_u32_state(state); do { *ptr++ = (u8) rem; bytes--; rem >>= BITS_PER_BYTE; } while (bytes > 0); } } EXPORT_SYMBOL(prandom_bytes_state); static void prandom_warmup(struct rnd_state *state) { /* Calling RNG ten times to satisfy recurrence condition */ prandom_u32_state(state); prandom_u32_state(state); prandom_u32_state(state); prandom_u32_state(state); prandom_u32_state(state); prandom_u32_state(state); prandom_u32_state(state); prandom_u32_state(state); prandom_u32_state(state); prandom_u32_state(state); } void prandom_seed_full_state(struct rnd_state __percpu *pcpu_state) { int i; for_each_possible_cpu(i) { struct rnd_state *state = per_cpu_ptr(pcpu_state, i); u32 seeds[4]; get_random_bytes(&seeds, sizeof(seeds)); state->s1 = __seed(seeds[0], 2U); state->s2 = __seed(seeds[1], 8U); state->s3 = __seed(seeds[2], 16U); state->s4 = __seed(seeds[3], 128U); prandom_warmup(state); } } EXPORT_SYMBOL(prandom_seed_full_state); #ifdef CONFIG_RANDOM32_SELFTEST static struct prandom_test1 { u32 seed; u32 result; } test1[] = { { 1U, 3484351685U }, { 2U, 2623130059U }, { 3U, 3125133893U }, { 4U, 984847254U }, }; static struct prandom_test2 { u32 seed; u32 iteration; u32 result; } test2[] = { /* Test cases against taus113 from GSL library. 
*/ { 931557656U, 959U, 2975593782U }, { 1339693295U, 876U, 3887776532U }, { 1545556285U, 961U, 1615538833U }, { 601730776U, 723U, 1776162651U }, { 1027516047U, 687U, 511983079U }, { 416526298U, 700U, 916156552U }, { 1395522032U, 652U, 2222063676U }, { 366221443U, 617U, 2992857763U }, { 1539836965U, 714U, 3783265725U }, { 556206671U, 994U, 799626459U }, { 684907218U, 799U, 367789491U }, { 2121230701U, 931U, 2115467001U }, { 1668516451U, 644U, 3620590685U }, { 768046066U, 883U, 2034077390U }, { 1989159136U, 833U, 1195767305U }, { 536585145U, 996U, 3577259204U }, { 1008129373U, 642U, 1478080776U }, { 1740775604U, 939U, 1264980372U }, { 1967883163U, 508U, 10734624U }, { 1923019697U, 730U, 3821419629U }, { 442079932U, 560U, 3440032343U }, { 1961302714U, 845U, 841962572U }, { 2030205964U, 962U, 1325144227U }, { 1160407529U, 507U, 240940858U }, { 635482502U, 779U, 4200489746U }, { 1252788931U, 699U, 867195434U }, { 1961817131U, 719U, 668237657U }, { 1071468216U, 983U, 917876630U }, { 1281848367U, 932U, 1003100039U }, { 582537119U, 780U, 1127273778U }, { 1973672777U, 853U, 1071368872U }, { 1896756996U, 762U, 1127851055U }, { 847917054U, 500U, 1717499075U }, { 1240520510U, 951U, 2849576657U }, { 1685071682U, 567U, 1961810396U }, { 1516232129U, 557U, 3173877U }, { 1208118903U, 612U, 1613145022U }, { 1817269927U, 693U, 4279122573U }, { 1510091701U, 717U, 638191229U }, { 365916850U, 807U, 600424314U }, { 399324359U, 702U, 1803598116U }, { 1318480274U, 779U, 2074237022U }, { 697758115U, 840U, 1483639402U }, { 1696507773U, 840U, 577415447U }, { 2081979121U, 981U, 3041486449U }, { 955646687U, 742U, 3846494357U }, { 1250683506U, 749U, 836419859U }, { 595003102U, 534U, 366794109U }, { 47485338U, 558U, 3521120834U }, { 619433479U, 610U, 3991783875U }, { 704096520U, 518U, 4139493852U }, { 1712224984U, 606U, 2393312003U }, { 1318233152U, 922U, 3880361134U }, { 855572992U, 761U, 1472974787U }, { 64721421U, 703U, 683860550U }, { 678931758U, 840U, 380616043U }, { 692711973U, 778U, 1382361947U }, { 677703619U, 530U, 2826914161U }, { 92393223U, 586U, 1522128471U }, { 1222592920U, 743U, 3466726667U }, { 358288986U, 695U, 1091956998U }, { 1935056945U, 958U, 514864477U }, { 735675993U, 990U, 1294239989U }, { 1560089402U, 897U, 2238551287U }, { 70616361U, 829U, 22483098U }, { 368234700U, 731U, 2913875084U }, { 20221190U, 879U, 1564152970U }, { 539444654U, 682U, 1835141259U }, { 1314987297U, 840U, 1801114136U }, { 2019295544U, 645U, 3286438930U }, { 469023838U, 716U, 1637918202U }, { 1843754496U, 653U, 2562092152U }, { 400672036U, 809U, 4264212785U }, { 404722249U, 965U, 2704116999U }, { 600702209U, 758U, 584979986U }, { 519953954U, 667U, 2574436237U }, { 1658071126U, 694U, 2214569490U }, { 420480037U, 749U, 3430010866U }, { 690103647U, 969U, 3700758083U }, { 1029424799U, 937U, 3787746841U }, { 2012608669U, 506U, 3362628973U }, { 1535432887U, 998U, 42610943U }, { 1330635533U, 857U, 3040806504U }, { 1223800550U, 539U, 3954229517U }, { 1322411537U, 680U, 3223250324U }, { 1877847898U, 945U, 2915147143U }, { 1646356099U, 874U, 965988280U }, { 805687536U, 744U, 4032277920U }, { 1948093210U, 633U, 1346597684U }, { 392609744U, 783U, 1636083295U }, { 690241304U, 770U, 1201031298U }, { 1360302965U, 696U, 1665394461U }, { 1220090946U, 780U, 1316922812U }, { 447092251U, 500U, 3438743375U }, { 1613868791U, 592U, 828546883U }, { 523430951U, 548U, 2552392304U }, { 726692899U, 810U, 1656872867U }, { 1364340021U, 836U, 3710513486U }, { 1986257729U, 931U, 935013962U }, { 407983964U, 921U, 728767059U }, }; static void 
prandom_state_selftest_seed(struct rnd_state *state, u32 seed) { #define LCG(x) ((x) * 69069U) /* super-duper LCG */ state->s1 = __seed(LCG(seed), 2U); state->s2 = __seed(LCG(state->s1), 8U); state->s3 = __seed(LCG(state->s2), 16U); state->s4 = __seed(LCG(state->s3), 128U); } static int __init prandom_state_selftest(void) { int i, j, errors = 0, runs = 0; bool error = false; for (i = 0; i < ARRAY_SIZE(test1); i++) { struct rnd_state state; prandom_state_selftest_seed(&state, test1[i].seed); prandom_warmup(&state); if (test1[i].result != prandom_u32_state(&state)) error = true; } if (error) pr_warn("prandom: seed boundary self test failed\n"); else pr_info("prandom: seed boundary self test passed\n"); for (i = 0; i < ARRAY_SIZE(test2); i++) { struct rnd_state state; prandom_state_selftest_seed(&state, test2[i].seed); prandom_warmup(&state); for (j = 0; j < test2[i].iteration - 1; j++) prandom_u32_state(&state); if (test2[i].result != prandom_u32_state(&state)) errors++; runs++; cond_resched(); } if (errors) pr_warn("prandom: %d/%d self tests failed\n", errors, runs); else pr_info("prandom: %d self tests passed\n", runs); return 0; } core_initcall(prandom_state_selftest); #endif
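/*
 * Illustrative sketch, not part of random32.c: minimal in-kernel use of the
 * prandom_*_state() API exported above. prandom_seed_state() (from
 * <linux/prandom.h>) applies __seed(), so the lfsr113 restriction
 * s1 > 1, s2 > 7, s3 > 15, s4 > 127 from the header comment holds; the seed
 * value 42 is arbitrary and the function name is hypothetical.
 */
#include <linux/types.h>
#include <linux/prandom.h>
#include <linux/printk.h>

static void prandom_state_example(void)
{
	struct rnd_state state;
	u8 buf[16];

	/* reproducible stream: the same seed always yields the same sequence */
	prandom_seed_state(&state, 42);
	pr_info("first draw: %u\n", prandom_u32_state(&state));

	/* fill a small buffer with pseudo-random bytes from the same state */
	prandom_bytes_state(&state, buf, sizeof(buf));
}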
// SPDX-License-Identifier: GPL-2.0-or-later /* * HID support for Linux * * Copyright (c) 1999 Andreas Gal * Copyright (c) 2000-2005 Vojtech Pavlik <vojtech@suse.cz> * Copyright (c) 2005 Michael Haboustak <mike-@cinci.rr.com> for Concept2, Inc * Copyright (c) 2006-2012 Jiri Kosina */ /* */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/list.h> #include <linux/mm.h> #include <linux/spinlock.h> #include <linux/unaligned.h> #include <asm/byteorder.h> #include <linux/input.h> #include <linux/wait.h> #include <linux/vmalloc.h> #include <linux/sched.h> #include <linux/semaphore.h> #include <linux/hid.h> #include <linux/hiddev.h> #include <linux/hid-debug.h> #include <linux/hidraw.h> #include "hid-ids.h" /* * Version Information */ #define DRIVER_DESC "HID core driver" static int hid_ignore_special_drivers = 0; module_param_named(ignore_special_drivers, hid_ignore_special_drivers, int, 0600); MODULE_PARM_DESC(ignore_special_drivers, "Ignore any special drivers and handle all devices by generic driver"); /* * Convert a signed n-bit integer to signed 32-bit integer. */ static s32 snto32(__u32 value, unsigned int n) { if (!value || !n) return 0; if (n > 32) n = 32; return sign_extend32(value, n - 1); } /* * Convert a signed 32-bit integer to a signed n-bit integer. */ static u32 s32ton(__s32 value, unsigned int n) { s32 a = value >> (n - 1); if (a && a != -1) return value < 0 ? 1 << (n - 1) : (1 << (n - 1)) - 1; return value & ((1 << n) - 1); } /* * Register a new report for a device. */ struct hid_report *hid_register_report(struct hid_device *device, enum hid_report_type type, unsigned int id, unsigned int application) { struct hid_report_enum *report_enum = device->report_enum + type; struct hid_report *report; if (id >= HID_MAX_IDS) return NULL; if (report_enum->report_id_hash[id]) return report_enum->report_id_hash[id]; report = kzalloc(sizeof(struct hid_report), GFP_KERNEL); if (!report) return NULL; if (id != 0) report_enum->numbered = 1; report->id = id; report->type = type; report->size = 0; report->device = device; report->application = application; report_enum->report_id_hash[id] = report; list_add_tail(&report->list, &report_enum->report_list); INIT_LIST_HEAD(&report->field_entry_list); return report; } EXPORT_SYMBOL_GPL(hid_register_report); /* * Register a new field for this report. */ static struct hid_field *hid_register_field(struct hid_report *report, unsigned usages) { struct hid_field *field; if (report->maxfield == HID_MAX_FIELDS) { hid_err(report->device, "too many fields in report\n"); return NULL; } field = kvzalloc((sizeof(struct hid_field) + usages * sizeof(struct hid_usage) + 3 * usages * sizeof(unsigned int)), GFP_KERNEL); if (!field) return NULL; field->index = report->maxfield++; report->field[field->index] = field; field->usage = (struct hid_usage *)(field + 1); field->value = (s32 *)(field->usage + usages); field->new_value = (s32 *)(field->value + usages); field->usages_priorities = (s32 *)(field->new_value + usages); field->report = report; return field; } /* * Open a collection.
The type/usage is pushed on the stack. */ static int open_collection(struct hid_parser *parser, unsigned type) { struct hid_collection *collection; unsigned usage; int collection_index; usage = parser->local.usage[0]; if (parser->collection_stack_ptr == parser->collection_stack_size) { unsigned int *collection_stack; unsigned int new_size = parser->collection_stack_size + HID_COLLECTION_STACK_SIZE; collection_stack = krealloc(parser->collection_stack, new_size * sizeof(unsigned int), GFP_KERNEL); if (!collection_stack) return -ENOMEM; parser->collection_stack = collection_stack; parser->collection_stack_size = new_size; } if (parser->device->maxcollection == parser->device->collection_size) { collection = kmalloc( array3_size(sizeof(struct hid_collection), parser->device->collection_size, 2), GFP_KERNEL); if (collection == NULL) { hid_err(parser->device, "failed to reallocate collection array\n"); return -ENOMEM; } memcpy(collection, parser->device->collection, sizeof(struct hid_collection) * parser->device->collection_size); memset(collection + parser->device->collection_size, 0, sizeof(struct hid_collection) * parser->device->collection_size); kfree(parser->device->collection); parser->device->collection = collection; parser->device->collection_size *= 2; } parser->collection_stack[parser->collection_stack_ptr++] = parser->device->maxcollection; collection_index = parser->device->maxcollection++; collection = parser->device->collection + collection_index; collection->type = type; collection->usage = usage; collection->level = parser->collection_stack_ptr - 1; collection->parent_idx = (collection->level == 0) ? -1 : parser->collection_stack[collection->level - 1]; if (type == HID_COLLECTION_APPLICATION) parser->device->maxapplication++; return 0; } /* * Close a collection. */ static int close_collection(struct hid_parser *parser) { if (!parser->collection_stack_ptr) { hid_err(parser->device, "collection stack underflow\n"); return -EINVAL; } parser->collection_stack_ptr--; return 0; } /* * Climb up the stack, search for the specified collection type * and return the usage. */ static unsigned hid_lookup_collection(struct hid_parser *parser, unsigned type) { struct hid_collection *collection = parser->device->collection; int n; for (n = parser->collection_stack_ptr - 1; n >= 0; n--) { unsigned index = parser->collection_stack[n]; if (collection[index].type == type) return collection[index].usage; } return 0; /* we know nothing about this usage type */ } /* * Concatenate usage which defines 16 bits or less with the * currently defined usage page to form a 32 bit usage */ static void complete_usage(struct hid_parser *parser, unsigned int index) { parser->local.usage[index] &= 0xFFFF; parser->local.usage[index] |= (parser->global.usage_page & 0xFFFF) << 16; } /* * Add a usage to the temporary parser table. */ static int hid_add_usage(struct hid_parser *parser, unsigned usage, u8 size) { if (parser->local.usage_index >= HID_MAX_USAGES) { hid_err(parser->device, "usage index exceeded\n"); return -1; } parser->local.usage[parser->local.usage_index] = usage; /* * If Usage item only includes usage id, concatenate it with * currently defined usage page */ if (size <= 2) complete_usage(parser, parser->local.usage_index); parser->local.usage_size[parser->local.usage_index] = size; parser->local.collection_index[parser->local.usage_index] = parser->collection_stack_ptr ? 
parser->collection_stack[parser->collection_stack_ptr - 1] : 0; parser->local.usage_index++; return 0; } /* * Register a new field for this report. */ static int hid_add_field(struct hid_parser *parser, unsigned report_type, unsigned flags) { struct hid_report *report; struct hid_field *field; unsigned int max_buffer_size = HID_MAX_BUFFER_SIZE; unsigned int usages; unsigned int offset; unsigned int i; unsigned int application; application = hid_lookup_collection(parser, HID_COLLECTION_APPLICATION); report = hid_register_report(parser->device, report_type, parser->global.report_id, application); if (!report) { hid_err(parser->device, "hid_register_report failed\n"); return -1; } /* Handle both signed and unsigned cases properly */ if ((parser->global.logical_minimum < 0 && parser->global.logical_maximum < parser->global.logical_minimum) || (parser->global.logical_minimum >= 0 && (__u32)parser->global.logical_maximum < (__u32)parser->global.logical_minimum)) { dbg_hid("logical range invalid 0x%x 0x%x\n", parser->global.logical_minimum, parser->global.logical_maximum); return -1; } offset = report->size; report->size += parser->global.report_size * parser->global.report_count; if (parser->device->ll_driver->max_buffer_size) max_buffer_size = parser->device->ll_driver->max_buffer_size; /* Total size check: Allow for possible report index byte */ if (report->size > (max_buffer_size - 1) << 3) { hid_err(parser->device, "report is too long\n"); return -1; } if (!parser->local.usage_index) /* Ignore padding fields */ return 0; usages = max_t(unsigned, parser->local.usage_index, parser->global.report_count); field = hid_register_field(report, usages); if (!field) return 0; field->physical = hid_lookup_collection(parser, HID_COLLECTION_PHYSICAL); field->logical = hid_lookup_collection(parser, HID_COLLECTION_LOGICAL); field->application = application; for (i = 0; i < usages; i++) { unsigned j = i; /* Duplicate the last usage we parsed if we have excess values */ if (i >= parser->local.usage_index) j = parser->local.usage_index - 1; field->usage[i].hid = parser->local.usage[j]; field->usage[i].collection_index = parser->local.collection_index[j]; field->usage[i].usage_index = i; field->usage[i].resolution_multiplier = 1; } field->maxusage = usages; field->flags = flags; field->report_offset = offset; field->report_type = report_type; field->report_size = parser->global.report_size; field->report_count = parser->global.report_count; field->logical_minimum = parser->global.logical_minimum; field->logical_maximum = parser->global.logical_maximum; field->physical_minimum = parser->global.physical_minimum; field->physical_maximum = parser->global.physical_maximum; field->unit_exponent = parser->global.unit_exponent; field->unit = parser->global.unit; return 0; } /* * Read data value from item. */ static u32 item_udata(struct hid_item *item) { switch (item->size) { case 1: return item->data.u8; case 2: return item->data.u16; case 4: return item->data.u32; } return 0; } static s32 item_sdata(struct hid_item *item) { switch (item->size) { case 1: return item->data.s8; case 2: return item->data.s16; case 4: return item->data.s32; } return 0; } /* * Process a global item. 
*/ static int hid_parser_global(struct hid_parser *parser, struct hid_item *item) { __s32 raw_value; switch (item->tag) { case HID_GLOBAL_ITEM_TAG_PUSH: if (parser->global_stack_ptr == HID_GLOBAL_STACK_SIZE) { hid_err(parser->device, "global environment stack overflow\n"); return -1; } memcpy(parser->global_stack + parser->global_stack_ptr++, &parser->global, sizeof(struct hid_global)); return 0; case HID_GLOBAL_ITEM_TAG_POP: if (!parser->global_stack_ptr) { hid_err(parser->device, "global environment stack underflow\n"); return -1; } memcpy(&parser->global, parser->global_stack + --parser->global_stack_ptr, sizeof(struct hid_global)); return 0; case HID_GLOBAL_ITEM_TAG_USAGE_PAGE: parser->global.usage_page = item_udata(item); return 0; case HID_GLOBAL_ITEM_TAG_LOGICAL_MINIMUM: parser->global.logical_minimum = item_sdata(item); return 0; case HID_GLOBAL_ITEM_TAG_LOGICAL_MAXIMUM: if (parser->global.logical_minimum < 0) parser->global.logical_maximum = item_sdata(item); else parser->global.logical_maximum = item_udata(item); return 0; case HID_GLOBAL_ITEM_TAG_PHYSICAL_MINIMUM: parser->global.physical_minimum = item_sdata(item); return 0; case HID_GLOBAL_ITEM_TAG_PHYSICAL_MAXIMUM: if (parser->global.physical_minimum < 0) parser->global.physical_maximum = item_sdata(item); else parser->global.physical_maximum = item_udata(item); return 0; case HID_GLOBAL_ITEM_TAG_UNIT_EXPONENT: /* Many devices provide unit exponent as a two's complement * nibble due to the common misunderstanding of HID * specification 1.11, 6.2.2.7 Global Items. Attempt to handle * both this and the standard encoding. */ raw_value = item_sdata(item); if (!(raw_value & 0xfffffff0)) parser->global.unit_exponent = snto32(raw_value, 4); else parser->global.unit_exponent = raw_value; return 0; case HID_GLOBAL_ITEM_TAG_UNIT: parser->global.unit = item_udata(item); return 0; case HID_GLOBAL_ITEM_TAG_REPORT_SIZE: parser->global.report_size = item_udata(item); if (parser->global.report_size > 256) { hid_err(parser->device, "invalid report_size %d\n", parser->global.report_size); return -1; } return 0; case HID_GLOBAL_ITEM_TAG_REPORT_COUNT: parser->global.report_count = item_udata(item); if (parser->global.report_count > HID_MAX_USAGES) { hid_err(parser->device, "invalid report_count %d\n", parser->global.report_count); return -1; } return 0; case HID_GLOBAL_ITEM_TAG_REPORT_ID: parser->global.report_id = item_udata(item); if (parser->global.report_id == 0 || parser->global.report_id >= HID_MAX_IDS) { hid_err(parser->device, "report_id %u is invalid\n", parser->global.report_id); return -1; } return 0; default: hid_err(parser->device, "unknown global tag 0x%x\n", item->tag); return -1; } } /* * Process a local item. */ static int hid_parser_local(struct hid_parser *parser, struct hid_item *item) { __u32 data; unsigned n; __u32 count; data = item_udata(item); switch (item->tag) { case HID_LOCAL_ITEM_TAG_DELIMITER: if (data) { /* * We treat items before the first delimiter * as global to all usage sets (branch 0). * In the moment we process only these global * items and the first delimiter set. 
*/ if (parser->local.delimiter_depth != 0) { hid_err(parser->device, "nested delimiters\n"); return -1; } parser->local.delimiter_depth++; parser->local.delimiter_branch++; } else { if (parser->local.delimiter_depth < 1) { hid_err(parser->device, "bogus close delimiter\n"); return -1; } parser->local.delimiter_depth--; } return 0; case HID_LOCAL_ITEM_TAG_USAGE: if (parser->local.delimiter_branch > 1) { dbg_hid("alternative usage ignored\n"); return 0; } return hid_add_usage(parser, data, item->size); case HID_LOCAL_ITEM_TAG_USAGE_MINIMUM: if (parser->local.delimiter_branch > 1) { dbg_hid("alternative usage ignored\n"); return 0; } parser->local.usage_minimum = data; return 0; case HID_LOCAL_ITEM_TAG_USAGE_MAXIMUM: if (parser->local.delimiter_branch > 1) { dbg_hid("alternative usage ignored\n"); return 0; } count = data - parser->local.usage_minimum; if (count + parser->local.usage_index >= HID_MAX_USAGES) { /* * We do not warn if the name is not set, we are * actually pre-scanning the device. */ if (dev_name(&parser->device->dev)) hid_warn(parser->device, "ignoring exceeding usage max\n"); data = HID_MAX_USAGES - parser->local.usage_index + parser->local.usage_minimum - 1; if (data <= 0) { hid_err(parser->device, "no more usage index available\n"); return -1; } } for (n = parser->local.usage_minimum; n <= data; n++) if (hid_add_usage(parser, n, item->size)) { dbg_hid("hid_add_usage failed\n"); return -1; } return 0; default: dbg_hid("unknown local item tag 0x%x\n", item->tag); return 0; } return 0; } /* * Concatenate Usage Pages into Usages where relevant: * As per specification, 6.2.2.8: "When the parser encounters a main item it * concatenates the last declared Usage Page with a Usage to form a complete * usage value." */ static void hid_concatenate_last_usage_page(struct hid_parser *parser) { int i; unsigned int usage_page; unsigned int current_page; if (!parser->local.usage_index) return; usage_page = parser->global.usage_page; /* * Concatenate usage page again only if last declared Usage Page * has not been already used in previous usages concatenation */ for (i = parser->local.usage_index - 1; i >= 0; i--) { if (parser->local.usage_size[i] > 2) /* Ignore extended usages */ continue; current_page = parser->local.usage[i] >> 16; if (current_page == usage_page) break; complete_usage(parser, i); } } /* * Process a main item. */ static int hid_parser_main(struct hid_parser *parser, struct hid_item *item) { __u32 data; int ret; hid_concatenate_last_usage_page(parser); data = item_udata(item); switch (item->tag) { case HID_MAIN_ITEM_TAG_BEGIN_COLLECTION: ret = open_collection(parser, data & 0xff); break; case HID_MAIN_ITEM_TAG_END_COLLECTION: ret = close_collection(parser); break; case HID_MAIN_ITEM_TAG_INPUT: ret = hid_add_field(parser, HID_INPUT_REPORT, data); break; case HID_MAIN_ITEM_TAG_OUTPUT: ret = hid_add_field(parser, HID_OUTPUT_REPORT, data); break; case HID_MAIN_ITEM_TAG_FEATURE: ret = hid_add_field(parser, HID_FEATURE_REPORT, data); break; default: hid_warn(parser->device, "unknown main item tag 0x%x\n", item->tag); ret = 0; } memset(&parser->local, 0, sizeof(parser->local)); /* Reset the local parser environment */ return ret; } /* * Process a reserved item. */ static int hid_parser_reserved(struct hid_parser *parser, struct hid_item *item) { dbg_hid("reserved item type, tag 0x%x\n", item->tag); return 0; } /* * Free a report and all registered fields. The field->usage and * field->value table's are allocated behind the field, so we need * only to free(field) itself. 
*/ static void hid_free_report(struct hid_report *report) { unsigned n; kfree(report->field_entries); for (n = 0; n < report->maxfield; n++) kvfree(report->field[n]); kfree(report); } /* * Close report. This function returns the device * state to the point prior to hid_open_report(). */ static void hid_close_report(struct hid_device *device) { unsigned i, j; for (i = 0; i < HID_REPORT_TYPES; i++) { struct hid_report_enum *report_enum = device->report_enum + i; for (j = 0; j < HID_MAX_IDS; j++) { struct hid_report *report = report_enum->report_id_hash[j]; if (report) hid_free_report(report); } memset(report_enum, 0, sizeof(*report_enum)); INIT_LIST_HEAD(&report_enum->report_list); } /* * If the HID driver had a rdesc_fixup() callback, dev->rdesc * will be allocated by hid-core and needs to be freed. * Otherwise, it is either equal to dev_rdesc or bpf_rdesc, in * which cases it'll be freed later on device removal or destroy. */ if (device->rdesc != device->dev_rdesc && device->rdesc != device->bpf_rdesc) kfree(device->rdesc); device->rdesc = NULL; device->rsize = 0; kfree(device->collection); device->collection = NULL; device->collection_size = 0; device->maxcollection = 0; device->maxapplication = 0; device->status &= ~HID_STAT_PARSED; } static inline void hid_free_bpf_rdesc(struct hid_device *hdev) { /* bpf_rdesc is either equal to dev_rdesc or allocated by call_hid_bpf_rdesc_fixup() */ if (hdev->bpf_rdesc != hdev->dev_rdesc) kfree(hdev->bpf_rdesc); hdev->bpf_rdesc = NULL; } /* * Free a device structure, all reports, and all fields. */ void hiddev_free(struct kref *ref) { struct hid_device *hid = container_of(ref, struct hid_device, ref); hid_close_report(hid); hid_free_bpf_rdesc(hid); kfree(hid->dev_rdesc); kfree(hid); } static void hid_device_release(struct device *dev) { struct hid_device *hid = to_hid_device(dev); kref_put(&hid->ref, hiddev_free); } /* * Fetch a report description item from the data stream. We support long * items, though they are not used yet. 
*/ static const u8 *fetch_item(const __u8 *start, const __u8 *end, struct hid_item *item) { u8 b; if ((end - start) <= 0) return NULL; b = *start++; item->type = (b >> 2) & 3; item->tag = (b >> 4) & 15; if (item->tag == HID_ITEM_TAG_LONG) { item->format = HID_ITEM_FORMAT_LONG; if ((end - start) < 2) return NULL; item->size = *start++; item->tag = *start++; if ((end - start) < item->size) return NULL; item->data.longdata = start; start += item->size; return start; } item->format = HID_ITEM_FORMAT_SHORT; item->size = BIT(b & 3) >> 1; /* 0, 1, 2, 3 -> 0, 1, 2, 4 */ if (end - start < item->size) return NULL; switch (item->size) { case 0: break; case 1: item->data.u8 = *start; break; case 2: item->data.u16 = get_unaligned_le16(start); break; case 4: item->data.u32 = get_unaligned_le32(start); break; } return start + item->size; } static void hid_scan_input_usage(struct hid_parser *parser, u32 usage) { struct hid_device *hid = parser->device; if (usage == HID_DG_CONTACTID) hid->group = HID_GROUP_MULTITOUCH; } static void hid_scan_feature_usage(struct hid_parser *parser, u32 usage) { if (usage == 0xff0000c5 && parser->global.report_count == 256 && parser->global.report_size == 8) parser->scan_flags |= HID_SCAN_FLAG_MT_WIN_8; if (usage == 0xff0000c6 && parser->global.report_count == 1 && parser->global.report_size == 8) parser->scan_flags |= HID_SCAN_FLAG_MT_WIN_8; } static void hid_scan_collection(struct hid_parser *parser, unsigned type) { struct hid_device *hid = parser->device; int i; if (((parser->global.usage_page << 16) == HID_UP_SENSOR) && (type == HID_COLLECTION_PHYSICAL || type == HID_COLLECTION_APPLICATION)) hid->group = HID_GROUP_SENSOR_HUB; if (hid->vendor == USB_VENDOR_ID_MICROSOFT && hid->product == USB_DEVICE_ID_MS_POWER_COVER && hid->group == HID_GROUP_MULTITOUCH) hid->group = HID_GROUP_GENERIC; if ((parser->global.usage_page << 16) == HID_UP_GENDESK) for (i = 0; i < parser->local.usage_index; i++) if (parser->local.usage[i] == HID_GD_POINTER) parser->scan_flags |= HID_SCAN_FLAG_GD_POINTER; if ((parser->global.usage_page << 16) >= HID_UP_MSVENDOR) parser->scan_flags |= HID_SCAN_FLAG_VENDOR_SPECIFIC; if ((parser->global.usage_page << 16) == HID_UP_GOOGLEVENDOR) for (i = 0; i < parser->local.usage_index; i++) if (parser->local.usage[i] == (HID_UP_GOOGLEVENDOR | 0x0001)) parser->device->group = HID_GROUP_VIVALDI; } static int hid_scan_main(struct hid_parser *parser, struct hid_item *item) { __u32 data; int i; hid_concatenate_last_usage_page(parser); data = item_udata(item); switch (item->tag) { case HID_MAIN_ITEM_TAG_BEGIN_COLLECTION: hid_scan_collection(parser, data & 0xff); break; case HID_MAIN_ITEM_TAG_END_COLLECTION: break; case HID_MAIN_ITEM_TAG_INPUT: /* ignore constant inputs, they will be ignored by hid-input */ if (data & HID_MAIN_ITEM_CONSTANT) break; for (i = 0; i < parser->local.usage_index; i++) hid_scan_input_usage(parser, parser->local.usage[i]); break; case HID_MAIN_ITEM_TAG_OUTPUT: break; case HID_MAIN_ITEM_TAG_FEATURE: for (i = 0; i < parser->local.usage_index; i++) hid_scan_feature_usage(parser, parser->local.usage[i]); break; } /* Reset the local parser environment */ memset(&parser->local, 0, sizeof(parser->local)); return 0; } /* * Scan a report descriptor before the device is added to the bus. * Sets device groups and other properties that determine what driver * to load. 
*/ static int hid_scan_report(struct hid_device *hid) { struct hid_parser *parser; struct hid_item item; const __u8 *start = hid->dev_rdesc; const __u8 *end = start + hid->dev_rsize; static int (*dispatch_type[])(struct hid_parser *parser, struct hid_item *item) = { hid_scan_main, hid_parser_global, hid_parser_local, hid_parser_reserved }; parser = vzalloc(sizeof(struct hid_parser)); if (!parser) return -ENOMEM; parser->device = hid; hid->group = HID_GROUP_GENERIC; /* * The parsing is simpler than the one in hid_open_report() as we should * be robust against hid errors. Those errors will be raised by * hid_open_report() anyway. */ while ((start = fetch_item(start, end, &item)) != NULL) dispatch_type[item.type](parser, &item); /* * Handle special flags set during scanning. */ if ((parser->scan_flags & HID_SCAN_FLAG_MT_WIN_8) && (hid->group == HID_GROUP_MULTITOUCH)) hid->group = HID_GROUP_MULTITOUCH_WIN_8; /* * Vendor specific handlings */ switch (hid->vendor) { case USB_VENDOR_ID_WACOM: hid->group = HID_GROUP_WACOM; break; case USB_VENDOR_ID_SYNAPTICS: if (hid->group == HID_GROUP_GENERIC) if ((parser->scan_flags & HID_SCAN_FLAG_VENDOR_SPECIFIC) && (parser->scan_flags & HID_SCAN_FLAG_GD_POINTER)) /* * hid-rmi should take care of them, * not hid-generic */ hid->group = HID_GROUP_RMI; break; } kfree(parser->collection_stack); vfree(parser); return 0; } /** * hid_parse_report - parse device report * * @hid: hid device * @start: report start * @size: report size * * Allocate the device report as read by the bus driver. This function should * only be called from parse() in ll drivers. */ int hid_parse_report(struct hid_device *hid, const __u8 *start, unsigned size) { hid->dev_rdesc = kmemdup(start, size, GFP_KERNEL); if (!hid->dev_rdesc) return -ENOMEM; hid->dev_rsize = size; return 0; } EXPORT_SYMBOL_GPL(hid_parse_report); static const char * const hid_report_names[] = { "HID_INPUT_REPORT", "HID_OUTPUT_REPORT", "HID_FEATURE_REPORT", }; /** * hid_validate_values - validate existing device report's value indexes * * @hid: hid device * @type: which report type to examine * @id: which report ID to examine (0 for first) * @field_index: which report field to examine * @report_counts: expected number of values * * Validate the number of values in a given field of a given report, after * parsing. */ struct hid_report *hid_validate_values(struct hid_device *hid, enum hid_report_type type, unsigned int id, unsigned int field_index, unsigned int report_counts) { struct hid_report *report; if (type > HID_FEATURE_REPORT) { hid_err(hid, "invalid HID report type %u\n", type); return NULL; } if (id >= HID_MAX_IDS) { hid_err(hid, "invalid HID report id %u\n", id); return NULL; } /* * Explicitly not using hid_get_report() here since it depends on * ->numbered being checked, which may not always be the case when * drivers go to access report values. */ if (id == 0) { /* * Validating on id 0 means we should examine the first * report in the list. 
*/ report = list_first_entry_or_null( &hid->report_enum[type].report_list, struct hid_report, list); } else { report = hid->report_enum[type].report_id_hash[id]; } if (!report) { hid_err(hid, "missing %s %u\n", hid_report_names[type], id); return NULL; } if (report->maxfield <= field_index) { hid_err(hid, "not enough fields in %s %u\n", hid_report_names[type], id); return NULL; } if (report->field[field_index]->report_count < report_counts) { hid_err(hid, "not enough values in %s %u field %u\n", hid_report_names[type], id, field_index); return NULL; } return report; } EXPORT_SYMBOL_GPL(hid_validate_values); static int hid_calculate_multiplier(struct hid_device *hid, struct hid_field *multiplier) { int m; __s32 v = *multiplier->value; __s32 lmin = multiplier->logical_minimum; __s32 lmax = multiplier->logical_maximum; __s32 pmin = multiplier->physical_minimum; __s32 pmax = multiplier->physical_maximum; /* * "Because OS implementations will generally divide the control's * reported count by the Effective Resolution Multiplier, designers * should take care not to establish a potential Effective * Resolution Multiplier of zero." * HID Usage Table, v1.12, Section 4.3.1, p31 */ if (lmax - lmin == 0) return 1; /* * Handling the unit exponent is left as an exercise to whoever * finds a device where that exponent is not 0. */ m = ((v - lmin)/(lmax - lmin) * (pmax - pmin) + pmin); if (unlikely(multiplier->unit_exponent != 0)) { hid_warn(hid, "unsupported Resolution Multiplier unit exponent %d\n", multiplier->unit_exponent); } /* There are no devices with an effective multiplier > 255 */ if (unlikely(m == 0 || m > 255 || m < -255)) { hid_warn(hid, "unsupported Resolution Multiplier %d\n", m); m = 1; } return m; } static void hid_apply_multiplier_to_field(struct hid_device *hid, struct hid_field *field, struct hid_collection *multiplier_collection, int effective_multiplier) { struct hid_collection *collection; struct hid_usage *usage; int i; /* * If multiplier_collection is NULL, the multiplier applies * to all fields in the report. * Otherwise, it is the Logical Collection the multiplier applies to * but our field may be in a subcollection of that collection. */ for (i = 0; i < field->maxusage; i++) { usage = &field->usage[i]; collection = &hid->collection[usage->collection_index]; while (collection->parent_idx != -1 && collection != multiplier_collection) collection = &hid->collection[collection->parent_idx]; if (collection->parent_idx != -1 || multiplier_collection == NULL) usage->resolution_multiplier = effective_multiplier; } } static void hid_apply_multiplier(struct hid_device *hid, struct hid_field *multiplier) { struct hid_report_enum *rep_enum; struct hid_report *rep; struct hid_field *field; struct hid_collection *multiplier_collection; int effective_multiplier; int i; /* * "The Resolution Multiplier control must be contained in the same * Logical Collection as the control(s) to which it is to be applied. * If no Resolution Multiplier is defined, then the Resolution * Multiplier defaults to 1. If more than one control exists in a * Logical Collection, the Resolution Multiplier is associated with * all controls in the collection. If no Logical Collection is * defined, the Resolution Multiplier is associated with all * controls in the report." * HID Usage Table, v1.12, Section 4.3.1, p30 * * Thus, search from the current collection upwards until we find a * logical collection. Then search all fields for that same parent * collection. Those are the fields the multiplier applies to. 
* * If we have more than one multiplier, it will overwrite the * applicable fields later. */ multiplier_collection = &hid->collection[multiplier->usage->collection_index]; while (multiplier_collection->parent_idx != -1 && multiplier_collection->type != HID_COLLECTION_LOGICAL) multiplier_collection = &hid->collection[multiplier_collection->parent_idx]; effective_multiplier = hid_calculate_multiplier(hid, multiplier); rep_enum = &hid->report_enum[HID_INPUT_REPORT]; list_for_each_entry(rep, &rep_enum->report_list, list) { for (i = 0; i < rep->maxfield; i++) { field = rep->field[i]; hid_apply_multiplier_to_field(hid, field, multiplier_collection, effective_multiplier); } } } /* * hid_setup_resolution_multiplier - set up all resolution multipliers * * @device: hid device * * Search for all Resolution Multiplier Feature Reports and apply their * value to all matching Input items. This only updates the internal struct * fields. * * The Resolution Multiplier is applied by the hardware. If the multiplier * is anything other than 1, the hardware will send pre-multiplied events * so that the same physical interaction generates an accumulated * accumulated_value = value * * multiplier * This may be achieved by sending * - "value * multiplier" for each event, or * - "value" but "multiplier" times as frequently, or * - a combination of the above * The only guarantee is that the same physical interaction always generates * an accumulated 'value * multiplier'. * * This function must be called before any event processing and after * any SetRequest to the Resolution Multiplier. */ void hid_setup_resolution_multiplier(struct hid_device *hid) { struct hid_report_enum *rep_enum; struct hid_report *rep; struct hid_usage *usage; int i, j; rep_enum = &hid->report_enum[HID_FEATURE_REPORT]; list_for_each_entry(rep, &rep_enum->report_list, list) { for (i = 0; i < rep->maxfield; i++) { /* Ignore if report count is out of bounds. */ if (rep->field[i]->report_count < 1) continue; for (j = 0; j < rep->field[i]->maxusage; j++) { usage = &rep->field[i]->usage[j]; if (usage->hid == HID_GD_RESOLUTION_MULTIPLIER) hid_apply_multiplier(hid, rep->field[i]); } } } } EXPORT_SYMBOL_GPL(hid_setup_resolution_multiplier); /** * hid_open_report - open a driver-specific device report * * @device: hid device * * Parse a report description into a hid_device structure. Reports are * enumerated, fields are attached to these reports. * 0 returned on success, otherwise nonzero error value. * * This function (or the equivalent hid_parse() macro) should only be * called from probe() in drivers, before starting the device. */ int hid_open_report(struct hid_device *device) { struct hid_parser *parser; struct hid_item item; unsigned int size; const __u8 *start; const __u8 *end; const __u8 *next; int ret; int i; static int (*dispatch_type[])(struct hid_parser *parser, struct hid_item *item) = { hid_parser_main, hid_parser_global, hid_parser_local, hid_parser_reserved }; if (WARN_ON(device->status & HID_STAT_PARSED)) return -EBUSY; start = device->bpf_rdesc; if (WARN_ON(!start)) return -ENODEV; size = device->bpf_rsize; if (device->driver->report_fixup) { /* * device->driver->report_fixup() needs to work * on a copy of our report descriptor so it can * change it. 
*/ __u8 *buf = kmemdup(start, size, GFP_KERNEL); if (buf == NULL) return -ENOMEM; start = device->driver->report_fixup(device, buf, &size); /* * The second kmemdup is required in case report_fixup() returns * a static read-only memory, but we have no idea if that memory * needs to be cleaned up or not at the end. */ start = kmemdup(start, size, GFP_KERNEL); kfree(buf); if (start == NULL) return -ENOMEM; } device->rdesc = start; device->rsize = size; parser = vzalloc(sizeof(struct hid_parser)); if (!parser) { ret = -ENOMEM; goto alloc_err; } parser->device = device; end = start + size; device->collection = kcalloc(HID_DEFAULT_NUM_COLLECTIONS, sizeof(struct hid_collection), GFP_KERNEL); if (!device->collection) { ret = -ENOMEM; goto err; } device->collection_size = HID_DEFAULT_NUM_COLLECTIONS; for (i = 0; i < HID_DEFAULT_NUM_COLLECTIONS; i++) device->collection[i].parent_idx = -1; ret = -EINVAL; while ((next = fetch_item(start, end, &item)) != NULL) { start = next; if (item.format != HID_ITEM_FORMAT_SHORT) { hid_err(device, "unexpected long global item\n"); goto err; } if (dispatch_type[item.type](parser, &item)) { hid_err(device, "item %u %u %u %u parsing failed\n", item.format, (unsigned)item.size, (unsigned)item.type, (unsigned)item.tag); goto err; } if (start == end) { if (parser->collection_stack_ptr) { hid_err(device, "unbalanced collection at end of report description\n"); goto err; } if (parser->local.delimiter_depth) { hid_err(device, "unbalanced delimiter at end of report description\n"); goto err; } /* * fetch initial values in case the device's * default multiplier isn't the recommended 1 */ hid_setup_resolution_multiplier(device); kfree(parser->collection_stack); vfree(parser); device->status |= HID_STAT_PARSED; return 0; } } hid_err(device, "item fetching failed at offset %u/%u\n", size - (unsigned int)(end - start), size); err: kfree(parser->collection_stack); alloc_err: vfree(parser); hid_close_report(device); return ret; } EXPORT_SYMBOL_GPL(hid_open_report); /* * Extract/implement a data field from/to a little endian report (bit array). * * Code sort-of follows HID spec: * http://www.usb.org/developers/hidpage/HID1_11.pdf * * While the USB HID spec allows unlimited length bit fields in "report * descriptors", most devices never use more than 16 bits. * One model of UPS is claimed to report "LINEV" as a 32-bit field. * Search linux-kernel and linux-usb-devel archives for "hid-core extract". */ static u32 __extract(u8 *report, unsigned offset, int n) { unsigned int idx = offset / 8; unsigned int bit_nr = 0; unsigned int bit_shift = offset % 8; int bits_to_copy = 8 - bit_shift; u32 value = 0; u32 mask = n < 32 ? (1U << n) - 1 : ~0U; while (n > 0) { value |= ((u32)report[idx] >> bit_shift) << bit_nr; n -= bits_to_copy; bit_nr += bits_to_copy; bits_to_copy = 8; bit_shift = 0; idx++; } return value & mask; } u32 hid_field_extract(const struct hid_device *hid, u8 *report, unsigned offset, unsigned n) { if (n > 32) { hid_warn_once(hid, "%s() called with n (%d) > 32! (%s)\n", __func__, n, current->comm); n = 32; } return __extract(report, offset, n); } EXPORT_SYMBOL_GPL(hid_field_extract); /* * "implement" : set bits in a little endian bit stream. * Same concepts as "extract" (see comments above). * The data mangled in the bit stream remains in little endian * order the whole time. It make more sense to talk about * endianness of register values by considering a register * a "cached" copy of the little endian bit stream. 
*/ static void __implement(u8 *report, unsigned offset, int n, u32 value) { unsigned int idx = offset / 8; unsigned int bit_shift = offset % 8; int bits_to_set = 8 - bit_shift; while (n - bits_to_set >= 0) { report[idx] &= ~(0xff << bit_shift); report[idx] |= value << bit_shift; value >>= bits_to_set; n -= bits_to_set; bits_to_set = 8; bit_shift = 0; idx++; } /* last nibble */ if (n) { u8 bit_mask = ((1U << n) - 1); report[idx] &= ~(bit_mask << bit_shift); report[idx] |= value << bit_shift; } } static void implement(const struct hid_device *hid, u8 *report, unsigned offset, unsigned n, u32 value) { if (unlikely(n > 32)) { hid_warn(hid, "%s() called with n (%d) > 32! (%s)\n", __func__, n, current->comm); n = 32; } else if (n < 32) { u32 m = (1U << n) - 1; if (unlikely(value > m)) { hid_warn(hid, "%s() called with too large value %d (n: %d)! (%s)\n", __func__, value, n, current->comm); value &= m; } } __implement(report, offset, n, value); } /* * Search an array for a value. */ static int search(__s32 *array, __s32 value, unsigned n) { while (n--) { if (*array++ == value) return 0; } return -1; } /** * hid_match_report - check if driver's raw_event should be called * * @hid: hid device * @report: hid report to match against * * compare hid->driver->report_table->report_type to report->type */ static int hid_match_report(struct hid_device *hid, struct hid_report *report) { const struct hid_report_id *id = hid->driver->report_table; if (!id) /* NULL means all */ return 1; for (; id->report_type != HID_TERMINATOR; id++) if (id->report_type == HID_ANY_ID || id->report_type == report->type) return 1; return 0; } /** * hid_match_usage - check if driver's event should be called * * @hid: hid device * @usage: usage to match against * * compare hid->driver->usage_table->usage_{type,code} to * usage->usage_{type,code} */ static int hid_match_usage(struct hid_device *hid, struct hid_usage *usage) { const struct hid_usage_id *id = hid->driver->usage_table; if (!id) /* NULL means all */ return 1; for (; id->usage_type != HID_ANY_ID - 1; id++) if ((id->usage_hid == HID_ANY_ID || id->usage_hid == usage->hid) && (id->usage_type == HID_ANY_ID || id->usage_type == usage->type) && (id->usage_code == HID_ANY_ID || id->usage_code == usage->code)) return 1; return 0; } static void hid_process_event(struct hid_device *hid, struct hid_field *field, struct hid_usage *usage, __s32 value, int interrupt) { struct hid_driver *hdrv = hid->driver; int ret; if (!list_empty(&hid->debug_list)) hid_dump_input(hid, usage, value); if (hdrv && hdrv->event && hid_match_usage(hid, usage)) { ret = hdrv->event(hid, field, usage, value); if (ret != 0) { if (ret < 0) hid_err(hid, "%s's event failed with %d\n", hdrv->name, ret); return; } } if (hid->claimed & HID_CLAIMED_INPUT) hidinput_hid_event(hid, field, usage, value); if (hid->claimed & HID_CLAIMED_HIDDEV && interrupt && hid->hiddev_hid_event) hid->hiddev_hid_event(hid, field, usage, value); } /* * Checks if the given value is valid within this field */ static inline int hid_array_value_is_valid(struct hid_field *field, __s32 value) { __s32 min = field->logical_minimum; /* * Value needs to be between logical min and max, and * (value - min) is used as an index in the usage array. * This array is of size field->maxusage */ return value >= min && value <= field->logical_maximum && value - min < field->maxusage; } /* * Fetch the field from the data. The field content is stored for next * report processing (we do differential reporting to the layer). 
*/ static void hid_input_fetch_field(struct hid_device *hid, struct hid_field *field, __u8 *data) { unsigned n; unsigned count = field->report_count; unsigned offset = field->report_offset; unsigned size = field->report_size; __s32 min = field->logical_minimum; __s32 *value; value = field->new_value; memset(value, 0, count * sizeof(__s32)); field->ignored = false; for (n = 0; n < count; n++) { value[n] = min < 0 ? snto32(hid_field_extract(hid, data, offset + n * size, size), size) : hid_field_extract(hid, data, offset + n * size, size); /* Ignore report if ErrorRollOver */ if (!(field->flags & HID_MAIN_ITEM_VARIABLE) && hid_array_value_is_valid(field, value[n]) && field->usage[value[n] - min].hid == HID_UP_KEYBOARD + 1) { field->ignored = true; return; } } } /* * Process a received variable field. */ static void hid_input_var_field(struct hid_device *hid, struct hid_field *field, int interrupt) { unsigned int count = field->report_count; __s32 *value = field->new_value; unsigned int n; for (n = 0; n < count; n++) hid_process_event(hid, field, &field->usage[n], value[n], interrupt); memcpy(field->value, value, count * sizeof(__s32)); } /* * Process a received array field. The field content is stored for * next report processing (we do differential reporting to the layer). */ static void hid_input_array_field(struct hid_device *hid, struct hid_field *field, int interrupt) { unsigned int n; unsigned int count = field->report_count; __s32 min = field->logical_minimum; __s32 *value; value = field->new_value; /* ErrorRollOver */ if (field->ignored) return; for (n = 0; n < count; n++) { if (hid_array_value_is_valid(field, field->value[n]) && search(value, field->value[n], count)) hid_process_event(hid, field, &field->usage[field->value[n] - min], 0, interrupt); if (hid_array_value_is_valid(field, value[n]) && search(field->value, value[n], count)) hid_process_event(hid, field, &field->usage[value[n] - min], 1, interrupt); } memcpy(field->value, value, count * sizeof(__s32)); } /* * Analyse a received report, and fetch the data from it. The field * content is stored for next report processing (we do differential * reporting to the layer). */ static void hid_process_report(struct hid_device *hid, struct hid_report *report, __u8 *data, int interrupt) { unsigned int a; struct hid_field_entry *entry; struct hid_field *field; /* first retrieve all incoming values in data */ for (a = 0; a < report->maxfield; a++) hid_input_fetch_field(hid, report->field[a], data); if (!list_empty(&report->field_entry_list)) { /* INPUT_REPORT, we have a priority list of fields */ list_for_each_entry(entry, &report->field_entry_list, list) { field = entry->field; if (field->flags & HID_MAIN_ITEM_VARIABLE) hid_process_event(hid, field, &field->usage[entry->index], field->new_value[entry->index], interrupt); else hid_input_array_field(hid, field, interrupt); } /* we need to do the memcpy at the end for var items */ for (a = 0; a < report->maxfield; a++) { field = report->field[a]; if (field->flags & HID_MAIN_ITEM_VARIABLE) memcpy(field->value, field->new_value, field->report_count * sizeof(__s32)); } } else { /* FEATURE_REPORT, regular processing */ for (a = 0; a < report->maxfield; a++) { field = report->field[a]; if (field->flags & HID_MAIN_ITEM_VARIABLE) hid_input_var_field(hid, field, interrupt); else hid_input_array_field(hid, field, interrupt); } } } /* * Insert a given usage_index in a field in the list * of processed usages in the report. * * The elements of lower priority score are processed * first. 
*/ static void __hid_insert_field_entry(struct hid_device *hid, struct hid_report *report, struct hid_field_entry *entry, struct hid_field *field, unsigned int usage_index) { struct hid_field_entry *next; entry->field = field; entry->index = usage_index; entry->priority = field->usages_priorities[usage_index]; /* insert the element at the correct position */ list_for_each_entry(next, &report->field_entry_list, list) { /* * the priority of our element is strictly higher * than the next one, insert it before */ if (entry->priority > next->priority) { list_add_tail(&entry->list, &next->list); return; } } /* lowest priority score: insert at the end */ list_add_tail(&entry->list, &report->field_entry_list); } static void hid_report_process_ordering(struct hid_device *hid, struct hid_report *report) { struct hid_field *field; struct hid_field_entry *entries; unsigned int a, u, usages; unsigned int count = 0; /* count the number of individual fields in the report */ for (a = 0; a < report->maxfield; a++) { field = report->field[a]; if (field->flags & HID_MAIN_ITEM_VARIABLE) count += field->report_count; else count++; } /* allocate the memory to process the fields */ entries = kcalloc(count, sizeof(*entries), GFP_KERNEL); if (!entries) return; report->field_entries = entries; /* * walk through all fields in the report and * store them by priority order in report->field_entry_list * * - Var elements are individualized (field + usage_index) * - Arrays are taken as one, we can not chose an order for them */ usages = 0; for (a = 0; a < report->maxfield; a++) { field = report->field[a]; if (field->flags & HID_MAIN_ITEM_VARIABLE) { for (u = 0; u < field->report_count; u++) { __hid_insert_field_entry(hid, report, &entries[usages], field, u); usages++; } } else { __hid_insert_field_entry(hid, report, &entries[usages], field, 0); usages++; } } } static void hid_process_ordering(struct hid_device *hid) { struct hid_report *report; struct hid_report_enum *report_enum = &hid->report_enum[HID_INPUT_REPORT]; list_for_each_entry(report, &report_enum->report_list, list) hid_report_process_ordering(hid, report); } /* * Output the field into the report. */ static void hid_output_field(const struct hid_device *hid, struct hid_field *field, __u8 *data) { unsigned count = field->report_count; unsigned offset = field->report_offset; unsigned size = field->report_size; unsigned n; for (n = 0; n < count; n++) { if (field->logical_minimum < 0) /* signed values */ implement(hid, data, offset + n * size, size, s32ton(field->value[n], size)); else /* unsigned values */ implement(hid, data, offset + n * size, size, field->value[n]); } } /* * Compute the size of a report. */ static size_t hid_compute_report_size(struct hid_report *report) { if (report->size) return ((report->size - 1) >> 3) + 1; return 0; } /* * Create a report. 'data' has to be allocated using * hid_alloc_report_buf() so that it has proper size. 
*/ void hid_output_report(struct hid_report *report, __u8 *data) { unsigned n; if (report->id > 0) *data++ = report->id; memset(data, 0, hid_compute_report_size(report)); for (n = 0; n < report->maxfield; n++) hid_output_field(report->device, report->field[n], data); } EXPORT_SYMBOL_GPL(hid_output_report); /* * Allocator for buffer that is going to be passed to hid_output_report() */ u8 *hid_alloc_report_buf(struct hid_report *report, gfp_t flags) { /* * 7 extra bytes are necessary to achieve proper functionality * of implement() working on 8 byte chunks */ u32 len = hid_report_len(report) + 7; return kzalloc(len, flags); } EXPORT_SYMBOL_GPL(hid_alloc_report_buf); /* * Set a field value. The report this field belongs to has to be * created and transferred to the device, to set this value in the * device. */ int hid_set_field(struct hid_field *field, unsigned offset, __s32 value) { unsigned size; if (!field) return -1; size = field->report_size; hid_dump_input(field->report->device, field->usage + offset, value); if (offset >= field->report_count) { hid_err(field->report->device, "offset (%d) exceeds report_count (%d)\n", offset, field->report_count); return -1; } if (field->logical_minimum < 0) { if (value != snto32(s32ton(value, size), size)) { hid_err(field->report->device, "value %d is out of range\n", value); return -1; } } field->value[offset] = value; return 0; } EXPORT_SYMBOL_GPL(hid_set_field); struct hid_field *hid_find_field(struct hid_device *hdev, unsigned int report_type, unsigned int application, unsigned int usage) { struct list_head *report_list = &hdev->report_enum[report_type].report_list; struct hid_report *report; int i, j; list_for_each_entry(report, report_list, list) { if (report->application != application) continue; for (i = 0; i < report->maxfield; i++) { struct hid_field *field = report->field[i]; for (j = 0; j < field->maxusage; j++) { if (field->usage[j].hid == usage) return field; } } } return NULL; } EXPORT_SYMBOL_GPL(hid_find_field); static struct hid_report *hid_get_report(struct hid_report_enum *report_enum, const u8 *data) { struct hid_report *report; unsigned int n = 0; /* Normally report number is 0 */ /* Device uses numbered reports, data[0] is report number */ if (report_enum->numbered) n = *data; report = report_enum->report_id_hash[n]; if (report == NULL) dbg_hid("undefined report_id %u received\n", n); return report; } /* * Implement a generic .request() callback, using .raw_request() * DO NOT USE in hid drivers directly, but through hid_hw_request instead. 
*/ int __hid_request(struct hid_device *hid, struct hid_report *report, enum hid_class_request reqtype) { char *buf; int ret; u32 len; buf = hid_alloc_report_buf(report, GFP_KERNEL); if (!buf) return -ENOMEM; len = hid_report_len(report); if (reqtype == HID_REQ_SET_REPORT) hid_output_report(report, buf); ret = hid->ll_driver->raw_request(hid, report->id, buf, len, report->type, reqtype); if (ret < 0) { dbg_hid("unable to complete request: %d\n", ret); goto out; } if (reqtype == HID_REQ_GET_REPORT) hid_input_report(hid, report->type, buf, ret, 0); ret = 0; out: kfree(buf); return ret; } EXPORT_SYMBOL_GPL(__hid_request); int hid_report_raw_event(struct hid_device *hid, enum hid_report_type type, u8 *data, u32 size, int interrupt) { struct hid_report_enum *report_enum = hid->report_enum + type; struct hid_report *report; struct hid_driver *hdrv; int max_buffer_size = HID_MAX_BUFFER_SIZE; u32 rsize, csize = size; u8 *cdata = data; int ret = 0; report = hid_get_report(report_enum, data); if (!report) goto out; if (report_enum->numbered) { cdata++; csize--; } rsize = hid_compute_report_size(report); if (hid->ll_driver->max_buffer_size) max_buffer_size = hid->ll_driver->max_buffer_size; if (report_enum->numbered && rsize >= max_buffer_size) rsize = max_buffer_size - 1; else if (rsize > max_buffer_size) rsize = max_buffer_size; if (csize < rsize) { dbg_hid("report %d is too short, (%d < %d)\n", report->id, csize, rsize); memset(cdata + csize, 0, rsize - csize); } if ((hid->claimed & HID_CLAIMED_HIDDEV) && hid->hiddev_report_event) hid->hiddev_report_event(hid, report); if (hid->claimed & HID_CLAIMED_HIDRAW) { ret = hidraw_report_event(hid, data, size); if (ret) goto out; } if (hid->claimed != HID_CLAIMED_HIDRAW && report->maxfield) { hid_process_report(hid, report, cdata, interrupt); hdrv = hid->driver; if (hdrv && hdrv->report) hdrv->report(hid, report); } if (hid->claimed & HID_CLAIMED_INPUT) hidinput_report_event(hid, report); out: return ret; } EXPORT_SYMBOL_GPL(hid_report_raw_event); static int __hid_input_report(struct hid_device *hid, enum hid_report_type type, u8 *data, u32 size, int interrupt, u64 source, bool from_bpf, bool lock_already_taken) { struct hid_report_enum *report_enum; struct hid_driver *hdrv; struct hid_report *report; int ret = 0; if (!hid) return -ENODEV; ret = down_trylock(&hid->driver_input_lock); if (lock_already_taken && !ret) { up(&hid->driver_input_lock); return -EINVAL; } else if (!lock_already_taken && ret) { return -EBUSY; } if (!hid->driver) { ret = -ENODEV; goto unlock; } report_enum = hid->report_enum + type; hdrv = hid->driver; data = dispatch_hid_bpf_device_event(hid, type, data, &size, interrupt, source, from_bpf); if (IS_ERR(data)) { ret = PTR_ERR(data); goto unlock; } if (!size) { dbg_hid("empty report\n"); ret = -1; goto unlock; } /* Avoid unnecessary overhead if debugfs is disabled */ if (!list_empty(&hid->debug_list)) hid_dump_report(hid, type, data, size); report = hid_get_report(report_enum, data); if (!report) { ret = -1; goto unlock; } if (hdrv && hdrv->raw_event && hid_match_report(hid, report)) { ret = hdrv->raw_event(hid, report, data, size); if (ret < 0) goto unlock; } ret = hid_report_raw_event(hid, type, data, size, interrupt); unlock: if (!lock_already_taken) up(&hid->driver_input_lock); return ret; } /** * hid_input_report - report data from lower layer (usb, bt...) 
* * @hid: hid device * @type: HID report type (HID_*_REPORT) * @data: report contents * @size: size of data parameter * @interrupt: distinguish between interrupt and control transfers * * This is data entry for lower layers. */ int hid_input_report(struct hid_device *hid, enum hid_report_type type, u8 *data, u32 size, int interrupt) { return __hid_input_report(hid, type, data, size, interrupt, 0, false, /* from_bpf */ false /* lock_already_taken */); } EXPORT_SYMBOL_GPL(hid_input_report); bool hid_match_one_id(const struct hid_device *hdev, const struct hid_device_id *id) { return (id->bus == HID_BUS_ANY || id->bus == hdev->bus) && (id->group == HID_GROUP_ANY || id->group == hdev->group) && (id->vendor == HID_ANY_ID || id->vendor == hdev->vendor) && (id->product == HID_ANY_ID || id->product == hdev->product); } const struct hid_device_id *hid_match_id(const struct hid_device *hdev, const struct hid_device_id *id) { for (; id->bus; id++) if (hid_match_one_id(hdev, id)) return id; return NULL; } EXPORT_SYMBOL_GPL(hid_match_id); static const struct hid_device_id hid_hiddev_list[] = { { HID_USB_DEVICE(USB_VENDOR_ID_MGE, USB_DEVICE_ID_MGE_UPS) }, { HID_USB_DEVICE(USB_VENDOR_ID_MGE, USB_DEVICE_ID_MGE_UPS1) }, { } }; static bool hid_hiddev(struct hid_device *hdev) { return !!hid_match_id(hdev, hid_hiddev_list); } static ssize_t read_report_descriptor(struct file *filp, struct kobject *kobj, struct bin_attribute *attr, char *buf, loff_t off, size_t count) { struct device *dev = kobj_to_dev(kobj); struct hid_device *hdev = to_hid_device(dev); if (off >= hdev->rsize) return 0; if (off + count > hdev->rsize) count = hdev->rsize - off; memcpy(buf, hdev->rdesc + off, count); return count; } static ssize_t show_country(struct device *dev, struct device_attribute *attr, char *buf) { struct hid_device *hdev = to_hid_device(dev); return sprintf(buf, "%02x\n", hdev->country & 0xff); } static struct bin_attribute dev_bin_attr_report_desc = { .attr = { .name = "report_descriptor", .mode = 0444 }, .read = read_report_descriptor, .size = HID_MAX_DESCRIPTOR_SIZE, }; static const struct device_attribute dev_attr_country = { .attr = { .name = "country", .mode = 0444 }, .show = show_country, }; int hid_connect(struct hid_device *hdev, unsigned int connect_mask) { static const char *types[] = { "Device", "Pointer", "Mouse", "Device", "Joystick", "Gamepad", "Keyboard", "Keypad", "Multi-Axis Controller" }; const char *type, *bus; char buf[64] = ""; unsigned int i; int len; int ret; ret = hid_bpf_connect_device(hdev); if (ret) return ret; if (hdev->quirks & HID_QUIRK_HIDDEV_FORCE) connect_mask |= (HID_CONNECT_HIDDEV_FORCE | HID_CONNECT_HIDDEV); if (hdev->quirks & HID_QUIRK_HIDINPUT_FORCE) connect_mask |= HID_CONNECT_HIDINPUT_FORCE; if (hdev->bus != BUS_USB) connect_mask &= ~HID_CONNECT_HIDDEV; if (hid_hiddev(hdev)) connect_mask |= HID_CONNECT_HIDDEV_FORCE; if ((connect_mask & HID_CONNECT_HIDINPUT) && !hidinput_connect(hdev, connect_mask & HID_CONNECT_HIDINPUT_FORCE)) hdev->claimed |= HID_CLAIMED_INPUT; if ((connect_mask & HID_CONNECT_HIDDEV) && hdev->hiddev_connect && !hdev->hiddev_connect(hdev, connect_mask & HID_CONNECT_HIDDEV_FORCE)) hdev->claimed |= HID_CLAIMED_HIDDEV; if ((connect_mask & HID_CONNECT_HIDRAW) && !hidraw_connect(hdev)) hdev->claimed |= HID_CLAIMED_HIDRAW; if (connect_mask & HID_CONNECT_DRIVER) hdev->claimed |= HID_CLAIMED_DRIVER; /* Drivers with the ->raw_event callback set are not required to connect * to any other listener. 
*/ if (!hdev->claimed && !hdev->driver->raw_event) { hid_err(hdev, "device has no listeners, quitting\n"); return -ENODEV; } hid_process_ordering(hdev); if ((hdev->claimed & HID_CLAIMED_INPUT) && (connect_mask & HID_CONNECT_FF) && hdev->ff_init) hdev->ff_init(hdev); len = 0; if (hdev->claimed & HID_CLAIMED_INPUT) len += sprintf(buf + len, "input"); if (hdev->claimed & HID_CLAIMED_HIDDEV) len += sprintf(buf + len, "%shiddev%d", len ? "," : "", ((struct hiddev *)hdev->hiddev)->minor); if (hdev->claimed & HID_CLAIMED_HIDRAW) len += sprintf(buf + len, "%shidraw%d", len ? "," : "", ((struct hidraw *)hdev->hidraw)->minor); type = "Device"; for (i = 0; i < hdev->maxcollection; i++) { struct hid_collection *col = &hdev->collection[i]; if (col->type == HID_COLLECTION_APPLICATION && (col->usage & HID_USAGE_PAGE) == HID_UP_GENDESK && (col->usage & 0xffff) < ARRAY_SIZE(types)) { type = types[col->usage & 0xffff]; break; } } switch (hdev->bus) { case BUS_USB: bus = "USB"; break; case BUS_BLUETOOTH: bus = "BLUETOOTH"; break; case BUS_I2C: bus = "I2C"; break; case BUS_VIRTUAL: bus = "VIRTUAL"; break; case BUS_INTEL_ISHTP: case BUS_AMD_SFH: bus = "SENSOR HUB"; break; default: bus = "<UNKNOWN>"; } ret = device_create_file(&hdev->dev, &dev_attr_country); if (ret) hid_warn(hdev, "can't create sysfs country code attribute err: %d\n", ret); hid_info(hdev, "%s: %s HID v%x.%02x %s [%s] on %s\n", buf, bus, hdev->version >> 8, hdev->version & 0xff, type, hdev->name, hdev->phys); return 0; } EXPORT_SYMBOL_GPL(hid_connect); void hid_disconnect(struct hid_device *hdev) { device_remove_file(&hdev->dev, &dev_attr_country); if (hdev->claimed & HID_CLAIMED_INPUT) hidinput_disconnect(hdev); if (hdev->claimed & HID_CLAIMED_HIDDEV) hdev->hiddev_disconnect(hdev); if (hdev->claimed & HID_CLAIMED_HIDRAW) hidraw_disconnect(hdev); hdev->claimed = 0; hid_bpf_disconnect_device(hdev); } EXPORT_SYMBOL_GPL(hid_disconnect); /** * hid_hw_start - start underlying HW * @hdev: hid device * @connect_mask: which outputs to connect, see HID_CONNECT_* * * Call this in probe function *after* hid_parse. This will setup HW * buffers and start the device (if not defeirred to device open). * hid_hw_stop must be called if this was successful. */ int hid_hw_start(struct hid_device *hdev, unsigned int connect_mask) { int error; error = hdev->ll_driver->start(hdev); if (error) return error; if (connect_mask) { error = hid_connect(hdev, connect_mask); if (error) { hdev->ll_driver->stop(hdev); return error; } } return 0; } EXPORT_SYMBOL_GPL(hid_hw_start); /** * hid_hw_stop - stop underlying HW * @hdev: hid device * * This is usually called from remove function or from probe when something * failed and hid_hw_start was called already. */ void hid_hw_stop(struct hid_device *hdev) { hid_disconnect(hdev); hdev->ll_driver->stop(hdev); } EXPORT_SYMBOL_GPL(hid_hw_stop); /** * hid_hw_open - signal underlying HW to start delivering events * @hdev: hid device * * Tell underlying HW to start delivering events from the device. * This function should be called sometime after successful call * to hid_hw_start(). 
*/ int hid_hw_open(struct hid_device *hdev) { int ret; ret = mutex_lock_killable(&hdev->ll_open_lock); if (ret) return ret; if (!hdev->ll_open_count++) { ret = hdev->ll_driver->open(hdev); if (ret) hdev->ll_open_count--; } mutex_unlock(&hdev->ll_open_lock); return ret; } EXPORT_SYMBOL_GPL(hid_hw_open); /** * hid_hw_close - signal underlaying HW to stop delivering events * * @hdev: hid device * * This function indicates that we are not interested in the events * from this device anymore. Delivery of events may or may not stop, * depending on the number of users still outstanding. */ void hid_hw_close(struct hid_device *hdev) { mutex_lock(&hdev->ll_open_lock); if (!--hdev->ll_open_count) hdev->ll_driver->close(hdev); mutex_unlock(&hdev->ll_open_lock); } EXPORT_SYMBOL_GPL(hid_hw_close); /** * hid_hw_request - send report request to device * * @hdev: hid device * @report: report to send * @reqtype: hid request type */ void hid_hw_request(struct hid_device *hdev, struct hid_report *report, enum hid_class_request reqtype) { if (hdev->ll_driver->request) return hdev->ll_driver->request(hdev, report, reqtype); __hid_request(hdev, report, reqtype); } EXPORT_SYMBOL_GPL(hid_hw_request); int __hid_hw_raw_request(struct hid_device *hdev, unsigned char reportnum, __u8 *buf, size_t len, enum hid_report_type rtype, enum hid_class_request reqtype, u64 source, bool from_bpf) { unsigned int max_buffer_size = HID_MAX_BUFFER_SIZE; int ret; if (hdev->ll_driver->max_buffer_size) max_buffer_size = hdev->ll_driver->max_buffer_size; if (len < 1 || len > max_buffer_size || !buf) return -EINVAL; ret = dispatch_hid_bpf_raw_requests(hdev, reportnum, buf, len, rtype, reqtype, source, from_bpf); if (ret) return ret; return hdev->ll_driver->raw_request(hdev, reportnum, buf, len, rtype, reqtype); } /** * hid_hw_raw_request - send report request to device * * @hdev: hid device * @reportnum: report ID * @buf: in/out data to transfer * @len: length of buf * @rtype: HID report type * @reqtype: HID_REQ_GET_REPORT or HID_REQ_SET_REPORT * * Return: count of data transferred, negative if error * * Same behavior as hid_hw_request, but with raw buffers instead. 
*/ int hid_hw_raw_request(struct hid_device *hdev, unsigned char reportnum, __u8 *buf, size_t len, enum hid_report_type rtype, enum hid_class_request reqtype) { return __hid_hw_raw_request(hdev, reportnum, buf, len, rtype, reqtype, 0, false); } EXPORT_SYMBOL_GPL(hid_hw_raw_request); int __hid_hw_output_report(struct hid_device *hdev, __u8 *buf, size_t len, u64 source, bool from_bpf) { unsigned int max_buffer_size = HID_MAX_BUFFER_SIZE; int ret; if (hdev->ll_driver->max_buffer_size) max_buffer_size = hdev->ll_driver->max_buffer_size; if (len < 1 || len > max_buffer_size || !buf) return -EINVAL; ret = dispatch_hid_bpf_output_report(hdev, buf, len, source, from_bpf); if (ret) return ret; if (hdev->ll_driver->output_report) return hdev->ll_driver->output_report(hdev, buf, len); return -ENOSYS; } /** * hid_hw_output_report - send output report to device * * @hdev: hid device * @buf: raw data to transfer * @len: length of buf * * Return: count of data transferred, negative if error */ int hid_hw_output_report(struct hid_device *hdev, __u8 *buf, size_t len) { return __hid_hw_output_report(hdev, buf, len, 0, false); } EXPORT_SYMBOL_GPL(hid_hw_output_report); #ifdef CONFIG_PM int hid_driver_suspend(struct hid_device *hdev, pm_message_t state) { if (hdev->driver && hdev->driver->suspend) return hdev->driver->suspend(hdev, state); return 0; } EXPORT_SYMBOL_GPL(hid_driver_suspend); int hid_driver_reset_resume(struct hid_device *hdev) { if (hdev->driver && hdev->driver->reset_resume) return hdev->driver->reset_resume(hdev); return 0; } EXPORT_SYMBOL_GPL(hid_driver_reset_resume); int hid_driver_resume(struct hid_device *hdev) { if (hdev->driver && hdev->driver->resume) return hdev->driver->resume(hdev); return 0; } EXPORT_SYMBOL_GPL(hid_driver_resume); #endif /* CONFIG_PM */ struct hid_dynid { struct list_head list; struct hid_device_id id; }; /** * new_id_store - add a new HID device ID to this driver and re-probe devices * @drv: target device driver * @buf: buffer for scanning device ID data * @count: input size * * Adds a new dynamic hid device ID to this driver, * and causes the driver to probe for all devices again. */ static ssize_t new_id_store(struct device_driver *drv, const char *buf, size_t count) { struct hid_driver *hdrv = to_hid_driver(drv); struct hid_dynid *dynid; __u32 bus, vendor, product; unsigned long driver_data = 0; int ret; ret = sscanf(buf, "%x %x %x %lx", &bus, &vendor, &product, &driver_data); if (ret < 3) return -EINVAL; dynid = kzalloc(sizeof(*dynid), GFP_KERNEL); if (!dynid) return -ENOMEM; dynid->id.bus = bus; dynid->id.group = HID_GROUP_ANY; dynid->id.vendor = vendor; dynid->id.product = product; dynid->id.driver_data = driver_data; spin_lock(&hdrv->dyn_lock); list_add_tail(&dynid->list, &hdrv->dyn_list); spin_unlock(&hdrv->dyn_lock); ret = driver_attach(&hdrv->driver); return ret ? 
: count; } static DRIVER_ATTR_WO(new_id); static struct attribute *hid_drv_attrs[] = { &driver_attr_new_id.attr, NULL, }; ATTRIBUTE_GROUPS(hid_drv); static void hid_free_dynids(struct hid_driver *hdrv) { struct hid_dynid *dynid, *n; spin_lock(&hdrv->dyn_lock); list_for_each_entry_safe(dynid, n, &hdrv->dyn_list, list) { list_del(&dynid->list); kfree(dynid); } spin_unlock(&hdrv->dyn_lock); } const struct hid_device_id *hid_match_device(struct hid_device *hdev, struct hid_driver *hdrv) { struct hid_dynid *dynid; spin_lock(&hdrv->dyn_lock); list_for_each_entry(dynid, &hdrv->dyn_list, list) { if (hid_match_one_id(hdev, &dynid->id)) { spin_unlock(&hdrv->dyn_lock); return &dynid->id; } } spin_unlock(&hdrv->dyn_lock); return hid_match_id(hdev, hdrv->id_table); } EXPORT_SYMBOL_GPL(hid_match_device); static int hid_bus_match(struct device *dev, const struct device_driver *drv) { struct hid_driver *hdrv = to_hid_driver(drv); struct hid_device *hdev = to_hid_device(dev); return hid_match_device(hdev, hdrv) != NULL; } /** * hid_compare_device_paths - check if both devices share the same path * @hdev_a: hid device * @hdev_b: hid device * @separator: char to use as separator * * Check if two devices share the same path up to the last occurrence of * the separator char. Both paths must exist (i.e., zero-length paths * don't match). */ bool hid_compare_device_paths(struct hid_device *hdev_a, struct hid_device *hdev_b, char separator) { int n1 = strrchr(hdev_a->phys, separator) - hdev_a->phys; int n2 = strrchr(hdev_b->phys, separator) - hdev_b->phys; if (n1 != n2 || n1 <= 0 || n2 <= 0) return false; return !strncmp(hdev_a->phys, hdev_b->phys, n1); } EXPORT_SYMBOL_GPL(hid_compare_device_paths); static bool hid_check_device_match(struct hid_device *hdev, struct hid_driver *hdrv, const struct hid_device_id **id) { *id = hid_match_device(hdev, hdrv); if (!*id) return false; if (hdrv->match) return hdrv->match(hdev, hid_ignore_special_drivers); /* * hid-generic implements .match(), so we must be dealing with a * different HID driver here, and can simply check if * hid_ignore_special_drivers or HID_QUIRK_IGNORE_SPECIAL_DRIVER * are set or not. */ return !hid_ignore_special_drivers && !(hdev->quirks & HID_QUIRK_IGNORE_SPECIAL_DRIVER); } static int __hid_device_probe(struct hid_device *hdev, struct hid_driver *hdrv) { const struct hid_device_id *id; int ret; if (!hdev->bpf_rsize) { /* in case a bpf program gets detached, we need to free the old one */ hid_free_bpf_rdesc(hdev); /* keep this around so we know we called it once */ hdev->bpf_rsize = hdev->dev_rsize; /* call_hid_bpf_rdesc_fixup will always return a valid pointer */ hdev->bpf_rdesc = call_hid_bpf_rdesc_fixup(hdev, hdev->dev_rdesc, &hdev->bpf_rsize); } if (!hid_check_device_match(hdev, hdrv, &id)) return -ENODEV; hdev->devres_group_id = devres_open_group(&hdev->dev, NULL, GFP_KERNEL); if (!hdev->devres_group_id) return -ENOMEM; /* reset the quirks that has been previously set */ hdev->quirks = hid_lookup_quirk(hdev); hdev->driver = hdrv; if (hdrv->probe) { ret = hdrv->probe(hdev, id); } else { /* default probe */ ret = hid_open_report(hdev); if (!ret) ret = hid_hw_start(hdev, HID_CONNECT_DEFAULT); } /* * Note that we are not closing the devres group opened above so * even resources that were attached to the device after probe is * run are released when hid_device_remove() is executed. This is * needed as some drivers would allocate additional resources, * for example when updating firmware. 
*/ if (ret) { devres_release_group(&hdev->dev, hdev->devres_group_id); hid_close_report(hdev); hdev->driver = NULL; } return ret; } static int hid_device_probe(struct device *dev) { struct hid_device *hdev = to_hid_device(dev); struct hid_driver *hdrv = to_hid_driver(dev->driver); int ret = 0; if (down_interruptible(&hdev->driver_input_lock)) return -EINTR; hdev->io_started = false; clear_bit(ffs(HID_STAT_REPROBED), &hdev->status); if (!hdev->driver) ret = __hid_device_probe(hdev, hdrv); if (!hdev->io_started) up(&hdev->driver_input_lock); return ret; } static void hid_device_remove(struct device *dev) { struct hid_device *hdev = to_hid_device(dev); struct hid_driver *hdrv; down(&hdev->driver_input_lock); hdev->io_started = false; hdrv = hdev->driver; if (hdrv) { if (hdrv->remove) hdrv->remove(hdev); else /* default remove */ hid_hw_stop(hdev); /* Release all devres resources allocated by the driver */ devres_release_group(&hdev->dev, hdev->devres_group_id); hid_close_report(hdev); hdev->driver = NULL; } if (!hdev->io_started) up(&hdev->driver_input_lock); } static ssize_t modalias_show(struct device *dev, struct device_attribute *a, char *buf) { struct hid_device *hdev = container_of(dev, struct hid_device, dev); return scnprintf(buf, PAGE_SIZE, "hid:b%04Xg%04Xv%08Xp%08X\n", hdev->bus, hdev->group, hdev->vendor, hdev->product); } static DEVICE_ATTR_RO(modalias); static struct attribute *hid_dev_attrs[] = { &dev_attr_modalias.attr, NULL, }; static struct bin_attribute *hid_dev_bin_attrs[] = { &dev_bin_attr_report_desc, NULL }; static const struct attribute_group hid_dev_group = { .attrs = hid_dev_attrs, .bin_attrs = hid_dev_bin_attrs, }; __ATTRIBUTE_GROUPS(hid_dev); static int hid_uevent(const struct device *dev, struct kobj_uevent_env *env) { const struct hid_device *hdev = to_hid_device(dev); if (add_uevent_var(env, "HID_ID=%04X:%08X:%08X", hdev->bus, hdev->vendor, hdev->product)) return -ENOMEM; if (add_uevent_var(env, "HID_NAME=%s", hdev->name)) return -ENOMEM; if (add_uevent_var(env, "HID_PHYS=%s", hdev->phys)) return -ENOMEM; if (add_uevent_var(env, "HID_UNIQ=%s", hdev->uniq)) return -ENOMEM; if (add_uevent_var(env, "MODALIAS=hid:b%04Xg%04Xv%08Xp%08X", hdev->bus, hdev->group, hdev->vendor, hdev->product)) return -ENOMEM; return 0; } const struct bus_type hid_bus_type = { .name = "hid", .dev_groups = hid_dev_groups, .drv_groups = hid_drv_groups, .match = hid_bus_match, .probe = hid_device_probe, .remove = hid_device_remove, .uevent = hid_uevent, }; EXPORT_SYMBOL(hid_bus_type); int hid_add_device(struct hid_device *hdev) { static atomic_t id = ATOMIC_INIT(0); int ret; if (WARN_ON(hdev->status & HID_STAT_ADDED)) return -EBUSY; hdev->quirks = hid_lookup_quirk(hdev); /* we need to kill them here, otherwise they will stay allocated to * wait for coming driver */ if (hid_ignore(hdev)) return -ENODEV; /* * Check for the mandatory transport channel. */ if (!hdev->ll_driver->raw_request) { hid_err(hdev, "transport driver missing .raw_request()\n"); return -EINVAL; } /* * Read the device report descriptor once and use as template * for the driver-specific modifications. 
*/ ret = hdev->ll_driver->parse(hdev); if (ret) return ret; if (!hdev->dev_rdesc) return -ENODEV; /* * Scan generic devices for group information */ if (hid_ignore_special_drivers) { hdev->group = HID_GROUP_GENERIC; } else if (!hdev->group && !(hdev->quirks & HID_QUIRK_HAVE_SPECIAL_DRIVER)) { ret = hid_scan_report(hdev); if (ret) hid_warn(hdev, "bad device descriptor (%d)\n", ret); } hdev->id = atomic_inc_return(&id); /* XXX hack, any other cleaner solution after the driver core * is converted to allow more than 20 bytes as the device name? */ dev_set_name(&hdev->dev, "%04X:%04X:%04X.%04X", hdev->bus, hdev->vendor, hdev->product, hdev->id); hid_debug_register(hdev, dev_name(&hdev->dev)); ret = device_add(&hdev->dev); if (!ret) hdev->status |= HID_STAT_ADDED; else hid_debug_unregister(hdev); return ret; } EXPORT_SYMBOL_GPL(hid_add_device); /** * hid_allocate_device - allocate new hid device descriptor * * Allocate and initialize hid device, so that hid_destroy_device might be * used to free it. * * New hid_device pointer is returned on success, otherwise ERR_PTR encoded * error value. */ struct hid_device *hid_allocate_device(void) { struct hid_device *hdev; int ret = -ENOMEM; hdev = kzalloc(sizeof(*hdev), GFP_KERNEL); if (hdev == NULL) return ERR_PTR(ret); device_initialize(&hdev->dev); hdev->dev.release = hid_device_release; hdev->dev.bus = &hid_bus_type; device_enable_async_suspend(&hdev->dev); hid_close_report(hdev); init_waitqueue_head(&hdev->debug_wait); INIT_LIST_HEAD(&hdev->debug_list); spin_lock_init(&hdev->debug_list_lock); sema_init(&hdev->driver_input_lock, 1); mutex_init(&hdev->ll_open_lock); kref_init(&hdev->ref); ret = hid_bpf_device_init(hdev); if (ret) goto out_err; return hdev; out_err: hid_destroy_device(hdev); return ERR_PTR(ret); } EXPORT_SYMBOL_GPL(hid_allocate_device); static void hid_remove_device(struct hid_device *hdev) { if (hdev->status & HID_STAT_ADDED) { device_del(&hdev->dev); hid_debug_unregister(hdev); hdev->status &= ~HID_STAT_ADDED; } hid_free_bpf_rdesc(hdev); kfree(hdev->dev_rdesc); hdev->dev_rdesc = NULL; hdev->dev_rsize = 0; hdev->bpf_rsize = 0; } /** * hid_destroy_device - free previously allocated device * * @hdev: hid device * * If you allocate hid_device through hid_allocate_device, you should ever * free by this function. 
*/ void hid_destroy_device(struct hid_device *hdev) { hid_bpf_destroy_device(hdev); hid_remove_device(hdev); put_device(&hdev->dev); } EXPORT_SYMBOL_GPL(hid_destroy_device); static int __hid_bus_reprobe_drivers(struct device *dev, void *data) { struct hid_driver *hdrv = data; struct hid_device *hdev = to_hid_device(dev); if (hdev->driver == hdrv && !hdrv->match(hdev, hid_ignore_special_drivers) && !test_and_set_bit(ffs(HID_STAT_REPROBED), &hdev->status)) return device_reprobe(dev); return 0; } static int __hid_bus_driver_added(struct device_driver *drv, void *data) { struct hid_driver *hdrv = to_hid_driver(drv); if (hdrv->match) { bus_for_each_dev(&hid_bus_type, NULL, hdrv, __hid_bus_reprobe_drivers); } return 0; } static int __bus_removed_driver(struct device_driver *drv, void *data) { return bus_rescan_devices(&hid_bus_type); } int __hid_register_driver(struct hid_driver *hdrv, struct module *owner, const char *mod_name) { int ret; hdrv->driver.name = hdrv->name; hdrv->driver.bus = &hid_bus_type; hdrv->driver.owner = owner; hdrv->driver.mod_name = mod_name; INIT_LIST_HEAD(&hdrv->dyn_list); spin_lock_init(&hdrv->dyn_lock); ret = driver_register(&hdrv->driver); if (ret == 0) bus_for_each_drv(&hid_bus_type, NULL, NULL, __hid_bus_driver_added); return ret; } EXPORT_SYMBOL_GPL(__hid_register_driver); void hid_unregister_driver(struct hid_driver *hdrv) { driver_unregister(&hdrv->driver); hid_free_dynids(hdrv); bus_for_each_drv(&hid_bus_type, NULL, hdrv, __bus_removed_driver); } EXPORT_SYMBOL_GPL(hid_unregister_driver); int hid_check_keys_pressed(struct hid_device *hid) { struct hid_input *hidinput; int i; if (!(hid->claimed & HID_CLAIMED_INPUT)) return 0; list_for_each_entry(hidinput, &hid->inputs, list) { for (i = 0; i < BITS_TO_LONGS(KEY_MAX); i++) if (hidinput->input->key[i]) return 1; } return 0; } EXPORT_SYMBOL_GPL(hid_check_keys_pressed); #ifdef CONFIG_HID_BPF static const struct hid_ops __hid_ops = { .hid_get_report = hid_get_report, .hid_hw_raw_request = __hid_hw_raw_request, .hid_hw_output_report = __hid_hw_output_report, .hid_input_report = __hid_input_report, .owner = THIS_MODULE, .bus_type = &hid_bus_type, }; #endif static int __init hid_init(void) { int ret; ret = bus_register(&hid_bus_type); if (ret) { pr_err("can't register hid bus\n"); goto err; } #ifdef CONFIG_HID_BPF hid_ops = &__hid_ops; #endif ret = hidraw_init(); if (ret) goto err_bus; hid_debug_init(); return 0; err_bus: bus_unregister(&hid_bus_type); err: return ret; } static void __exit hid_exit(void) { #ifdef CONFIG_HID_BPF hid_ops = NULL; #endif hid_debug_exit(); hidraw_exit(); bus_unregister(&hid_bus_type); hid_quirks_exit(HID_BUS_ANY); } module_init(hid_init); module_exit(hid_exit); MODULE_AUTHOR("Andreas Gal"); MODULE_AUTHOR("Vojtech Pavlik"); MODULE_AUTHOR("Jiri Kosina"); MODULE_DESCRIPTION("HID support for Linux"); MODULE_LICENSE("GPL");
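/*
 * Illustrative sketch (not part of hid-core.c): a minimal HID driver that
 * relies on the bus matching and default probe/remove paths implemented
 * above.  The "example" names and the vendor/product IDs are hypothetical;
 * struct hid_driver, HID_USB_DEVICE, hid_open_report(), hid_hw_start(),
 * hid_hw_stop() and module_hid_driver() are the interfaces exercised by
 * hid_device_probe()/hid_device_remove() in this file.
 */
#include <linux/hid.h>
#include <linux/module.h>

#define EXAMPLE_VENDOR_ID	0x1234	/* hypothetical */
#define EXAMPLE_PRODUCT_ID	0x5678	/* hypothetical */

static const struct hid_device_id example_hid_ids[] = {
	{ HID_USB_DEVICE(EXAMPLE_VENDOR_ID, EXAMPLE_PRODUCT_ID) },
	{ }
};
MODULE_DEVICE_TABLE(hid, example_hid_ids);

static int example_hid_probe(struct hid_device *hdev,
			     const struct hid_device_id *id)
{
	int ret;

	/* Parse the report descriptor prepared by hid_add_device()... */
	ret = hid_open_report(hdev);
	if (ret)
		return ret;

	/* ...and connect the default listeners (hidinput, hidraw, ...). */
	return hid_hw_start(hdev, HID_CONNECT_DEFAULT);
}

static void example_hid_remove(struct hid_device *hdev)
{
	hid_hw_stop(hdev);
}

static struct hid_driver example_hid_driver = {
	.name		= "example-hid",
	.id_table	= example_hid_ids,
	.probe		= example_hid_probe,
	.remove		= example_hid_remove,
};
module_hid_driver(example_hid_driver);

MODULE_DESCRIPTION("Example HID driver skeleton");
MODULE_LICENSE("GPL");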
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * NetLabel NETLINK Interface
 *
 * This file defines the NETLINK interface for the NetLabel system. The
 * NetLabel system manages static and dynamic label mappings for network
 * protocols such as CIPSO and RIPSO.
 *
 * Author: Paul Moore <paul@paul-moore.com>
 */

/*
 * (c) Copyright Hewlett-Packard Development Company, L.P., 2006
 */

#include <linux/init.h>
#include <linux/types.h>
#include <linux/list.h>
#include <linux/socket.h>
#include <linux/audit.h>
#include <linux/tty.h>
#include <linux/security.h>
#include <linux/gfp.h>
#include <net/sock.h>
#include <net/netlink.h>
#include <net/genetlink.h>
#include <net/netlabel.h>
#include <asm/bug.h>

#include "netlabel_mgmt.h"
#include "netlabel_unlabeled.h"
#include "netlabel_cipso_v4.h"
#include "netlabel_calipso.h"
#include "netlabel_user.h"

/*
 * NetLabel NETLINK Setup Functions
 */

/**
 * netlbl_netlink_init - Initialize the NETLINK communication channel
 *
 * Description:
 * Call out to the NetLabel components so they can register their families and
 * commands with the Generic NETLINK mechanism. Returns zero on success and
 * non-zero on failure.
 *
 */
int __init netlbl_netlink_init(void)
{
	int ret_val;

	ret_val = netlbl_mgmt_genl_init();
	if (ret_val != 0)
		return ret_val;

	ret_val = netlbl_cipsov4_genl_init();
	if (ret_val != 0)
		return ret_val;

	ret_val = netlbl_calipso_genl_init();
	if (ret_val != 0)
		return ret_val;

	return netlbl_unlabel_genl_init();
}

/*
 * NetLabel Audit Functions
 */

/**
 * netlbl_audit_start_common - Start an audit message
 * @type: audit message type
 * @audit_info: NetLabel audit information
 *
 * Description:
 * Start an audit message using the type specified in @type and fill the audit
 * message with some fields common to all NetLabel audit messages. Returns
 * a pointer to the audit buffer on success, NULL on failure.
 *
 */
struct audit_buffer *netlbl_audit_start_common(int type,
					       struct netlbl_audit *audit_info)
{
	struct audit_buffer *audit_buf;
	char *secctx;
	u32 secctx_len;

	if (audit_enabled == AUDIT_OFF)
		return NULL;

	audit_buf = audit_log_start(audit_context(), GFP_ATOMIC, type);
	if (audit_buf == NULL)
		return NULL;

	audit_log_format(audit_buf, "netlabel: auid=%u ses=%u",
			 from_kuid(&init_user_ns, audit_info->loginuid),
			 audit_info->sessionid);

	if (lsmprop_is_set(&audit_info->prop) &&
	    security_lsmprop_to_secctx(&audit_info->prop,
				       &secctx, &secctx_len) == 0) {
		audit_log_format(audit_buf, " subj=%s", secctx);
		security_release_secctx(secctx, secctx_len);
	}

	return audit_buf;
}
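/*
 * Illustrative sketch (not part of netlabel_user.c, and assuming the includes
 * above): how a NetLabel component would typically consume
 * netlbl_audit_start_common().  The audit record type and the extra "netif="
 * field are placeholders chosen for the example; audit_log_format() and
 * audit_log_end() are the standard kernel audit interfaces used by the real
 * callers in netlabel_unlabeled.c and netlabel_mgmt.c.
 */
static void example_audit_event(struct netlbl_audit *audit_info,
				const char *dev_name, int result)
{
	struct audit_buffer *audit_buf;

	audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_STCADD,
					      audit_info);
	if (audit_buf == NULL)
		return;		/* auditing disabled or allocation failed */

	/* Append event-specific fields, then emit the record. */
	audit_log_format(audit_buf, " netif=%s res=%u",
			 dev_name, result == 0 ? 1 : 0);
	audit_log_end(audit_buf);
}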
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Portions Copyright (C) 1992 Drew Eckhardt
 */
#ifndef _LINUX_BLKDEV_H
#define _LINUX_BLKDEV_H

#include <linux/types.h>
#include <linux/blk_types.h>
#include <linux/device.h>
#include <linux/list.h>
#include <linux/llist.h>
#include <linux/minmax.h>
#include <linux/timer.h>
#include <linux/workqueue.h>
#include <linux/wait.h>
#include <linux/bio.h>
#include <linux/gfp.h>
#include <linux/kdev_t.h>
#include <linux/rcupdate.h>
#include <linux/percpu-refcount.h>
#include <linux/blkzoned.h>
#include <linux/sched.h>
#include <linux/sbitmap.h>
#include <linux/uuid.h>
#include <linux/xarray.h>
#include <linux/file.h>
#include <linux/lockdep.h>

struct module;
struct request_queue;
struct elevator_queue;
struct blk_trace;
struct request;
struct sg_io_hdr;
struct blkcg_gq;
struct blk_flush_queue;
struct kiocb;
struct pr_ops;
struct rq_qos;
struct blk_queue_stats;
struct blk_stat_callback;
struct blk_crypto_profile;

extern const struct device_type disk_type;
extern const struct device_type part_type;
extern const struct class block_class;

/*
 * Maximum number of blkcg policies allowed to be registered concurrently.
 * Defined here to simplify include dependency.
 */
#define BLKCG_MAX_POLS		6

#define DISK_MAX_PARTS			256
#define DISK_NAME_LEN			32

#define PARTITION_META_INFO_VOLNAMELTH	64
/*
 * Enough for the string representation of any kind of UUID plus NULL.
 * EFI UUID is 36 characters. MSDOS UUID is 11 characters.
 */
#define PARTITION_META_INFO_UUIDLTH	(UUID_STRING_LEN + 1)

struct partition_meta_info {
	char uuid[PARTITION_META_INFO_UUIDLTH];
	u8 volname[PARTITION_META_INFO_VOLNAMELTH];
};

/**
 * DOC: genhd capability flags
 *
 * ``GENHD_FL_REMOVABLE``: indicates that the block device gives access to
 * removable media. When set, the device remains present even when media is not
 * inserted. Shall not be set for devices which are removed entirely when the
 * media is removed.
 *
 * ``GENHD_FL_HIDDEN``: the block device is hidden; it doesn't produce events,
 * doesn't appear in sysfs, and can't be opened from userspace or using
 * blkdev_get*. Used for the underlying components of multipath devices.
 *
 * ``GENHD_FL_NO_PART``: partition support is disabled. The kernel will not
 * scan for partitions from add_disk, and users can't add partitions manually.
* */ enum { GENHD_FL_REMOVABLE = 1 << 0, GENHD_FL_HIDDEN = 1 << 1, GENHD_FL_NO_PART = 1 << 2, }; enum { DISK_EVENT_MEDIA_CHANGE = 1 << 0, /* media changed */ DISK_EVENT_EJECT_REQUEST = 1 << 1, /* eject requested */ }; enum { /* Poll even if events_poll_msecs is unset */ DISK_EVENT_FLAG_POLL = 1 << 0, /* Forward events to udev */ DISK_EVENT_FLAG_UEVENT = 1 << 1, /* Block event polling when open for exclusive write */ DISK_EVENT_FLAG_BLOCK_ON_EXCL_WRITE = 1 << 2, }; struct disk_events; struct badblocks; enum blk_integrity_checksum { BLK_INTEGRITY_CSUM_NONE = 0, BLK_INTEGRITY_CSUM_IP = 1, BLK_INTEGRITY_CSUM_CRC = 2, BLK_INTEGRITY_CSUM_CRC64 = 3, } __packed ; struct blk_integrity { unsigned char flags; enum blk_integrity_checksum csum_type; unsigned char tuple_size; unsigned char pi_offset; unsigned char interval_exp; unsigned char tag_size; }; typedef unsigned int __bitwise blk_mode_t; /* open for reading */ #define BLK_OPEN_READ ((__force blk_mode_t)(1 << 0)) /* open for writing */ #define BLK_OPEN_WRITE ((__force blk_mode_t)(1 << 1)) /* open exclusively (vs other exclusive openers */ #define BLK_OPEN_EXCL ((__force blk_mode_t)(1 << 2)) /* opened with O_NDELAY */ #define BLK_OPEN_NDELAY ((__force blk_mode_t)(1 << 3)) /* open for "writes" only for ioctls (specialy hack for floppy.c) */ #define BLK_OPEN_WRITE_IOCTL ((__force blk_mode_t)(1 << 4)) /* open is exclusive wrt all other BLK_OPEN_WRITE opens to the device */ #define BLK_OPEN_RESTRICT_WRITES ((__force blk_mode_t)(1 << 5)) /* return partition scanning errors */ #define BLK_OPEN_STRICT_SCAN ((__force blk_mode_t)(1 << 6)) struct gendisk { /* * major/first_minor/minors should not be set by any new driver, the * block core will take care of allocating them automatically. */ int major; int first_minor; int minors; char disk_name[DISK_NAME_LEN]; /* name of major driver */ unsigned short events; /* supported events */ unsigned short event_flags; /* flags related to event processing */ struct xarray part_tbl; struct block_device *part0; const struct block_device_operations *fops; struct request_queue *queue; void *private_data; struct bio_set bio_split; int flags; unsigned long state; #define GD_NEED_PART_SCAN 0 #define GD_READ_ONLY 1 #define GD_DEAD 2 #define GD_NATIVE_CAPACITY 3 #define GD_ADDED 4 #define GD_SUPPRESS_PART_SCAN 5 #define GD_OWNS_QUEUE 6 struct mutex open_mutex; /* open/close mutex */ unsigned open_partitions; /* number of open partitions */ struct backing_dev_info *bdi; struct kobject queue_kobj; /* the queue/ directory */ struct kobject *slave_dir; #ifdef CONFIG_BLOCK_HOLDER_DEPRECATED struct list_head slave_bdevs; #endif struct timer_rand_state *random; atomic_t sync_io; /* RAID */ struct disk_events *ev; #ifdef CONFIG_BLK_DEV_ZONED /* * Zoned block device information. Reads of this information must be * protected with blk_queue_enter() / blk_queue_exit(). Modifying this * information is only allowed while no requests are being processed. * See also blk_mq_freeze_queue() and blk_mq_unfreeze_queue(). 
*/ unsigned int nr_zones; unsigned int zone_capacity; unsigned int last_zone_capacity; unsigned long __rcu *conv_zones_bitmap; unsigned int zone_wplugs_hash_bits; spinlock_t zone_wplugs_lock; struct mempool_s *zone_wplugs_pool; struct hlist_head *zone_wplugs_hash; struct workqueue_struct *zone_wplugs_wq; #endif /* CONFIG_BLK_DEV_ZONED */ #if IS_ENABLED(CONFIG_CDROM) struct cdrom_device_info *cdi; #endif int node_id; struct badblocks *bb; struct lockdep_map lockdep_map; u64 diskseq; blk_mode_t open_mode; /* * Independent sector access ranges. This is always NULL for * devices that do not have multiple independent access ranges. */ struct blk_independent_access_ranges *ia_ranges; }; /** * disk_openers - returns how many openers are there for a disk * @disk: disk to check * * This returns the number of openers for a disk. Note that this value is only * stable if disk->open_mutex is held. * * Note: Due to a quirk in the block layer open code, each open partition is * only counted once even if there are multiple openers. */ static inline unsigned int disk_openers(struct gendisk *disk) { return atomic_read(&disk->part0->bd_openers); } /** * disk_has_partscan - return %true if partition scanning is enabled on a disk * @disk: disk to check * * Returns %true if partitions scanning is enabled for @disk, or %false if * partition scanning is disabled either permanently or temporarily. */ static inline bool disk_has_partscan(struct gendisk *disk) { return !(disk->flags & (GENHD_FL_NO_PART | GENHD_FL_HIDDEN)) && !test_bit(GD_SUPPRESS_PART_SCAN, &disk->state); } /* * The gendisk is refcounted by the part0 block_device, and the bd_device * therein is also used for device model presentation in sysfs. */ #define dev_to_disk(device) \ (dev_to_bdev(device)->bd_disk) #define disk_to_dev(disk) \ (&((disk)->part0->bd_device)) #if IS_REACHABLE(CONFIG_CDROM) #define disk_to_cdi(disk) ((disk)->cdi) #else #define disk_to_cdi(disk) NULL #endif static inline dev_t disk_devt(struct gendisk *disk) { return MKDEV(disk->major, disk->first_minor); } /* blk_validate_limits() validates bsize, so drivers don't usually need to */ static inline int blk_validate_block_size(unsigned long bsize) { if (bsize < 512 || bsize > PAGE_SIZE || !is_power_of_2(bsize)) return -EINVAL; return 0; } static inline bool blk_op_is_passthrough(blk_opf_t op) { op &= REQ_OP_MASK; return op == REQ_OP_DRV_IN || op == REQ_OP_DRV_OUT; } /* flags set by the driver in queue_limits.features */ typedef unsigned int __bitwise blk_features_t; /* supports a volatile write cache */ #define BLK_FEAT_WRITE_CACHE ((__force blk_features_t)(1u << 0)) /* supports passing on the FUA bit */ #define BLK_FEAT_FUA ((__force blk_features_t)(1u << 1)) /* rotational device (hard drive or floppy) */ #define BLK_FEAT_ROTATIONAL ((__force blk_features_t)(1u << 2)) /* contributes to the random number pool */ #define BLK_FEAT_ADD_RANDOM ((__force blk_features_t)(1u << 3)) /* do disk/partitions IO accounting */ #define BLK_FEAT_IO_STAT ((__force blk_features_t)(1u << 4)) /* don't modify data until writeback is done */ #define BLK_FEAT_STABLE_WRITES ((__force blk_features_t)(1u << 5)) /* always completes in submit context */ #define BLK_FEAT_SYNCHRONOUS ((__force blk_features_t)(1u << 6)) /* supports REQ_NOWAIT */ #define BLK_FEAT_NOWAIT ((__force blk_features_t)(1u << 7)) /* supports DAX */ #define BLK_FEAT_DAX ((__force blk_features_t)(1u << 8)) /* supports I/O polling */ #define BLK_FEAT_POLL ((__force blk_features_t)(1u << 9)) /* is a zoned device */ #define BLK_FEAT_ZONED 
((__force blk_features_t)(1u << 10)) /* supports PCI(e) p2p requests */ #define BLK_FEAT_PCI_P2PDMA ((__force blk_features_t)(1u << 12)) /* skip this queue in blk_mq_(un)quiesce_tagset */ #define BLK_FEAT_SKIP_TAGSET_QUIESCE ((__force blk_features_t)(1u << 13)) /* bounce all highmem pages */ #define BLK_FEAT_BOUNCE_HIGH ((__force blk_features_t)(1u << 14)) /* undocumented magic for bcache */ #define BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE \ ((__force blk_features_t)(1u << 15)) /* stacked device can/does support atomic writes */ #define BLK_FEAT_ATOMIC_WRITES_STACKED \ ((__force blk_features_t)(1u << 16)) /* * Flags automatically inherited when stacking limits. */ #define BLK_FEAT_INHERIT_MASK \ (BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA | BLK_FEAT_ROTATIONAL | \ BLK_FEAT_STABLE_WRITES | BLK_FEAT_ZONED | BLK_FEAT_BOUNCE_HIGH | \ BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE) /* internal flags in queue_limits.flags */ typedef unsigned int __bitwise blk_flags_t; /* do not send FLUSH/FUA commands despite advertising a write cache */ #define BLK_FLAG_WRITE_CACHE_DISABLED ((__force blk_flags_t)(1u << 0)) /* I/O topology is misaligned */ #define BLK_FLAG_MISALIGNED ((__force blk_flags_t)(1u << 1)) /* passthrough command IO accounting */ #define BLK_FLAG_IOSTATS_PASSTHROUGH ((__force blk_flags_t)(1u << 2)) struct queue_limits { blk_features_t features; blk_flags_t flags; unsigned long seg_boundary_mask; unsigned long virt_boundary_mask; unsigned int max_hw_sectors; unsigned int max_dev_sectors; unsigned int chunk_sectors; unsigned int max_sectors; unsigned int max_user_sectors; unsigned int max_segment_size; unsigned int physical_block_size; unsigned int logical_block_size; unsigned int alignment_offset; unsigned int io_min; unsigned int io_opt; unsigned int max_discard_sectors; unsigned int max_hw_discard_sectors; unsigned int max_user_discard_sectors; unsigned int max_secure_erase_sectors; unsigned int max_write_zeroes_sectors; unsigned int max_hw_zone_append_sectors; unsigned int max_zone_append_sectors; unsigned int discard_granularity; unsigned int discard_alignment; unsigned int zone_write_granularity; /* atomic write limits */ unsigned int atomic_write_hw_max; unsigned int atomic_write_max_sectors; unsigned int atomic_write_hw_boundary; unsigned int atomic_write_boundary_sectors; unsigned int atomic_write_hw_unit_min; unsigned int atomic_write_unit_min; unsigned int atomic_write_hw_unit_max; unsigned int atomic_write_unit_max; unsigned short max_segments; unsigned short max_integrity_segments; unsigned short max_discard_segments; unsigned int max_open_zones; unsigned int max_active_zones; /* * Drivers that set dma_alignment to less than 511 must be prepared to * handle individual bvec's that are not a multiple of a SECTOR_SIZE * due to possible offsets. */ unsigned int dma_alignment; unsigned int dma_pad_mask; struct blk_integrity integrity; }; typedef int (*report_zones_cb)(struct blk_zone *zone, unsigned int idx, void *data); #define BLK_ALL_ZONES ((unsigned int)-1) int blkdev_report_zones(struct block_device *bdev, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data); int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op, sector_t sectors, sector_t nr_sectors); int blk_revalidate_disk_zones(struct gendisk *disk); /* * Independent access ranges: struct blk_independent_access_range describes * a range of contiguous sectors that can be accessed using device command * execution resources that are independent from the resources used for * other access ranges. 
This is typically found with single-LUN multi-actuator * HDDs where each access range is served by a different set of heads. * The set of independent ranges supported by the device is defined using * struct blk_independent_access_ranges. The independent ranges must not overlap * and must include all sectors within the disk capacity (no sector holes * allowed). * For a device with multiple ranges, requests targeting sectors in different * ranges can be executed in parallel. A request can straddle an access range * boundary. */ struct blk_independent_access_range { struct kobject kobj; sector_t sector; sector_t nr_sectors; }; struct blk_independent_access_ranges { struct kobject kobj; bool sysfs_registered; unsigned int nr_ia_ranges; struct blk_independent_access_range ia_range[]; }; struct request_queue { /* * The queue owner gets to use this for whatever they like. * ll_rw_blk doesn't touch it. */ void *queuedata; struct elevator_queue *elevator; const struct blk_mq_ops *mq_ops; /* sw queues */ struct blk_mq_ctx __percpu *queue_ctx; /* * various queue flags, see QUEUE_* below */ unsigned long queue_flags; unsigned int rq_timeout; unsigned int queue_depth; refcount_t refs; /* hw dispatch queues */ unsigned int nr_hw_queues; struct xarray hctx_table; struct percpu_ref q_usage_counter; struct lock_class_key io_lock_cls_key; struct lockdep_map io_lockdep_map; struct lock_class_key q_lock_cls_key; struct lockdep_map q_lockdep_map; struct request *last_merge; spinlock_t queue_lock; int quiesce_depth; struct gendisk *disk; /* * mq queue kobject */ struct kobject *mq_kobj; struct queue_limits limits; #ifdef CONFIG_PM struct device *dev; enum rpm_status rpm_status; #endif /* * Number of contexts that have called blk_set_pm_only(). If this * counter is above zero then only RQF_PM requests are processed. */ atomic_t pm_only; struct blk_queue_stats *stats; struct rq_qos *rq_qos; struct mutex rq_qos_mutex; /* * ida allocated id for this queue. Used to index queues from * ioctx. */ int id; /* * queue settings */ unsigned long nr_requests; /* Max # of requests */ #ifdef CONFIG_BLK_INLINE_ENCRYPTION struct blk_crypto_profile *crypto_profile; struct kobject *crypto_kobject; #endif struct timer_list timeout; struct work_struct timeout_work; atomic_t nr_active_requests_shared_tags; struct blk_mq_tags *sched_shared_tags; struct list_head icq_list; #ifdef CONFIG_BLK_CGROUP DECLARE_BITMAP (blkcg_pols, BLKCG_MAX_POLS); struct blkcg_gq *root_blkg; struct list_head blkg_list; struct mutex blkcg_mutex; #endif int node; spinlock_t requeue_lock; struct list_head requeue_list; struct delayed_work requeue_work; #ifdef CONFIG_BLK_DEV_IO_TRACE struct blk_trace __rcu *blk_trace; #endif /* * for flush operations */ struct blk_flush_queue *fq; struct list_head flush_list; struct mutex sysfs_lock; struct mutex sysfs_dir_lock; struct mutex limits_lock; /* * for reusing dead hctx instance in case of updating * nr_hw_queues */ struct list_head unused_hctx_list; spinlock_t unused_hctx_lock; int mq_freeze_depth; #ifdef CONFIG_BLK_DEV_THROTTLING /* Throttle data */ struct throtl_data *td; #endif struct rcu_head rcu_head; #ifdef CONFIG_LOCKDEP struct task_struct *mq_freeze_owner; int mq_freeze_owner_depth; #endif wait_queue_head_t mq_freeze_wq; /* * Protect concurrent access to q_usage_counter by * percpu_ref_kill() and percpu_ref_reinit(). 
*/ struct mutex mq_freeze_lock; struct blk_mq_tag_set *tag_set; struct list_head tag_set_list; struct dentry *debugfs_dir; struct dentry *sched_debugfs_dir; struct dentry *rqos_debugfs_dir; /* * Serializes all debugfs metadata operations using the above dentries. */ struct mutex debugfs_mutex; bool mq_sysfs_init_done; }; /* Keep blk_queue_flag_name[] in sync with the definitions below */ enum { QUEUE_FLAG_DYING, /* queue being torn down */ QUEUE_FLAG_NOMERGES, /* disable merge attempts */ QUEUE_FLAG_SAME_COMP, /* complete on same CPU-group */ QUEUE_FLAG_FAIL_IO, /* fake timeout */ QUEUE_FLAG_NOXMERGES, /* No extended merges */ QUEUE_FLAG_SAME_FORCE, /* force complete on same CPU */ QUEUE_FLAG_INIT_DONE, /* queue is initialized */ QUEUE_FLAG_STATS, /* track IO start and completion times */ QUEUE_FLAG_REGISTERED, /* queue has been registered to a disk */ QUEUE_FLAG_QUIESCED, /* queue has been quiesced */ QUEUE_FLAG_RQ_ALLOC_TIME, /* record rq->alloc_time_ns */ QUEUE_FLAG_HCTX_ACTIVE, /* at least one blk-mq hctx is active */ QUEUE_FLAG_SQ_SCHED, /* single queue style io dispatch */ QUEUE_FLAG_MAX }; #define QUEUE_FLAG_MQ_DEFAULT (1UL << QUEUE_FLAG_SAME_COMP) void blk_queue_flag_set(unsigned int flag, struct request_queue *q); void blk_queue_flag_clear(unsigned int flag, struct request_queue *q); #define blk_queue_dying(q) test_bit(QUEUE_FLAG_DYING, &(q)->queue_flags) #define blk_queue_init_done(q) test_bit(QUEUE_FLAG_INIT_DONE, &(q)->queue_flags) #define blk_queue_nomerges(q) test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags) #define blk_queue_noxmerges(q) \ test_bit(QUEUE_FLAG_NOXMERGES, &(q)->queue_flags) #define blk_queue_nonrot(q) (!((q)->limits.features & BLK_FEAT_ROTATIONAL)) #define blk_queue_io_stat(q) ((q)->limits.features & BLK_FEAT_IO_STAT) #define blk_queue_passthrough_stat(q) \ ((q)->limits.flags & BLK_FLAG_IOSTATS_PASSTHROUGH) #define blk_queue_dax(q) ((q)->limits.features & BLK_FEAT_DAX) #define blk_queue_pci_p2pdma(q) ((q)->limits.features & BLK_FEAT_PCI_P2PDMA) #ifdef CONFIG_BLK_RQ_ALLOC_TIME #define blk_queue_rq_alloc_time(q) \ test_bit(QUEUE_FLAG_RQ_ALLOC_TIME, &(q)->queue_flags) #else #define blk_queue_rq_alloc_time(q) false #endif #define blk_noretry_request(rq) \ ((rq)->cmd_flags & (REQ_FAILFAST_DEV|REQ_FAILFAST_TRANSPORT| \ REQ_FAILFAST_DRIVER)) #define blk_queue_quiesced(q) test_bit(QUEUE_FLAG_QUIESCED, &(q)->queue_flags) #define blk_queue_pm_only(q) atomic_read(&(q)->pm_only) #define blk_queue_registered(q) test_bit(QUEUE_FLAG_REGISTERED, &(q)->queue_flags) #define blk_queue_sq_sched(q) test_bit(QUEUE_FLAG_SQ_SCHED, &(q)->queue_flags) #define blk_queue_skip_tagset_quiesce(q) \ ((q)->limits.features & BLK_FEAT_SKIP_TAGSET_QUIESCE) extern void blk_set_pm_only(struct request_queue *q); extern void blk_clear_pm_only(struct request_queue *q); #define list_entry_rq(ptr) list_entry((ptr), struct request, queuelist) #define dma_map_bvec(dev, bv, dir, attrs) \ dma_map_page_attrs(dev, (bv)->bv_page, (bv)->bv_offset, (bv)->bv_len, \ (dir), (attrs)) static inline bool queue_is_mq(struct request_queue *q) { return q->mq_ops; } #ifdef CONFIG_PM static inline enum rpm_status queue_rpm_status(struct request_queue *q) { return q->rpm_status; } #else static inline enum rpm_status queue_rpm_status(struct request_queue *q) { return RPM_ACTIVE; } #endif static inline bool blk_queue_is_zoned(struct request_queue *q) { return IS_ENABLED(CONFIG_BLK_DEV_ZONED) && (q->limits.features & BLK_FEAT_ZONED); } #ifdef CONFIG_BLK_DEV_ZONED static inline unsigned int disk_nr_zones(struct gendisk *disk) { 
return disk->nr_zones; } bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs); #else /* CONFIG_BLK_DEV_ZONED */ static inline unsigned int disk_nr_zones(struct gendisk *disk) { return 0; } static inline bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs) { return false; } #endif /* CONFIG_BLK_DEV_ZONED */ static inline unsigned int disk_zone_no(struct gendisk *disk, sector_t sector) { if (!blk_queue_is_zoned(disk->queue)) return 0; return sector >> ilog2(disk->queue->limits.chunk_sectors); } static inline unsigned int bdev_nr_zones(struct block_device *bdev) { return disk_nr_zones(bdev->bd_disk); } static inline unsigned int bdev_max_open_zones(struct block_device *bdev) { return bdev->bd_disk->queue->limits.max_open_zones; } static inline unsigned int bdev_max_active_zones(struct block_device *bdev) { return bdev->bd_disk->queue->limits.max_active_zones; } static inline unsigned int blk_queue_depth(struct request_queue *q) { if (q->queue_depth) return q->queue_depth; return q->nr_requests; } /* * default timeout for SG_IO if none specified */ #define BLK_DEFAULT_SG_TIMEOUT (60 * HZ) #define BLK_MIN_SG_TIMEOUT (7 * HZ) /* This should not be used directly - use rq_for_each_segment */ #define for_each_bio(_bio) \ for (; _bio; _bio = _bio->bi_next) int __must_check add_disk_fwnode(struct device *parent, struct gendisk *disk, const struct attribute_group **groups, struct fwnode_handle *fwnode); int __must_check device_add_disk(struct device *parent, struct gendisk *disk, const struct attribute_group **groups); static inline int __must_check add_disk(struct gendisk *disk) { return device_add_disk(NULL, disk, NULL); } void del_gendisk(struct gendisk *gp); void invalidate_disk(struct gendisk *disk); void set_disk_ro(struct gendisk *disk, bool read_only); void disk_uevent(struct gendisk *disk, enum kobject_action action); static inline u8 bdev_partno(const struct block_device *bdev) { return atomic_read(&bdev->__bd_flags) & BD_PARTNO; } static inline bool bdev_test_flag(const struct block_device *bdev, unsigned flag) { return atomic_read(&bdev->__bd_flags) & flag; } static inline void bdev_set_flag(struct block_device *bdev, unsigned flag) { atomic_or(flag, &bdev->__bd_flags); } static inline void bdev_clear_flag(struct block_device *bdev, unsigned flag) { atomic_andnot(flag, &bdev->__bd_flags); } static inline bool get_disk_ro(struct gendisk *disk) { return bdev_test_flag(disk->part0, BD_READ_ONLY) || test_bit(GD_READ_ONLY, &disk->state); } static inline bool bdev_read_only(struct block_device *bdev) { return bdev_test_flag(bdev, BD_READ_ONLY) || get_disk_ro(bdev->bd_disk); } bool set_capacity_and_notify(struct gendisk *disk, sector_t size); void disk_force_media_change(struct gendisk *disk); void bdev_mark_dead(struct block_device *bdev, bool surprise); void add_disk_randomness(struct gendisk *disk) __latent_entropy; void rand_initialize_disk(struct gendisk *disk); static inline sector_t get_start_sect(struct block_device *bdev) { return bdev->bd_start_sect; } static inline sector_t bdev_nr_sectors(struct block_device *bdev) { return bdev->bd_nr_sectors; } static inline loff_t bdev_nr_bytes(struct block_device *bdev) { return (loff_t)bdev_nr_sectors(bdev) << SECTOR_SHIFT; } static inline sector_t get_capacity(struct gendisk *disk) { return bdev_nr_sectors(disk->part0); } static inline u64 sb_bdev_nr_blocks(struct super_block *sb) { return bdev_nr_sectors(sb->s_bdev) >> (sb->s_blocksize_bits - SECTOR_SHIFT); } int bdev_disk_changed(struct gendisk *disk, bool invalidate); void 
put_disk(struct gendisk *disk); struct gendisk *__blk_alloc_disk(struct queue_limits *lim, int node, struct lock_class_key *lkclass); /** * blk_alloc_disk - allocate a gendisk structure * @lim: queue limits to be used for this disk. * @node_id: numa node to allocate on * * Allocate and pre-initialize a gendisk structure for use with BIO based * drivers. * * Returns an ERR_PTR on error, else the allocated disk. * * Context: can sleep */ #define blk_alloc_disk(lim, node_id) \ ({ \ static struct lock_class_key __key; \ \ __blk_alloc_disk(lim, node_id, &__key); \ }) int __register_blkdev(unsigned int major, const char *name, void (*probe)(dev_t devt)); #define register_blkdev(major, name) \ __register_blkdev(major, name, NULL) void unregister_blkdev(unsigned int major, const char *name); bool disk_check_media_change(struct gendisk *disk); void set_capacity(struct gendisk *disk, sector_t size); #ifdef CONFIG_BLOCK_HOLDER_DEPRECATED int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk); void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk); #else static inline int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) { return 0; } static inline void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) { } #endif /* CONFIG_BLOCK_HOLDER_DEPRECATED */ dev_t part_devt(struct gendisk *disk, u8 partno); void inc_diskseq(struct gendisk *disk); void blk_request_module(dev_t devt); extern int blk_register_queue(struct gendisk *disk); extern void blk_unregister_queue(struct gendisk *disk); void submit_bio_noacct(struct bio *bio); struct bio *bio_split_to_limits(struct bio *bio); extern int blk_lld_busy(struct request_queue *q); extern int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags); extern void blk_queue_exit(struct request_queue *q); extern void blk_sync_queue(struct request_queue *q); /* Helper to convert REQ_OP_XXX to its string format XXX */ extern const char *blk_op_str(enum req_op op); int blk_status_to_errno(blk_status_t status); blk_status_t errno_to_blk_status(int errno); const char *blk_status_to_str(blk_status_t status); /* only poll the hardware once, don't continue until a completion was found */ #define BLK_POLL_ONESHOT (1 << 0) int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags); int iocb_bio_iopoll(struct kiocb *kiocb, struct io_comp_batch *iob, unsigned int flags); static inline struct request_queue *bdev_get_queue(struct block_device *bdev) { return bdev->bd_queue; /* this is never NULL */ } /* Helper to convert BLK_ZONE_ZONE_XXX to its string format XXX */ const char *blk_zone_cond_str(enum blk_zone_cond zone_cond); static inline unsigned int bio_zone_no(struct bio *bio) { return disk_zone_no(bio->bi_bdev->bd_disk, bio->bi_iter.bi_sector); } static inline bool bio_straddles_zones(struct bio *bio) { return bio_sectors(bio) && bio_zone_no(bio) != disk_zone_no(bio->bi_bdev->bd_disk, bio_end_sector(bio) - 1); } /* * Return how much within the boundary is left to be used for I/O at a given * offset. */ static inline unsigned int blk_boundary_sectors_left(sector_t offset, unsigned int boundary_sectors) { if (unlikely(!is_power_of_2(boundary_sectors))) return boundary_sectors - sector_div(offset, boundary_sectors); return boundary_sectors - (offset & (boundary_sectors - 1)); } /** * queue_limits_start_update - start an atomic update of queue limits * @q: queue to update * * This functions starts an atomic update of the queue limits. 
It takes a lock * to prevent other updates and returns a snapshot of the current limits that * the caller can modify. The caller must call queue_limits_commit_update() * to finish the update. * * Context: process context. The caller must have frozen the queue or ensured * that there is outstanding I/O by other means. */ static inline struct queue_limits queue_limits_start_update(struct request_queue *q) { mutex_lock(&q->limits_lock); return q->limits; } int queue_limits_commit_update(struct request_queue *q, struct queue_limits *lim); int queue_limits_set(struct request_queue *q, struct queue_limits *lim); int blk_validate_limits(struct queue_limits *lim); /** * queue_limits_cancel_update - cancel an atomic update of queue limits * @q: queue to update * * This functions cancels an atomic update of the queue limits started by * queue_limits_start_update() and should be used when an error occurs after * starting update. */ static inline void queue_limits_cancel_update(struct request_queue *q) { mutex_unlock(&q->limits_lock); } /* * These helpers are for drivers that have sloppy feature negotiation and might * have to disable DISCARD, WRITE_ZEROES or SECURE_DISCARD from the I/O * completion handler when the device returned an indicator that the respective * feature is not actually supported. They are racy and the driver needs to * cope with that. Try to avoid this scheme if you can. */ static inline void blk_queue_disable_discard(struct request_queue *q) { q->limits.max_discard_sectors = 0; } static inline void blk_queue_disable_secure_erase(struct request_queue *q) { q->limits.max_secure_erase_sectors = 0; } static inline void blk_queue_disable_write_zeroes(struct request_queue *q) { q->limits.max_write_zeroes_sectors = 0; } /* * Access functions for manipulating queue properties */ extern void blk_set_queue_depth(struct request_queue *q, unsigned int depth); extern void blk_set_stacking_limits(struct queue_limits *lim); extern int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, sector_t offset); void queue_limits_stack_bdev(struct queue_limits *t, struct block_device *bdev, sector_t offset, const char *pfx); extern void blk_queue_rq_timeout(struct request_queue *, unsigned int); struct blk_independent_access_ranges * disk_alloc_independent_access_ranges(struct gendisk *disk, int nr_ia_ranges); void disk_set_independent_access_ranges(struct gendisk *disk, struct blk_independent_access_ranges *iars); bool __must_check blk_get_queue(struct request_queue *); extern void blk_put_queue(struct request_queue *); void blk_mark_disk_dead(struct gendisk *disk); struct rq_list { struct request *head; struct request *tail; }; #ifdef CONFIG_BLOCK /* * blk_plug permits building a queue of related requests by holding the I/O * fragments for a short period. This allows merging of sequential requests * into single larger request. As the requests are moved from a per-task list to * the device's request_queue in a batch, this results in improved scalability * as the lock contention for request_queue lock is reduced. * * It is ok not to disable preemption when adding the request to the plug list * or when attempting a merge. For details, please see schedule() where * blk_flush_plug() is called. 
*/ struct blk_plug { struct rq_list mq_list; /* blk-mq requests */ /* if ios_left is > 1, we can batch tag/rq allocations */ struct rq_list cached_rqs; u64 cur_ktime; unsigned short nr_ios; unsigned short rq_count; bool multiple_queues; bool has_elevator; struct list_head cb_list; /* md requires an unplug callback */ }; struct blk_plug_cb; typedef void (*blk_plug_cb_fn)(struct blk_plug_cb *, bool); struct blk_plug_cb { struct list_head list; blk_plug_cb_fn callback; void *data; }; extern struct blk_plug_cb *blk_check_plugged(blk_plug_cb_fn unplug, void *data, int size); extern void blk_start_plug(struct blk_plug *); extern void blk_start_plug_nr_ios(struct blk_plug *, unsigned short); extern void blk_finish_plug(struct blk_plug *); void __blk_flush_plug(struct blk_plug *plug, bool from_schedule); static inline void blk_flush_plug(struct blk_plug *plug, bool async) { if (plug) __blk_flush_plug(plug, async); } /* * tsk == current here */ static inline void blk_plug_invalidate_ts(struct task_struct *tsk) { struct blk_plug *plug = tsk->plug; if (plug) plug->cur_ktime = 0; current->flags &= ~PF_BLOCK_TS; } int blkdev_issue_flush(struct block_device *bdev); long nr_blockdev_pages(void); #else /* CONFIG_BLOCK */ struct blk_plug { }; static inline void blk_start_plug_nr_ios(struct blk_plug *plug, unsigned short nr_ios) { } static inline void blk_start_plug(struct blk_plug *plug) { } static inline void blk_finish_plug(struct blk_plug *plug) { } static inline void blk_flush_plug(struct blk_plug *plug, bool async) { } static inline void blk_plug_invalidate_ts(struct task_struct *tsk) { } static inline int blkdev_issue_flush(struct block_device *bdev) { return 0; } static inline long nr_blockdev_pages(void) { return 0; } #endif /* CONFIG_BLOCK */ extern void blk_io_schedule(void); int blkdev_issue_discard(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask); int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, struct bio **biop); int blkdev_issue_secure_erase(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp); #define BLKDEV_ZERO_NOUNMAP (1 << 0) /* do not free blocks */ #define BLKDEV_ZERO_NOFALLBACK (1 << 1) /* don't write explicit zeroes */ #define BLKDEV_ZERO_KILLABLE (1 << 2) /* interruptible by fatal signals */ extern int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, struct bio **biop, unsigned flags); extern int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, unsigned flags); static inline int sb_issue_discard(struct super_block *sb, sector_t block, sector_t nr_blocks, gfp_t gfp_mask, unsigned long flags) { return blkdev_issue_discard(sb->s_bdev, block << (sb->s_blocksize_bits - SECTOR_SHIFT), nr_blocks << (sb->s_blocksize_bits - SECTOR_SHIFT), gfp_mask); } static inline int sb_issue_zeroout(struct super_block *sb, sector_t block, sector_t nr_blocks, gfp_t gfp_mask) { return blkdev_issue_zeroout(sb->s_bdev, block << (sb->s_blocksize_bits - SECTOR_SHIFT), nr_blocks << (sb->s_blocksize_bits - SECTOR_SHIFT), gfp_mask, 0); } static inline bool bdev_is_partition(struct block_device *bdev) { return bdev_partno(bdev) != 0; } enum blk_default_limits { BLK_MAX_SEGMENTS = 128, BLK_SAFE_MAX_SECTORS = 255, BLK_MAX_SEGMENT_SIZE = 65536, BLK_SEG_BOUNDARY_MASK = 0xFFFFFFFFUL, }; /* * Default upper limit for the software max_sectors limit used for * regular file system I/O. 
This can be increased through sysfs. * * Not to be confused with the max_hw_sector limit that is entirely * controlled by the driver, usually based on hardware limits. */ #define BLK_DEF_MAX_SECTORS_CAP 2560u static inline struct queue_limits *bdev_limits(struct block_device *bdev) { return &bdev_get_queue(bdev)->limits; } static inline unsigned long queue_segment_boundary(const struct request_queue *q) { return q->limits.seg_boundary_mask; } static inline unsigned long queue_virt_boundary(const struct request_queue *q) { return q->limits.virt_boundary_mask; } static inline unsigned int queue_max_sectors(const struct request_queue *q) { return q->limits.max_sectors; } static inline unsigned int queue_max_bytes(struct request_queue *q) { return min_t(unsigned int, queue_max_sectors(q), INT_MAX >> 9) << 9; } static inline unsigned int queue_max_hw_sectors(const struct request_queue *q) { return q->limits.max_hw_sectors; } static inline unsigned short queue_max_segments(const struct request_queue *q) { return q->limits.max_segments; } static inline unsigned short queue_max_discard_segments(const struct request_queue *q) { return q->limits.max_discard_segments; } static inline unsigned int queue_max_segment_size(const struct request_queue *q) { return q->limits.max_segment_size; } static inline bool queue_emulates_zone_append(struct request_queue *q) { return blk_queue_is_zoned(q) && !q->limits.max_hw_zone_append_sectors; } static inline bool bdev_emulates_zone_append(struct block_device *bdev) { return queue_emulates_zone_append(bdev_get_queue(bdev)); } static inline unsigned int bdev_max_zone_append_sectors(struct block_device *bdev) { return bdev_limits(bdev)->max_zone_append_sectors; } static inline unsigned int bdev_max_segments(struct block_device *bdev) { return queue_max_segments(bdev_get_queue(bdev)); } static inline unsigned queue_logical_block_size(const struct request_queue *q) { return q->limits.logical_block_size; } static inline unsigned int bdev_logical_block_size(struct block_device *bdev) { return queue_logical_block_size(bdev_get_queue(bdev)); } static inline unsigned int queue_physical_block_size(const struct request_queue *q) { return q->limits.physical_block_size; } static inline unsigned int bdev_physical_block_size(struct block_device *bdev) { return queue_physical_block_size(bdev_get_queue(bdev)); } static inline unsigned int queue_io_min(const struct request_queue *q) { return q->limits.io_min; } static inline unsigned int bdev_io_min(struct block_device *bdev) { return queue_io_min(bdev_get_queue(bdev)); } static inline unsigned int queue_io_opt(const struct request_queue *q) { return q->limits.io_opt; } static inline unsigned int bdev_io_opt(struct block_device *bdev) { return queue_io_opt(bdev_get_queue(bdev)); } static inline unsigned int queue_zone_write_granularity(const struct request_queue *q) { return q->limits.zone_write_granularity; } static inline unsigned int bdev_zone_write_granularity(struct block_device *bdev) { return queue_zone_write_granularity(bdev_get_queue(bdev)); } int bdev_alignment_offset(struct block_device *bdev); unsigned int bdev_discard_alignment(struct block_device *bdev); static inline unsigned int bdev_max_discard_sectors(struct block_device *bdev) { return bdev_limits(bdev)->max_discard_sectors; } static inline unsigned int bdev_discard_granularity(struct block_device *bdev) { return bdev_limits(bdev)->discard_granularity; } static inline unsigned int bdev_max_secure_erase_sectors(struct block_device *bdev) { return 
bdev_limits(bdev)->max_secure_erase_sectors; } static inline unsigned int bdev_write_zeroes_sectors(struct block_device *bdev) { return bdev_limits(bdev)->max_write_zeroes_sectors; } static inline bool bdev_nonrot(struct block_device *bdev) { return blk_queue_nonrot(bdev_get_queue(bdev)); } static inline bool bdev_synchronous(struct block_device *bdev) { return bdev->bd_disk->queue->limits.features & BLK_FEAT_SYNCHRONOUS; } static inline bool bdev_stable_writes(struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) && q->limits.integrity.csum_type != BLK_INTEGRITY_CSUM_NONE) return true; return q->limits.features & BLK_FEAT_STABLE_WRITES; } static inline bool blk_queue_write_cache(struct request_queue *q) { return (q->limits.features & BLK_FEAT_WRITE_CACHE) && !(q->limits.flags & BLK_FLAG_WRITE_CACHE_DISABLED); } static inline bool bdev_write_cache(struct block_device *bdev) { return blk_queue_write_cache(bdev_get_queue(bdev)); } static inline bool bdev_fua(struct block_device *bdev) { return bdev_limits(bdev)->features & BLK_FEAT_FUA; } static inline bool bdev_nowait(struct block_device *bdev) { return bdev->bd_disk->queue->limits.features & BLK_FEAT_NOWAIT; } static inline bool bdev_is_zoned(struct block_device *bdev) { return blk_queue_is_zoned(bdev_get_queue(bdev)); } static inline unsigned int bdev_zone_no(struct block_device *bdev, sector_t sec) { return disk_zone_no(bdev->bd_disk, sec); } static inline sector_t bdev_zone_sectors(struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); if (!blk_queue_is_zoned(q)) return 0; return q->limits.chunk_sectors; } static inline sector_t bdev_offset_from_zone_start(struct block_device *bdev, sector_t sector) { return sector & (bdev_zone_sectors(bdev) - 1); } static inline sector_t bio_offset_from_zone_start(struct bio *bio) { return bdev_offset_from_zone_start(bio->bi_bdev, bio->bi_iter.bi_sector); } static inline bool bdev_is_zone_start(struct block_device *bdev, sector_t sector) { return bdev_offset_from_zone_start(bdev, sector) == 0; } /** * bdev_zone_is_seq - check if a sector belongs to a sequential write zone * @bdev: block device to check * @sector: sector number * * Check if @sector on @bdev is contained in a sequential write required zone. 
*/ static inline bool bdev_zone_is_seq(struct block_device *bdev, sector_t sector) { bool is_seq = false; #if IS_ENABLED(CONFIG_BLK_DEV_ZONED) if (bdev_is_zoned(bdev)) { struct gendisk *disk = bdev->bd_disk; unsigned long *bitmap; rcu_read_lock(); bitmap = rcu_dereference(disk->conv_zones_bitmap); is_seq = !bitmap || !test_bit(disk_zone_no(disk, sector), bitmap); rcu_read_unlock(); } #endif return is_seq; } int blk_zone_issue_zeroout(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask); static inline unsigned int queue_dma_alignment(const struct request_queue *q) { return q->limits.dma_alignment; } static inline unsigned int queue_atomic_write_unit_max_bytes(const struct request_queue *q) { return q->limits.atomic_write_unit_max; } static inline unsigned int queue_atomic_write_unit_min_bytes(const struct request_queue *q) { return q->limits.atomic_write_unit_min; } static inline unsigned int queue_atomic_write_boundary_bytes(const struct request_queue *q) { return q->limits.atomic_write_boundary_sectors << SECTOR_SHIFT; } static inline unsigned int queue_atomic_write_max_bytes(const struct request_queue *q) { return q->limits.atomic_write_max_sectors << SECTOR_SHIFT; } static inline unsigned int bdev_dma_alignment(struct block_device *bdev) { return queue_dma_alignment(bdev_get_queue(bdev)); } static inline bool bdev_iter_is_aligned(struct block_device *bdev, struct iov_iter *iter) { return iov_iter_is_aligned(iter, bdev_dma_alignment(bdev), bdev_logical_block_size(bdev) - 1); } static inline unsigned int blk_lim_dma_alignment_and_pad(struct queue_limits *lim) { return lim->dma_alignment | lim->dma_pad_mask; } static inline bool blk_rq_aligned(struct request_queue *q, unsigned long addr, unsigned int len) { unsigned int alignment = blk_lim_dma_alignment_and_pad(&q->limits); return !(addr & alignment) && !(len & alignment); } /* assumes size > 256 */ static inline unsigned int blksize_bits(unsigned int size) { return order_base_2(size >> SECTOR_SHIFT) + SECTOR_SHIFT; } int kblockd_schedule_work(struct work_struct *work); int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay); #define MODULE_ALIAS_BLOCKDEV(major,minor) \ MODULE_ALIAS("block-major-" __stringify(major) "-" __stringify(minor)) #define MODULE_ALIAS_BLOCKDEV_MAJOR(major) \ MODULE_ALIAS("block-major-" __stringify(major) "-*") #ifdef CONFIG_BLK_INLINE_ENCRYPTION bool blk_crypto_register(struct blk_crypto_profile *profile, struct request_queue *q); #else /* CONFIG_BLK_INLINE_ENCRYPTION */ static inline bool blk_crypto_register(struct blk_crypto_profile *profile, struct request_queue *q) { return true; } #endif /* CONFIG_BLK_INLINE_ENCRYPTION */ enum blk_unique_id { /* these match the Designator Types specified in SPC */ BLK_UID_T10 = 1, BLK_UID_EUI64 = 2, BLK_UID_NAA = 3, }; struct block_device_operations { void (*submit_bio)(struct bio *bio); int (*poll_bio)(struct bio *bio, struct io_comp_batch *iob, unsigned int flags); int (*open)(struct gendisk *disk, blk_mode_t mode); void (*release)(struct gendisk *disk); int (*ioctl)(struct block_device *bdev, blk_mode_t mode, unsigned cmd, unsigned long arg); int (*compat_ioctl)(struct block_device *bdev, blk_mode_t mode, unsigned cmd, unsigned long arg); unsigned int (*check_events) (struct gendisk *disk, unsigned int clearing); void (*unlock_native_capacity) (struct gendisk *); int (*getgeo)(struct block_device *, struct hd_geometry *); int (*set_read_only)(struct block_device *bdev, bool ro); void (*free_disk)(struct gendisk 
*disk); /* this callback is with swap_lock and sometimes page table lock held */ void (*swap_slot_free_notify) (struct block_device *, unsigned long); int (*report_zones)(struct gendisk *, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data); char *(*devnode)(struct gendisk *disk, umode_t *mode); /* returns the length of the identifier or a negative errno: */ int (*get_unique_id)(struct gendisk *disk, u8 id[16], enum blk_unique_id id_type); struct module *owner; const struct pr_ops *pr_ops; /* * Special callback for probing GPT entry at a given sector. * Needed by Android devices, used by GPT scanner and MMC blk * driver. */ int (*alternative_gpt_sector)(struct gendisk *disk, sector_t *sector); }; #ifdef CONFIG_COMPAT extern int blkdev_compat_ptr_ioctl(struct block_device *, blk_mode_t, unsigned int, unsigned long); #else #define blkdev_compat_ptr_ioctl NULL #endif static inline void blk_wake_io_task(struct task_struct *waiter) { /* * If we're polling, the task itself is doing the completions. For * that case, we don't need to signal a wakeup, it's enough to just * mark us as RUNNING. */ if (waiter == current) __set_current_state(TASK_RUNNING); else wake_up_process(waiter); } unsigned long bdev_start_io_acct(struct block_device *bdev, enum req_op op, unsigned long start_time); void bdev_end_io_acct(struct block_device *bdev, enum req_op op, unsigned int sectors, unsigned long start_time); unsigned long bio_start_io_acct(struct bio *bio); void bio_end_io_acct_remapped(struct bio *bio, unsigned long start_time, struct block_device *orig_bdev); /** * bio_end_io_acct - end I/O accounting for bio based drivers * @bio: bio to end account for * @start_time: start time returned by bio_start_io_acct() */ static inline void bio_end_io_acct(struct bio *bio, unsigned long start_time) { return bio_end_io_acct_remapped(bio, start_time, bio->bi_bdev); } int set_blocksize(struct file *file, int size); int lookup_bdev(const char *pathname, dev_t *dev); void blkdev_show(struct seq_file *seqf, off_t offset); #define BDEVNAME_SIZE 32 /* Largest string for a blockdev identifier */ #define BDEVT_SIZE 10 /* Largest string for MAJ:MIN for blkdev */ #ifdef CONFIG_BLOCK #define BLKDEV_MAJOR_MAX 512 #else #define BLKDEV_MAJOR_MAX 0 #endif struct blk_holder_ops { void (*mark_dead)(struct block_device *bdev, bool surprise); /* * Sync the file system mounted on the block device. */ void (*sync)(struct block_device *bdev); /* * Freeze the file system mounted on the block device. */ int (*freeze)(struct block_device *bdev); /* * Thaw the file system mounted on the block device. */ int (*thaw)(struct block_device *bdev); }; /* * For filesystems using @fs_holder_ops, the @holder argument passed to * helpers used to open and claim block devices via * bd_prepare_to_claim() must point to a superblock. */ extern const struct blk_holder_ops fs_holder_ops; /* * Return the correct open flags for blkdev_get_by_* for super block flags * as stored in sb->s_flags. */ #define sb_open_mode(flags) \ (BLK_OPEN_READ | BLK_OPEN_RESTRICT_WRITES | \ (((flags) & SB_RDONLY) ? 
0 : BLK_OPEN_WRITE)) struct file *bdev_file_open_by_dev(dev_t dev, blk_mode_t mode, void *holder, const struct blk_holder_ops *hops); struct file *bdev_file_open_by_path(const char *path, blk_mode_t mode, void *holder, const struct blk_holder_ops *hops); int bd_prepare_to_claim(struct block_device *bdev, void *holder, const struct blk_holder_ops *hops); void bd_abort_claiming(struct block_device *bdev, void *holder); /* just for blk-cgroup, don't use elsewhere */ struct block_device *blkdev_get_no_open(dev_t dev); void blkdev_put_no_open(struct block_device *bdev); struct block_device *I_BDEV(struct inode *inode); struct block_device *file_bdev(struct file *bdev_file); bool disk_live(struct gendisk *disk); unsigned int block_size(struct block_device *bdev); #ifdef CONFIG_BLOCK void invalidate_bdev(struct block_device *bdev); int sync_blockdev(struct block_device *bdev); int sync_blockdev_range(struct block_device *bdev, loff_t lstart, loff_t lend); int sync_blockdev_nowait(struct block_device *bdev); void sync_bdevs(bool wait); void bdev_statx(struct path *, struct kstat *, u32); void printk_all_partitions(void); int __init early_lookup_bdev(const char *pathname, dev_t *dev); #else static inline void invalidate_bdev(struct block_device *bdev) { } static inline int sync_blockdev(struct block_device *bdev) { return 0; } static inline int sync_blockdev_nowait(struct block_device *bdev) { return 0; } static inline void sync_bdevs(bool wait) { } static inline void bdev_statx(struct path *path, struct kstat *stat, u32 request_mask) { } static inline void printk_all_partitions(void) { } static inline int early_lookup_bdev(const char *pathname, dev_t *dev) { return -EINVAL; } #endif /* CONFIG_BLOCK */ int bdev_freeze(struct block_device *bdev); int bdev_thaw(struct block_device *bdev); void bdev_fput(struct file *bdev_file); struct io_comp_batch { struct rq_list req_list; bool need_ts; void (*complete)(struct io_comp_batch *); }; static inline bool bdev_can_atomic_write(struct block_device *bdev) { struct request_queue *bd_queue = bdev->bd_queue; struct queue_limits *limits = &bd_queue->limits; if (!limits->atomic_write_unit_min) return false; if (bdev_is_partition(bdev)) { sector_t bd_start_sect = bdev->bd_start_sect; unsigned int alignment = max(limits->atomic_write_unit_min, limits->atomic_write_hw_boundary); if (!IS_ALIGNED(bd_start_sect, alignment >> SECTOR_SHIFT)) return false; } return true; } static inline unsigned int bdev_atomic_write_unit_min_bytes(struct block_device *bdev) { if (!bdev_can_atomic_write(bdev)) return 0; return queue_atomic_write_unit_min_bytes(bdev_get_queue(bdev)); } static inline unsigned int bdev_atomic_write_unit_max_bytes(struct block_device *bdev) { if (!bdev_can_atomic_write(bdev)) return 0; return queue_atomic_write_unit_max_bytes(bdev_get_queue(bdev)); } #define DEFINE_IO_COMP_BATCH(name) struct io_comp_batch name = { } #endif /* _LINUX_BLKDEV_H */
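To make the atomic-write helpers at the end of this header more concrete, here is a minimal kernel-context sketch of how a caller might validate a candidate write against them. It relies only on bdev_can_atomic_write(), bdev_atomic_write_unit_min_bytes() and bdev_atomic_write_unit_max_bytes() from above; the function name can_issue_atomic_write() and the power-of-two/natural-alignment policy are illustrative assumptions, not something this header mandates.

#include <linux/blkdev.h>
#include <linux/log2.h>

/*
 * Hypothetical helper (sketch only): return true if a write of @len bytes
 * at byte offset @pos could plausibly be issued as a single atomic write
 * on @bdev. Real callers typically layer further filesystem-specific
 * checks on top of this.
 */
static bool can_issue_atomic_write(struct block_device *bdev, loff_t pos,
				   unsigned int len)
{
	if (!bdev_can_atomic_write(bdev))
		return false;

	/* Length must be a power of two inside the advertised window... */
	if (!is_power_of_2(len) ||
	    len < bdev_atomic_write_unit_min_bytes(bdev) ||
	    len > bdev_atomic_write_unit_max_bytes(bdev))
		return false;

	/* ...and the write must be naturally aligned to its own size. */
	return IS_ALIGNED(pos, len);
}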
// SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2023 Isovalent */ #include <linux/bpf.h> #include <linux/bpf_mprog.h> static int bpf_mprog_link(struct bpf_tuple *tuple, u32 id_or_fd, u32 flags, enum bpf_prog_type type) { struct bpf_link *link = ERR_PTR(-EINVAL); bool id = flags & BPF_F_ID; if (id) link = bpf_link_by_id(id_or_fd); else if (id_or_fd) link = bpf_link_get_from_fd(id_or_fd); if (IS_ERR(link)) return PTR_ERR(link); if (type && link->prog->type != type) { bpf_link_put(link); return -EINVAL; } tuple->link = link; tuple->prog = link->prog; return 0; } static int bpf_mprog_prog(struct bpf_tuple *tuple, u32 id_or_fd, u32 flags, enum bpf_prog_type type) { struct bpf_prog *prog = ERR_PTR(-EINVAL); bool id = flags & BPF_F_ID; if (id) prog = bpf_prog_by_id(id_or_fd); else if (id_or_fd) prog = bpf_prog_get(id_or_fd); if (IS_ERR(prog)) return PTR_ERR(prog); if (type && prog->type != type) { bpf_prog_put(prog); return -EINVAL; } tuple->link = NULL; tuple->prog = prog; return 0; } static int bpf_mprog_tuple_relative(struct bpf_tuple *tuple, u32 id_or_fd, u32 flags, enum bpf_prog_type type) { bool link = flags & BPF_F_LINK; bool id = flags & BPF_F_ID; memset(tuple, 0, sizeof(*tuple)); if (link) return bpf_mprog_link(tuple, id_or_fd, flags, type); /* If no relevant flag is set and no id_or_fd was passed, then * tuple link/prog is just NULLed. This is the case when before/ * after selects first/last position without passing fd.
*/ if (!id && !id_or_fd) return 0; return bpf_mprog_prog(tuple, id_or_fd, flags, type); } static void bpf_mprog_tuple_put(struct bpf_tuple *tuple) { if (tuple->link) bpf_link_put(tuple->link); else if (tuple->prog) bpf_prog_put(tuple->prog); } /* The bpf_mprog_{replace,delete}() operate on exact idx position with the * one exception that for deletion we support delete from front/back. In * case of front idx is -1, in case of back idx is bpf_mprog_total(entry). * Adjustment to first and last entry is trivial. The bpf_mprog_insert() * we have to deal with the following cases: * * idx + before: * * Insert P4 before P3: idx for old array is 1, idx for new array is 2, * hence we adjust target idx for the new array, so that memmove copies * P1 and P2 to the new entry, and we insert P4 into idx 2. Inserting * before P1 would have old idx -1 and new idx 0. * * +--+--+--+ +--+--+--+--+ +--+--+--+--+ * |P1|P2|P3| ==> |P1|P2| |P3| ==> |P1|P2|P4|P3| * +--+--+--+ +--+--+--+--+ +--+--+--+--+ * * idx + after: * * Insert P4 after P2: idx for old array is 2, idx for new array is 2. * Again, memmove copies P1 and P2 to the new entry, and we insert P4 * into idx 2. Inserting after P3 would have both old/new idx at 4 aka * bpf_mprog_total(entry). * * +--+--+--+ +--+--+--+--+ +--+--+--+--+ * |P1|P2|P3| ==> |P1|P2| |P3| ==> |P1|P2|P4|P3| * +--+--+--+ +--+--+--+--+ +--+--+--+--+ */ static int bpf_mprog_replace(struct bpf_mprog_entry *entry, struct bpf_mprog_entry **entry_new, struct bpf_tuple *ntuple, int idx) { struct bpf_mprog_fp *fp; struct bpf_mprog_cp *cp; struct bpf_prog *oprog; bpf_mprog_read(entry, idx, &fp, &cp); oprog = READ_ONCE(fp->prog); bpf_mprog_write(fp, cp, ntuple); if (!ntuple->link) { WARN_ON_ONCE(cp->link); bpf_prog_put(oprog); } *entry_new = entry; return 0; } static int bpf_mprog_insert(struct bpf_mprog_entry *entry, struct bpf_mprog_entry **entry_new, struct bpf_tuple *ntuple, int idx, u32 flags) { int total = bpf_mprog_total(entry); struct bpf_mprog_entry *peer; struct bpf_mprog_fp *fp; struct bpf_mprog_cp *cp; peer = bpf_mprog_peer(entry); bpf_mprog_entry_copy(peer, entry); if (idx == total) goto insert; else if (flags & BPF_F_BEFORE) idx += 1; bpf_mprog_entry_grow(peer, idx); insert: bpf_mprog_read(peer, idx, &fp, &cp); bpf_mprog_write(fp, cp, ntuple); bpf_mprog_inc(peer); *entry_new = peer; return 0; } static int bpf_mprog_delete(struct bpf_mprog_entry *entry, struct bpf_mprog_entry **entry_new, struct bpf_tuple *dtuple, int idx) { int total = bpf_mprog_total(entry); struct bpf_mprog_entry *peer; peer = bpf_mprog_peer(entry); bpf_mprog_entry_copy(peer, entry); if (idx == -1) idx = 0; else if (idx == total) idx = total - 1; bpf_mprog_entry_shrink(peer, idx); bpf_mprog_dec(peer); bpf_mprog_mark_for_release(peer, dtuple); *entry_new = peer; return 0; } /* In bpf_mprog_pos_*() we evaluate the target position for the BPF * program/link that needs to be replaced, inserted or deleted for * each "rule" independently. If all rules agree on that position * or existing element, then enact replacement, addition or deletion. * If this is not the case, then the request cannot be satisfied and * we bail out with an error. */ static int bpf_mprog_pos_exact(struct bpf_mprog_entry *entry, struct bpf_tuple *tuple) { struct bpf_mprog_fp *fp; struct bpf_mprog_cp *cp; int i; for (i = 0; i < bpf_mprog_total(entry); i++) { bpf_mprog_read(entry, i, &fp, &cp); if (tuple->prog == READ_ONCE(fp->prog)) return tuple->link == cp->link ? 
i : -EBUSY; } return -ENOENT; } static int bpf_mprog_pos_before(struct bpf_mprog_entry *entry, struct bpf_tuple *tuple) { struct bpf_mprog_fp *fp; struct bpf_mprog_cp *cp; int i; for (i = 0; i < bpf_mprog_total(entry); i++) { bpf_mprog_read(entry, i, &fp, &cp); if (tuple->prog == READ_ONCE(fp->prog) && (!tuple->link || tuple->link == cp->link)) return i - 1; } return tuple->prog ? -ENOENT : -1; } static int bpf_mprog_pos_after(struct bpf_mprog_entry *entry, struct bpf_tuple *tuple) { struct bpf_mprog_fp *fp; struct bpf_mprog_cp *cp; int i; for (i = 0; i < bpf_mprog_total(entry); i++) { bpf_mprog_read(entry, i, &fp, &cp); if (tuple->prog == READ_ONCE(fp->prog) && (!tuple->link || tuple->link == cp->link)) return i + 1; } return tuple->prog ? -ENOENT : bpf_mprog_total(entry); } int bpf_mprog_attach(struct bpf_mprog_entry *entry, struct bpf_mprog_entry **entry_new, struct bpf_prog *prog_new, struct bpf_link *link, struct bpf_prog *prog_old, u32 flags, u32 id_or_fd, u64 revision) { struct bpf_tuple rtuple, ntuple = { .prog = prog_new, .link = link, }, otuple = { .prog = prog_old, .link = link, }; int ret, idx = -ERANGE, tidx; if (revision && revision != bpf_mprog_revision(entry)) return -ESTALE; if (bpf_mprog_exists(entry, prog_new)) return -EEXIST; ret = bpf_mprog_tuple_relative(&rtuple, id_or_fd, flags & ~BPF_F_REPLACE, prog_new->type); if (ret) return ret; if (flags & BPF_F_REPLACE) { tidx = bpf_mprog_pos_exact(entry, &otuple); if (tidx < 0) { ret = tidx; goto out; } idx = tidx; } else if (bpf_mprog_total(entry) == bpf_mprog_max()) { ret = -ERANGE; goto out; } if (flags & BPF_F_BEFORE) { tidx = bpf_mprog_pos_before(entry, &rtuple); if (tidx < -1 || (idx >= -1 && tidx != idx)) { ret = tidx < -1 ? tidx : -ERANGE; goto out; } idx = tidx; } if (flags & BPF_F_AFTER) { tidx = bpf_mprog_pos_after(entry, &rtuple); if (tidx < -1 || (idx >= -1 && tidx != idx)) { ret = tidx < 0 ? tidx : -ERANGE; goto out; } idx = tidx; } if (idx < -1) { if (rtuple.prog || flags) { ret = -EINVAL; goto out; } idx = bpf_mprog_total(entry); flags = BPF_F_AFTER; } if (idx >= bpf_mprog_max()) { ret = -ERANGE; goto out; } if (flags & BPF_F_REPLACE) ret = bpf_mprog_replace(entry, entry_new, &ntuple, idx); else ret = bpf_mprog_insert(entry, entry_new, &ntuple, idx, flags); out: bpf_mprog_tuple_put(&rtuple); return ret; } static int bpf_mprog_fetch(struct bpf_mprog_entry *entry, struct bpf_tuple *tuple, int idx) { int total = bpf_mprog_total(entry); struct bpf_mprog_cp *cp; struct bpf_mprog_fp *fp; struct bpf_prog *prog; struct bpf_link *link; if (idx == -1) idx = 0; else if (idx == total) idx = total - 1; bpf_mprog_read(entry, idx, &fp, &cp); prog = READ_ONCE(fp->prog); link = cp->link; /* The deletion request can either be without filled tuple in which * case it gets populated here based on idx, or with filled tuple * where the only thing we end up doing is the WARN_ON_ONCE() assert. * If we hit a BPF link at the given index, it must not be removed * from opts path. 
*/ if (link && !tuple->link) return -EBUSY; WARN_ON_ONCE(tuple->prog && tuple->prog != prog); WARN_ON_ONCE(tuple->link && tuple->link != link); tuple->prog = prog; tuple->link = link; return 0; } int bpf_mprog_detach(struct bpf_mprog_entry *entry, struct bpf_mprog_entry **entry_new, struct bpf_prog *prog, struct bpf_link *link, u32 flags, u32 id_or_fd, u64 revision) { struct bpf_tuple rtuple, dtuple = { .prog = prog, .link = link, }; int ret, idx = -ERANGE, tidx; if (flags & BPF_F_REPLACE) return -EINVAL; if (revision && revision != bpf_mprog_revision(entry)) return -ESTALE; if (!bpf_mprog_total(entry)) return -ENOENT; ret = bpf_mprog_tuple_relative(&rtuple, id_or_fd, flags, prog ? prog->type : BPF_PROG_TYPE_UNSPEC); if (ret) return ret; if (dtuple.prog) { tidx = bpf_mprog_pos_exact(entry, &dtuple); if (tidx < 0) { ret = tidx; goto out; } idx = tidx; } if (flags & BPF_F_BEFORE) { tidx = bpf_mprog_pos_before(entry, &rtuple); if (tidx < -1 || (idx >= -1 && tidx != idx)) { ret = tidx < -1 ? tidx : -ERANGE; goto out; } idx = tidx; } if (flags & BPF_F_AFTER) { tidx = bpf_mprog_pos_after(entry, &rtuple); if (tidx < -1 || (idx >= -1 && tidx != idx)) { ret = tidx < 0 ? tidx : -ERANGE; goto out; } idx = tidx; } if (idx < -1) { if (rtuple.prog || flags) { ret = -EINVAL; goto out; } idx = bpf_mprog_total(entry); flags = BPF_F_AFTER; } if (idx >= bpf_mprog_max()) { ret = -ERANGE; goto out; } ret = bpf_mprog_fetch(entry, &dtuple, idx); if (ret) goto out; ret = bpf_mprog_delete(entry, entry_new, &dtuple, idx); out: bpf_mprog_tuple_put(&rtuple); return ret; } int bpf_mprog_query(const union bpf_attr *attr, union bpf_attr __user *uattr, struct bpf_mprog_entry *entry) { u32 __user *uprog_flags, *ulink_flags; u32 __user *uprog_id, *ulink_id; struct bpf_mprog_fp *fp; struct bpf_mprog_cp *cp; struct bpf_prog *prog; const u32 flags = 0; u32 id, count = 0; u64 revision = 1; int i, ret = 0; if (attr->query.query_flags || attr->query.attach_flags) return -EINVAL; if (entry) { revision = bpf_mprog_revision(entry); count = bpf_mprog_total(entry); } if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags))) return -EFAULT; if (copy_to_user(&uattr->query.revision, &revision, sizeof(revision))) return -EFAULT; if (copy_to_user(&uattr->query.count, &count, sizeof(count))) return -EFAULT; uprog_id = u64_to_user_ptr(attr->query.prog_ids); uprog_flags = u64_to_user_ptr(attr->query.prog_attach_flags); ulink_id = u64_to_user_ptr(attr->query.link_ids); ulink_flags = u64_to_user_ptr(attr->query.link_attach_flags); if (attr->query.count == 0 || !uprog_id || !count) return 0; if (attr->query.count < count) { count = attr->query.count; ret = -ENOSPC; } for (i = 0; i < bpf_mprog_max(); i++) { bpf_mprog_read(entry, i, &fp, &cp); prog = READ_ONCE(fp->prog); if (!prog) break; id = prog->aux->id; if (copy_to_user(uprog_id + i, &id, sizeof(id))) return -EFAULT; if (uprog_flags && copy_to_user(uprog_flags + i, &flags, sizeof(flags))) return -EFAULT; id = cp->link ? cp->link->id : 0; if (ulink_id && copy_to_user(ulink_id + i, &id, sizeof(id))) return -EFAULT; if (ulink_flags && copy_to_user(ulink_flags + i, &flags, sizeof(flags))) return -EFAULT; if (i + 1 == count) break; } return ret; }
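The insertion diagrams in the comment above (P1 P2 P3 becoming P1 P2 P4 P3) can be illustrated with a small self-contained user-space sketch of the same index arithmetic: for a BPF_F_BEFORE-style insertion the target index from the old array is bumped by one, the tail is shifted with memmove(), and the new element is written into the gap. The int array and the toy_insert() name are stand-ins for the real bpf_mprog_fp/bpf_mprog_cp pairs and are purely illustrative.

#include <stdio.h>
#include <string.h>

#define MAX_PROGS 8

static int progs[MAX_PROGS];	/* toy stand-in for the mprog array */
static int total;		/* like bpf_mprog_total() */

/*
 * Insert @prog relative to old-array position @idx; @before mimics
 * BPF_F_BEFORE (land in front of the element at @idx), otherwise the
 * element goes right after it, as bpf_mprog_insert() does.
 */
static void toy_insert(int prog, int idx, int before)
{
	if (total == MAX_PROGS)
		return;			/* array full, nothing to do */
	if (idx != total && before)
		idx += 1;		/* old-array idx -> new-array idx */
	if (idx != total)		/* shift the tail one slot to the right */
		memmove(&progs[idx + 1], &progs[idx],
			(total - idx) * sizeof(progs[0]));
	progs[idx] = prog;
	total++;
}

int main(void)
{
	toy_insert(1, 0, 0);	/* P1 */
	toy_insert(2, 1, 0);	/* P2 appended */
	toy_insert(3, 2, 0);	/* P3 appended */
	toy_insert(4, 1, 1);	/* P4 before P3: old idx 1, new idx 2 */
	for (int i = 0; i < total; i++)
		printf("P%d ", progs[i]);	/* prints: P1 P2 P4 P3 */
	printf("\n");
	return 0;
}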
/* SPDX-License-Identifier: GPL-2.0-only */ /* * File: af_phonet.h * * Phonet sockets kernel definitions * * Copyright (C) 2008 Nokia Corporation. */ #ifndef AF_PHONET_H #define AF_PHONET_H #include <linux/phonet.h> #include <linux/skbuff.h> #include <net/sock.h> /* * The lower layers may not require more space, ever. Make sure it's * enough. */ #define MAX_PHONET_HEADER (8 + MAX_HEADER) /* * Every Phonet* socket has this structure first in its * protocol-specific structure under name c. */ struct pn_sock { struct sock sk; u16 sobject; u16 dobject; u8 resource; }; static inline struct pn_sock *pn_sk(struct sock *sk) { return (struct pn_sock *)sk; } extern const struct proto_ops phonet_dgram_ops; void pn_sock_init(void); struct sock *pn_find_sock_by_sa(struct net *net, const struct sockaddr_pn *sa); void pn_deliver_sock_broadcast(struct net *net, struct sk_buff *skb); void phonet_get_local_port_range(int *min, int *max); int pn_sock_hash(struct sock *sk); void pn_sock_unhash(struct sock *sk); int pn_sock_get_port(struct sock *sk, unsigned short sport); struct sock *pn_find_sock_by_res(struct net *net, u8 res); int pn_sock_bind_res(struct sock *sock, u8 res); int pn_sock_unbind_res(struct sock *sk, u8 res); void pn_sock_unbind_all_res(struct sock *sk); int pn_skb_send(struct sock *sk, struct sk_buff *skb, const struct sockaddr_pn *target); static inline struct phonethdr *pn_hdr(struct sk_buff *skb) { return (struct phonethdr *)skb_network_header(skb); } static inline struct phonetmsg *pn_msg(struct sk_buff *skb) { return (struct phonetmsg *)skb_transport_header(skb); } /* * Get the other party's sockaddr from received skb. The skb begins * with a Phonet header. */ static inline void pn_skb_get_src_sockaddr(struct sk_buff *skb, struct sockaddr_pn *sa) { struct phonethdr *ph = pn_hdr(skb); u16 obj = pn_object(ph->pn_sdev, ph->pn_sobj); sa->spn_family = AF_PHONET; pn_sockaddr_set_object(sa, obj); pn_sockaddr_set_resource(sa, ph->pn_res); memset(sa->spn_zero, 0, sizeof(sa->spn_zero)); } static inline void pn_skb_get_dst_sockaddr(struct sk_buff *skb, struct sockaddr_pn *sa) { struct phonethdr *ph = pn_hdr(skb); u16 obj = pn_object(ph->pn_rdev, ph->pn_robj); sa->spn_family = AF_PHONET; pn_sockaddr_set_object(sa, obj); pn_sockaddr_set_resource(sa, ph->pn_res); memset(sa->spn_zero, 0, sizeof(sa->spn_zero)); } /* Protocols in Phonet protocol family.
*/ struct phonet_protocol { const struct proto_ops *ops; struct proto *prot; int sock_type; }; int phonet_proto_register(unsigned int protocol, const struct phonet_protocol *pp); void phonet_proto_unregister(unsigned int protocol, const struct phonet_protocol *pp); int phonet_sysctl_init(void); void phonet_sysctl_exit(void); int isi_register(void); void isi_unregister(void); static inline bool sk_is_phonet(struct sock *sk) { return sk->sk_family == PF_PHONET; } static inline int phonet_sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) { int karg; switch (cmd) { case SIOCPNADDRESOURCE: case SIOCPNDELRESOURCE: if (get_user(karg, (int __user *)arg)) return -EFAULT; return sk->sk_prot->ioctl(sk, cmd, &karg); } /* A positive return value means that the ioctl was not processed */ return 1; } #endif
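As a rough illustration of how the registration hooks above are meant to be used, the sketch below fills a struct phonet_protocol and hands it to phonet_proto_register() from an init path. phonet_dgram_ops is the proto_ops declared earlier in this header; the pn_dgram_proto object, the init function name, and the use of PN_PROTO_PHONET as the protocol number are assumptions made only for this example.

#include <linux/module.h>
#include <net/phonet/phonet.h>	/* assumed in-tree path of this header */

/* Hypothetical struct proto backing the datagram sockets (sketch only). */
static struct proto pn_dgram_proto = {
	.name	  = "PHONET",
	.owner	  = THIS_MODULE,
	.obj_size = sizeof(struct pn_sock),
};

static const struct phonet_protocol pn_dgram_protocol = {
	.ops	   = &phonet_dgram_ops,	/* declared in this header */
	.prot	   = &pn_dgram_proto,
	.sock_type = SOCK_DGRAM,
};

static int __init pn_dgram_example_init(void)
{
	/* PN_PROTO_PHONET is assumed to be the datagram protocol number. */
	return phonet_proto_register(PN_PROTO_PHONET, &pn_dgram_protocol);
}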
// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com */ #include <linux/bpf.h> #include <linux/bpf-cgroup.h> #include <linux/bpf_trace.h> #include <linux/bpf_lirc.h> #include <linux/bpf_verifier.h> #include <linux/bsearch.h> #include <linux/btf.h>
#include <linux/syscalls.h>
#include <linux/slab.h>
#include <linux/sched/signal.h>
#include <linux/vmalloc.h>
#include <linux/mmzone.h>
#include <linux/anon_inodes.h>
#include <linux/fdtable.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/license.h>
#include <linux/filter.h>
#include <linux/kernel.h>
#include <linux/idr.h>
#include <linux/cred.h>
#include <linux/timekeeping.h>
#include <linux/ctype.h>
#include <linux/nospec.h>
#include <linux/audit.h>
#include <uapi/linux/btf.h>
#include <linux/pgtable.h>
#include <linux/bpf_lsm.h>
#include <linux/poll.h>
#include <linux/sort.h>
#include <linux/bpf-netns.h>
#include <linux/rcupdate_trace.h>
#include <linux/memcontrol.h>
#include <linux/trace_events.h>
#include <linux/tracepoint.h>
#include <net/netfilter/nf_bpf_link.h>
#include <net/netkit.h>
#include <net/tcx.h>

#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
			  (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
			  (map)->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)
#define IS_FD_PROG_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY)
#define IS_FD_HASH(map) ((map)->map_type == BPF_MAP_TYPE_HASH_OF_MAPS)
#define IS_FD_MAP(map) (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map) || \
			IS_FD_HASH(map))

#define BPF_OBJ_FLAG_MASK   (BPF_F_RDONLY | BPF_F_WRONLY)

DEFINE_PER_CPU(int, bpf_prog_active);
static DEFINE_IDR(prog_idr);
static DEFINE_SPINLOCK(prog_idr_lock);
static DEFINE_IDR(map_idr);
static DEFINE_SPINLOCK(map_idr_lock);
static DEFINE_IDR(link_idr);
static DEFINE_SPINLOCK(link_idr_lock);

int sysctl_unprivileged_bpf_disabled __read_mostly =
	IS_BUILTIN(CONFIG_BPF_UNPRIV_DEFAULT_OFF) ? 2 : 0;

static const struct bpf_map_ops * const bpf_map_types[] = {
#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type)
#define BPF_MAP_TYPE(_id, _ops) \
	[_id] = &_ops,
#define BPF_LINK_TYPE(_id, _name)
#include <linux/bpf_types.h>
#undef BPF_PROG_TYPE
#undef BPF_MAP_TYPE
#undef BPF_LINK_TYPE
};

/*
 * If we're handed a bigger struct than we know of, ensure all the unknown bits
 * are 0 - i.e. new user-space does not rely on any kernel feature extensions
 * we don't know about yet.
 *
 * There is a ToCToU between this function call and the following
 * copy_from_user() call. However, this is not a concern since this function is
 * meant to be a future-proofing of bits.
 */
int bpf_check_uarg_tail_zero(bpfptr_t uaddr,
			     size_t expected_size,
			     size_t actual_size)
{
	int res;

	if (unlikely(actual_size > PAGE_SIZE))	/* silly large */
		return -E2BIG;

	if (actual_size <= expected_size)
		return 0;

	if (uaddr.is_kernel)
		res = memchr_inv(uaddr.kernel + expected_size, 0,
				 actual_size - expected_size) == NULL;
	else
		res = check_zeroed_user(uaddr.user + expected_size,
					actual_size - expected_size);
	if (res < 0)
		return res;
	return res ?
0 : -E2BIG; } const struct bpf_map_ops bpf_map_offload_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = bpf_map_offload_map_alloc, .map_free = bpf_map_offload_map_free, .map_check_btf = map_check_no_btf, .map_mem_usage = bpf_map_offload_map_mem_usage, }; static void bpf_map_write_active_inc(struct bpf_map *map) { atomic64_inc(&map->writecnt); } static void bpf_map_write_active_dec(struct bpf_map *map) { atomic64_dec(&map->writecnt); } bool bpf_map_write_active(const struct bpf_map *map) { return atomic64_read(&map->writecnt) != 0; } static u32 bpf_map_value_size(const struct bpf_map *map) { if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY || map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) return round_up(map->value_size, 8) * num_possible_cpus(); else if (IS_FD_MAP(map)) return sizeof(u32); else return map->value_size; } static void maybe_wait_bpf_programs(struct bpf_map *map) { /* Wait for any running non-sleepable BPF programs to complete so that * userspace, when we return to it, knows that all non-sleepable * programs that could be running use the new map value. For sleepable * BPF programs, synchronize_rcu_tasks_trace() should be used to wait * for the completions of these programs, but considering the waiting * time can be very long and userspace may think it will hang forever, * so don't handle sleepable BPF programs now. */ if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS || map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) synchronize_rcu(); } static void unpin_uptr_kaddr(void *kaddr) { if (kaddr) unpin_user_page(virt_to_page(kaddr)); } static void __bpf_obj_unpin_uptrs(struct btf_record *rec, u32 cnt, void *obj) { const struct btf_field *field; void **uptr_addr; int i; for (i = 0, field = rec->fields; i < cnt; i++, field++) { if (field->type != BPF_UPTR) continue; uptr_addr = obj + field->offset; unpin_uptr_kaddr(*uptr_addr); } } static void bpf_obj_unpin_uptrs(struct btf_record *rec, void *obj) { if (!btf_record_has_field(rec, BPF_UPTR)) return; __bpf_obj_unpin_uptrs(rec, rec->cnt, obj); } static int bpf_obj_pin_uptrs(struct btf_record *rec, void *obj) { const struct btf_field *field; const struct btf_type *t; unsigned long start, end; struct page *page; void **uptr_addr; int i, err; if (!btf_record_has_field(rec, BPF_UPTR)) return 0; for (i = 0, field = rec->fields; i < rec->cnt; i++, field++) { if (field->type != BPF_UPTR) continue; uptr_addr = obj + field->offset; start = *(unsigned long *)uptr_addr; if (!start) continue; t = btf_type_by_id(field->kptr.btf, field->kptr.btf_id); /* t->size was checked for zero before */ if (check_add_overflow(start, t->size - 1, &end)) { err = -EFAULT; goto unpin_all; } /* The uptr's struct cannot span across two pages */ if ((start & PAGE_MASK) != (end & PAGE_MASK)) { err = -EOPNOTSUPP; goto unpin_all; } err = pin_user_pages_fast(start, 1, FOLL_LONGTERM | FOLL_WRITE, &page); if (err != 1) goto unpin_all; if (PageHighMem(page)) { err = -EOPNOTSUPP; unpin_user_page(page); goto unpin_all; } *uptr_addr = page_address(page) + offset_in_page(start); } return 0; unpin_all: __bpf_obj_unpin_uptrs(rec, i, obj); return err; } static int bpf_map_update_value(struct bpf_map *map, struct file *map_file, void *key, void *value, __u64 flags) { int err; /* Need to create a kthread, thus must support schedule */ if (bpf_map_is_offloaded(map)) { return bpf_map_offload_update_elem(map, key, value, flags); } else if (map->map_type == BPF_MAP_TYPE_CPUMAP || 
map->map_type == BPF_MAP_TYPE_ARENA || map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { return map->ops->map_update_elem(map, key, value, flags); } else if (map->map_type == BPF_MAP_TYPE_SOCKHASH || map->map_type == BPF_MAP_TYPE_SOCKMAP) { return sock_map_update_elem_sys(map, key, value, flags); } else if (IS_FD_PROG_ARRAY(map)) { return bpf_fd_array_map_update_elem(map, map_file, key, value, flags); } bpf_disable_instrumentation(); if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { err = bpf_percpu_hash_update(map, key, value, flags); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { err = bpf_percpu_array_update(map, key, value, flags); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) { err = bpf_percpu_cgroup_storage_update(map, key, value, flags); } else if (IS_FD_ARRAY(map)) { err = bpf_fd_array_map_update_elem(map, map_file, key, value, flags); } else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { err = bpf_fd_htab_map_update_elem(map, map_file, key, value, flags); } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) { /* rcu_read_lock() is not needed */ err = bpf_fd_reuseport_array_update_elem(map, key, value, flags); } else if (map->map_type == BPF_MAP_TYPE_QUEUE || map->map_type == BPF_MAP_TYPE_STACK || map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) { err = map->ops->map_push_elem(map, value, flags); } else { err = bpf_obj_pin_uptrs(map->record, value); if (!err) { rcu_read_lock(); err = map->ops->map_update_elem(map, key, value, flags); rcu_read_unlock(); if (err) bpf_obj_unpin_uptrs(map->record, value); } } bpf_enable_instrumentation(); return err; } static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value, __u64 flags) { void *ptr; int err; if (bpf_map_is_offloaded(map)) return bpf_map_offload_lookup_elem(map, key, value); bpf_disable_instrumentation(); if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { err = bpf_percpu_hash_copy(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { err = bpf_percpu_array_copy(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) { err = bpf_percpu_cgroup_storage_copy(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) { err = bpf_stackmap_copy(map, key, value); } else if (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map)) { err = bpf_fd_array_map_lookup_elem(map, key, value); } else if (IS_FD_HASH(map)) { err = bpf_fd_htab_map_lookup_elem(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) { err = bpf_fd_reuseport_array_lookup_elem(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_QUEUE || map->map_type == BPF_MAP_TYPE_STACK || map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) { err = map->ops->map_peek_elem(map, value); } else if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { /* struct_ops map requires directly updating "value" */ err = bpf_struct_ops_map_sys_lookup_elem(map, key, value); } else { rcu_read_lock(); if (map->ops->map_lookup_elem_sys_only) ptr = map->ops->map_lookup_elem_sys_only(map, key); else ptr = map->ops->map_lookup_elem(map, key); if (IS_ERR(ptr)) { err = PTR_ERR(ptr); } else if (!ptr) { err = -ENOENT; } else { err = 0; if (flags & BPF_F_LOCK) /* lock 'ptr' and copy everything but lock */ copy_map_value_locked(map, value, ptr, true); else copy_map_value(map, value, ptr); /* mask lock and timer, since value wasn't zero inited */ check_and_init_map_value(map, value); } rcu_read_unlock(); } 
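	/*
	 * Pairs with the bpf_disable_instrumentation() call near the top of
	 * this function: while bpf_prog_active is elevated, tracing BPF
	 * programs (e.g. kprobe programs) are skipped on this CPU, so they
	 * cannot recurse into the map operation being performed on behalf of
	 * the syscall.
	 */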
bpf_enable_instrumentation(); return err; } /* Please, do not use this function outside from the map creation path * (e.g. in map update path) without taking care of setting the active * memory cgroup (see at bpf_map_kmalloc_node() for example). */ static void *__bpf_map_area_alloc(u64 size, int numa_node, bool mmapable) { /* We really just want to fail instead of triggering OOM killer * under memory pressure, therefore we set __GFP_NORETRY to kmalloc, * which is used for lower order allocation requests. * * It has been observed that higher order allocation requests done by * vmalloc with __GFP_NORETRY being set might fail due to not trying * to reclaim memory from the page cache, thus we set * __GFP_RETRY_MAYFAIL to avoid such situations. */ gfp_t gfp = bpf_memcg_flags(__GFP_NOWARN | __GFP_ZERO); unsigned int flags = 0; unsigned long align = 1; void *area; if (size >= SIZE_MAX) return NULL; /* kmalloc()'ed memory can't be mmap()'ed */ if (mmapable) { BUG_ON(!PAGE_ALIGNED(size)); align = SHMLBA; flags = VM_USERMAP; } else if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { area = kmalloc_node(size, gfp | GFP_USER | __GFP_NORETRY, numa_node); if (area != NULL) return area; } return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, gfp | GFP_KERNEL | __GFP_RETRY_MAYFAIL, PAGE_KERNEL, flags, numa_node, __builtin_return_address(0)); } void *bpf_map_area_alloc(u64 size, int numa_node) { return __bpf_map_area_alloc(size, numa_node, false); } void *bpf_map_area_mmapable_alloc(u64 size, int numa_node) { return __bpf_map_area_alloc(size, numa_node, true); } void bpf_map_area_free(void *area) { kvfree(area); } static u32 bpf_map_flags_retain_permanent(u32 flags) { /* Some map creation flags are not tied to the map object but * rather to the map fd instead, so they have no meaning upon * map object inspection since multiple file descriptors with * different (access) properties can exist here. Thus, given * this has zero meaning for the map itself, lets clear these * from here. */ return flags & ~(BPF_F_RDONLY | BPF_F_WRONLY); } void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr) { map->map_type = attr->map_type; map->key_size = attr->key_size; map->value_size = attr->value_size; map->max_entries = attr->max_entries; map->map_flags = bpf_map_flags_retain_permanent(attr->map_flags); map->numa_node = bpf_map_attr_numa_node(attr); map->map_extra = attr->map_extra; } static int bpf_map_alloc_id(struct bpf_map *map) { int id; idr_preload(GFP_KERNEL); spin_lock_bh(&map_idr_lock); id = idr_alloc_cyclic(&map_idr, map, 1, INT_MAX, GFP_ATOMIC); if (id > 0) map->id = id; spin_unlock_bh(&map_idr_lock); idr_preload_end(); if (WARN_ON_ONCE(!id)) return -ENOSPC; return id > 0 ? 0 : id; } void bpf_map_free_id(struct bpf_map *map) { unsigned long flags; /* Offloaded maps are removed from the IDR store when their device * disappears - even if someone holds an fd to them they are unusable, * the memory is gone, all ops will fail; they are simply waiting for * refcnt to drop to be freed. */ if (!map->id) return; spin_lock_irqsave(&map_idr_lock, flags); idr_remove(&map_idr, map->id); map->id = 0; spin_unlock_irqrestore(&map_idr_lock, flags); } #ifdef CONFIG_MEMCG static void bpf_map_save_memcg(struct bpf_map *map) { /* Currently if a map is created by a process belonging to the root * memory cgroup, get_obj_cgroup_from_current() will return NULL. * So we have to check map->objcg for being NULL each time it's * being used. 
*/ if (memcg_bpf_enabled()) map->objcg = get_obj_cgroup_from_current(); } static void bpf_map_release_memcg(struct bpf_map *map) { if (map->objcg) obj_cgroup_put(map->objcg); } static struct mem_cgroup *bpf_map_get_memcg(const struct bpf_map *map) { if (map->objcg) return get_mem_cgroup_from_objcg(map->objcg); return root_mem_cgroup; } void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags, int node) { struct mem_cgroup *memcg, *old_memcg; void *ptr; memcg = bpf_map_get_memcg(map); old_memcg = set_active_memcg(memcg); ptr = kmalloc_node(size, flags | __GFP_ACCOUNT, node); set_active_memcg(old_memcg); mem_cgroup_put(memcg); return ptr; } void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags) { struct mem_cgroup *memcg, *old_memcg; void *ptr; memcg = bpf_map_get_memcg(map); old_memcg = set_active_memcg(memcg); ptr = kzalloc(size, flags | __GFP_ACCOUNT); set_active_memcg(old_memcg); mem_cgroup_put(memcg); return ptr; } void *bpf_map_kvcalloc(struct bpf_map *map, size_t n, size_t size, gfp_t flags) { struct mem_cgroup *memcg, *old_memcg; void *ptr; memcg = bpf_map_get_memcg(map); old_memcg = set_active_memcg(memcg); ptr = kvcalloc(n, size, flags | __GFP_ACCOUNT); set_active_memcg(old_memcg); mem_cgroup_put(memcg); return ptr; } void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size, size_t align, gfp_t flags) { struct mem_cgroup *memcg, *old_memcg; void __percpu *ptr; memcg = bpf_map_get_memcg(map); old_memcg = set_active_memcg(memcg); ptr = __alloc_percpu_gfp(size, align, flags | __GFP_ACCOUNT); set_active_memcg(old_memcg); mem_cgroup_put(memcg); return ptr; } #else static void bpf_map_save_memcg(struct bpf_map *map) { } static void bpf_map_release_memcg(struct bpf_map *map) { } #endif int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid, unsigned long nr_pages, struct page **pages) { unsigned long i, j; struct page *pg; int ret = 0; #ifdef CONFIG_MEMCG struct mem_cgroup *memcg, *old_memcg; memcg = bpf_map_get_memcg(map); old_memcg = set_active_memcg(memcg); #endif for (i = 0; i < nr_pages; i++) { pg = alloc_pages_node(nid, gfp | __GFP_ACCOUNT, 0); if (pg) { pages[i] = pg; continue; } for (j = 0; j < i; j++) __free_page(pages[j]); ret = -ENOMEM; break; } #ifdef CONFIG_MEMCG set_active_memcg(old_memcg); mem_cgroup_put(memcg); #endif return ret; } static int btf_field_cmp(const void *a, const void *b) { const struct btf_field *f1 = a, *f2 = b; if (f1->offset < f2->offset) return -1; else if (f1->offset > f2->offset) return 1; return 0; } struct btf_field *btf_record_find(const struct btf_record *rec, u32 offset, u32 field_mask) { struct btf_field *field; if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & field_mask)) return NULL; field = bsearch(&offset, rec->fields, rec->cnt, sizeof(rec->fields[0]), btf_field_cmp); if (!field || !(field->type & field_mask)) return NULL; return field; } void btf_record_free(struct btf_record *rec) { int i; if (IS_ERR_OR_NULL(rec)) return; for (i = 0; i < rec->cnt; i++) { switch (rec->fields[i].type) { case BPF_KPTR_UNREF: case BPF_KPTR_REF: case BPF_KPTR_PERCPU: case BPF_UPTR: if (rec->fields[i].kptr.module) module_put(rec->fields[i].kptr.module); if (btf_is_kernel(rec->fields[i].kptr.btf)) btf_put(rec->fields[i].kptr.btf); break; case BPF_LIST_HEAD: case BPF_LIST_NODE: case BPF_RB_ROOT: case BPF_RB_NODE: case BPF_SPIN_LOCK: case BPF_TIMER: case BPF_REFCOUNT: case BPF_WORKQUEUE: /* Nothing to release */ break; default: WARN_ON_ONCE(1); continue; } } kfree(rec); } void 
bpf_map_free_record(struct bpf_map *map) { btf_record_free(map->record); map->record = NULL; } struct btf_record *btf_record_dup(const struct btf_record *rec) { const struct btf_field *fields; struct btf_record *new_rec; int ret, size, i; if (IS_ERR_OR_NULL(rec)) return NULL; size = offsetof(struct btf_record, fields[rec->cnt]); new_rec = kmemdup(rec, size, GFP_KERNEL | __GFP_NOWARN); if (!new_rec) return ERR_PTR(-ENOMEM); /* Do a deep copy of the btf_record */ fields = rec->fields; new_rec->cnt = 0; for (i = 0; i < rec->cnt; i++) { switch (fields[i].type) { case BPF_KPTR_UNREF: case BPF_KPTR_REF: case BPF_KPTR_PERCPU: case BPF_UPTR: if (btf_is_kernel(fields[i].kptr.btf)) btf_get(fields[i].kptr.btf); if (fields[i].kptr.module && !try_module_get(fields[i].kptr.module)) { ret = -ENXIO; goto free; } break; case BPF_LIST_HEAD: case BPF_LIST_NODE: case BPF_RB_ROOT: case BPF_RB_NODE: case BPF_SPIN_LOCK: case BPF_TIMER: case BPF_REFCOUNT: case BPF_WORKQUEUE: /* Nothing to acquire */ break; default: ret = -EFAULT; WARN_ON_ONCE(1); goto free; } new_rec->cnt++; } return new_rec; free: btf_record_free(new_rec); return ERR_PTR(ret); } bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *rec_b) { bool a_has_fields = !IS_ERR_OR_NULL(rec_a), b_has_fields = !IS_ERR_OR_NULL(rec_b); int size; if (!a_has_fields && !b_has_fields) return true; if (a_has_fields != b_has_fields) return false; if (rec_a->cnt != rec_b->cnt) return false; size = offsetof(struct btf_record, fields[rec_a->cnt]); /* btf_parse_fields uses kzalloc to allocate a btf_record, so unused * members are zeroed out. So memcmp is safe to do without worrying * about padding/unused fields. * * While spin_lock, timer, and kptr have no relation to map BTF, * list_head metadata is specific to map BTF, the btf and value_rec * members in particular. btf is the map BTF, while value_rec points to * btf_record in that map BTF. * * So while by default, we don't rely on the map BTF (which the records * were parsed from) matching for both records, which is not backwards * compatible, in case list_head is part of it, we implicitly rely on * that by way of depending on memcmp succeeding for it. */ return !memcmp(rec_a, rec_b, size); } void bpf_obj_free_timer(const struct btf_record *rec, void *obj) { if (WARN_ON_ONCE(!btf_record_has_field(rec, BPF_TIMER))) return; bpf_timer_cancel_and_free(obj + rec->timer_off); } void bpf_obj_free_workqueue(const struct btf_record *rec, void *obj) { if (WARN_ON_ONCE(!btf_record_has_field(rec, BPF_WORKQUEUE))) return; bpf_wq_cancel_and_free(obj + rec->wq_off); } void bpf_obj_free_fields(const struct btf_record *rec, void *obj) { const struct btf_field *fields; int i; if (IS_ERR_OR_NULL(rec)) return; fields = rec->fields; for (i = 0; i < rec->cnt; i++) { struct btf_struct_meta *pointee_struct_meta; const struct btf_field *field = &fields[i]; void *field_ptr = obj + field->offset; void *xchgd_field; switch (fields[i].type) { case BPF_SPIN_LOCK: break; case BPF_TIMER: bpf_timer_cancel_and_free(field_ptr); break; case BPF_WORKQUEUE: bpf_wq_cancel_and_free(field_ptr); break; case BPF_KPTR_UNREF: WRITE_ONCE(*(u64 *)field_ptr, 0); break; case BPF_KPTR_REF: case BPF_KPTR_PERCPU: xchgd_field = (void *)xchg((unsigned long *)field_ptr, 0); if (!xchgd_field) break; if (!btf_is_kernel(field->kptr.btf)) { pointee_struct_meta = btf_find_struct_meta(field->kptr.btf, field->kptr.btf_id); migrate_disable(); __bpf_obj_drop_impl(xchgd_field, pointee_struct_meta ? 
pointee_struct_meta->record : NULL, fields[i].type == BPF_KPTR_PERCPU); migrate_enable(); } else { field->kptr.dtor(xchgd_field); } break; case BPF_UPTR: /* The caller ensured that no one is using the uptr */ unpin_uptr_kaddr(*(void **)field_ptr); break; case BPF_LIST_HEAD: if (WARN_ON_ONCE(rec->spin_lock_off < 0)) continue; bpf_list_head_free(field, field_ptr, obj + rec->spin_lock_off); break; case BPF_RB_ROOT: if (WARN_ON_ONCE(rec->spin_lock_off < 0)) continue; bpf_rb_root_free(field, field_ptr, obj + rec->spin_lock_off); break; case BPF_LIST_NODE: case BPF_RB_NODE: case BPF_REFCOUNT: break; default: WARN_ON_ONCE(1); continue; } } } static void bpf_map_free(struct bpf_map *map) { struct btf_record *rec = map->record; struct btf *btf = map->btf; /* implementation dependent freeing */ map->ops->map_free(map); /* Delay freeing of btf_record for maps, as map_free * callback usually needs access to them. It is better to do it here * than require each callback to do the free itself manually. * * Note that the btf_record stashed in map->inner_map_meta->record was * already freed using the map_free callback for map in map case which * eventually calls bpf_map_free_meta, since inner_map_meta is only a * template bpf_map struct used during verification. */ btf_record_free(rec); /* Delay freeing of btf for maps, as map_free callback may need * struct_meta info which will be freed with btf_put(). */ btf_put(btf); } /* called from workqueue */ static void bpf_map_free_deferred(struct work_struct *work) { struct bpf_map *map = container_of(work, struct bpf_map, work); security_bpf_map_free(map); bpf_map_release_memcg(map); bpf_map_free(map); } static void bpf_map_put_uref(struct bpf_map *map) { if (atomic64_dec_and_test(&map->usercnt)) { if (map->ops->map_release_uref) map->ops->map_release_uref(map); } } static void bpf_map_free_in_work(struct bpf_map *map) { INIT_WORK(&map->work, bpf_map_free_deferred); /* Avoid spawning kworkers, since they all might contend * for the same mutex like slab_mutex. */ queue_work(system_unbound_wq, &map->work); } static void bpf_map_free_rcu_gp(struct rcu_head *rcu) { bpf_map_free_in_work(container_of(rcu, struct bpf_map, rcu)); } static void bpf_map_free_mult_rcu_gp(struct rcu_head *rcu) { if (rcu_trace_implies_rcu_gp()) bpf_map_free_rcu_gp(rcu); else call_rcu(rcu, bpf_map_free_rcu_gp); } /* decrement map refcnt and schedule it for freeing via workqueue * (underlying map implementation ops->map_free() might sleep) */ void bpf_map_put(struct bpf_map *map) { if (atomic64_dec_and_test(&map->refcnt)) { /* bpf_map_free_id() must be called first */ bpf_map_free_id(map); WARN_ON_ONCE(atomic64_read(&map->sleepable_refcnt)); if (READ_ONCE(map->free_after_mult_rcu_gp)) call_rcu_tasks_trace(&map->rcu, bpf_map_free_mult_rcu_gp); else if (READ_ONCE(map->free_after_rcu_gp)) call_rcu(&map->rcu, bpf_map_free_rcu_gp); else bpf_map_free_in_work(map); } } EXPORT_SYMBOL_GPL(bpf_map_put); void bpf_map_put_with_uref(struct bpf_map *map) { bpf_map_put_uref(map); bpf_map_put(map); } static int bpf_map_release(struct inode *inode, struct file *filp) { struct bpf_map *map = filp->private_data; if (map->ops->map_release) map->ops->map_release(map, filp); bpf_map_put_with_uref(map); return 0; } static fmode_t map_get_sys_perms(struct bpf_map *map, struct fd f) { fmode_t mode = fd_file(f)->f_mode; /* Our file permissions may have been overridden by global * map permissions facing syscall side. 
*/ if (READ_ONCE(map->frozen)) mode &= ~FMODE_CAN_WRITE; return mode; } #ifdef CONFIG_PROC_FS /* Show the memory usage of a bpf map */ static u64 bpf_map_memory_usage(const struct bpf_map *map) { return map->ops->map_mem_usage(map); } static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) { struct bpf_map *map = filp->private_data; u32 type = 0, jited = 0; if (map_type_contains_progs(map)) { spin_lock(&map->owner.lock); type = map->owner.type; jited = map->owner.jited; spin_unlock(&map->owner.lock); } seq_printf(m, "map_type:\t%u\n" "key_size:\t%u\n" "value_size:\t%u\n" "max_entries:\t%u\n" "map_flags:\t%#x\n" "map_extra:\t%#llx\n" "memlock:\t%llu\n" "map_id:\t%u\n" "frozen:\t%u\n", map->map_type, map->key_size, map->value_size, map->max_entries, map->map_flags, (unsigned long long)map->map_extra, bpf_map_memory_usage(map), map->id, READ_ONCE(map->frozen)); if (type) { seq_printf(m, "owner_prog_type:\t%u\n", type); seq_printf(m, "owner_jited:\t%u\n", jited); } } #endif static ssize_t bpf_dummy_read(struct file *filp, char __user *buf, size_t siz, loff_t *ppos) { /* We need this handler such that alloc_file() enables * f_mode with FMODE_CAN_READ. */ return -EINVAL; } static ssize_t bpf_dummy_write(struct file *filp, const char __user *buf, size_t siz, loff_t *ppos) { /* We need this handler such that alloc_file() enables * f_mode with FMODE_CAN_WRITE. */ return -EINVAL; } /* called for any extra memory-mapped regions (except initial) */ static void bpf_map_mmap_open(struct vm_area_struct *vma) { struct bpf_map *map = vma->vm_file->private_data; if (vma->vm_flags & VM_MAYWRITE) bpf_map_write_active_inc(map); } /* called for all unmapped memory region (including initial) */ static void bpf_map_mmap_close(struct vm_area_struct *vma) { struct bpf_map *map = vma->vm_file->private_data; if (vma->vm_flags & VM_MAYWRITE) bpf_map_write_active_dec(map); } static const struct vm_operations_struct bpf_map_default_vmops = { .open = bpf_map_mmap_open, .close = bpf_map_mmap_close, }; static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma) { struct bpf_map *map = filp->private_data; int err; if (!map->ops->map_mmap || !IS_ERR_OR_NULL(map->record)) return -ENOTSUPP; if (!(vma->vm_flags & VM_SHARED)) return -EINVAL; mutex_lock(&map->freeze_mutex); if (vma->vm_flags & VM_WRITE) { if (map->frozen) { err = -EPERM; goto out; } /* map is meant to be read-only, so do not allow mapping as * writable, because it's possible to leak a writable page * reference and allows user-space to still modify it after * freezing, while verifier will assume contents do not change */ if (map->map_flags & BPF_F_RDONLY_PROG) { err = -EACCES; goto out; } } /* set default open/close callbacks */ vma->vm_ops = &bpf_map_default_vmops; vma->vm_private_data = map; vm_flags_clear(vma, VM_MAYEXEC); if (!(vma->vm_flags & VM_WRITE)) /* disallow re-mapping with PROT_WRITE */ vm_flags_clear(vma, VM_MAYWRITE); err = map->ops->map_mmap(map, vma); if (err) goto out; if (vma->vm_flags & VM_MAYWRITE) bpf_map_write_active_inc(map); out: mutex_unlock(&map->freeze_mutex); return err; } static __poll_t bpf_map_poll(struct file *filp, struct poll_table_struct *pts) { struct bpf_map *map = filp->private_data; if (map->ops->map_poll) return map->ops->map_poll(map, filp, pts); return EPOLLERR; } static unsigned long bpf_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { struct bpf_map *map = filp->private_data; if (map->ops->map_get_unmapped_area) return 
map->ops->map_get_unmapped_area(filp, addr, len, pgoff, flags); #ifdef CONFIG_MMU return mm_get_unmapped_area(current->mm, filp, addr, len, pgoff, flags); #else return addr; #endif } const struct file_operations bpf_map_fops = { #ifdef CONFIG_PROC_FS .show_fdinfo = bpf_map_show_fdinfo, #endif .release = bpf_map_release, .read = bpf_dummy_read, .write = bpf_dummy_write, .mmap = bpf_map_mmap, .poll = bpf_map_poll, .get_unmapped_area = bpf_get_unmapped_area, }; int bpf_map_new_fd(struct bpf_map *map, int flags) { int ret; ret = security_bpf_map(map, OPEN_FMODE(flags)); if (ret < 0) return ret; return anon_inode_getfd("bpf-map", &bpf_map_fops, map, flags | O_CLOEXEC); } int bpf_get_file_flag(int flags) { if ((flags & BPF_F_RDONLY) && (flags & BPF_F_WRONLY)) return -EINVAL; if (flags & BPF_F_RDONLY) return O_RDONLY; if (flags & BPF_F_WRONLY) return O_WRONLY; return O_RDWR; } /* helper macro to check that unused fields 'union bpf_attr' are zero */ #define CHECK_ATTR(CMD) \ memchr_inv((void *) &attr->CMD##_LAST_FIELD + \ sizeof(attr->CMD##_LAST_FIELD), 0, \ sizeof(*attr) - \ offsetof(union bpf_attr, CMD##_LAST_FIELD) - \ sizeof(attr->CMD##_LAST_FIELD)) != NULL /* dst and src must have at least "size" number of bytes. * Return strlen on success and < 0 on error. */ int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size) { const char *end = src + size; const char *orig_src = src; memset(dst, 0, size); /* Copy all isalnum(), '_' and '.' chars. */ while (src < end && *src) { if (!isalnum(*src) && *src != '_' && *src != '.') return -EINVAL; *dst++ = *src++; } /* No '\0' found in "size" number of bytes */ if (src == end) return -EINVAL; return src - orig_src; } int map_check_no_btf(const struct bpf_map *map, const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type) { return -ENOTSUPP; } static int map_check_btf(struct bpf_map *map, struct bpf_token *token, const struct btf *btf, u32 btf_key_id, u32 btf_value_id) { const struct btf_type *key_type, *value_type; u32 key_size, value_size; int ret = 0; /* Some maps allow key to be unspecified. 
*/ if (btf_key_id) { key_type = btf_type_id_size(btf, &btf_key_id, &key_size); if (!key_type || key_size != map->key_size) return -EINVAL; } else { key_type = btf_type_by_id(btf, 0); if (!map->ops->map_check_btf) return -EINVAL; } value_type = btf_type_id_size(btf, &btf_value_id, &value_size); if (!value_type || value_size != map->value_size) return -EINVAL; map->record = btf_parse_fields(btf, value_type, BPF_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD | BPF_RB_ROOT | BPF_REFCOUNT | BPF_WORKQUEUE | BPF_UPTR, map->value_size); if (!IS_ERR_OR_NULL(map->record)) { int i; if (!bpf_token_capable(token, CAP_BPF)) { ret = -EPERM; goto free_map_tab; } if (map->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG)) { ret = -EACCES; goto free_map_tab; } for (i = 0; i < sizeof(map->record->field_mask) * 8; i++) { switch (map->record->field_mask & (1 << i)) { case 0: continue; case BPF_SPIN_LOCK: if (map->map_type != BPF_MAP_TYPE_HASH && map->map_type != BPF_MAP_TYPE_ARRAY && map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE && map->map_type != BPF_MAP_TYPE_SK_STORAGE && map->map_type != BPF_MAP_TYPE_INODE_STORAGE && map->map_type != BPF_MAP_TYPE_TASK_STORAGE && map->map_type != BPF_MAP_TYPE_CGRP_STORAGE) { ret = -EOPNOTSUPP; goto free_map_tab; } break; case BPF_TIMER: case BPF_WORKQUEUE: if (map->map_type != BPF_MAP_TYPE_HASH && map->map_type != BPF_MAP_TYPE_LRU_HASH && map->map_type != BPF_MAP_TYPE_ARRAY) { ret = -EOPNOTSUPP; goto free_map_tab; } break; case BPF_KPTR_UNREF: case BPF_KPTR_REF: case BPF_KPTR_PERCPU: case BPF_REFCOUNT: if (map->map_type != BPF_MAP_TYPE_HASH && map->map_type != BPF_MAP_TYPE_PERCPU_HASH && map->map_type != BPF_MAP_TYPE_LRU_HASH && map->map_type != BPF_MAP_TYPE_LRU_PERCPU_HASH && map->map_type != BPF_MAP_TYPE_ARRAY && map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY && map->map_type != BPF_MAP_TYPE_SK_STORAGE && map->map_type != BPF_MAP_TYPE_INODE_STORAGE && map->map_type != BPF_MAP_TYPE_TASK_STORAGE && map->map_type != BPF_MAP_TYPE_CGRP_STORAGE) { ret = -EOPNOTSUPP; goto free_map_tab; } break; case BPF_UPTR: if (map->map_type != BPF_MAP_TYPE_TASK_STORAGE) { ret = -EOPNOTSUPP; goto free_map_tab; } break; case BPF_LIST_HEAD: case BPF_RB_ROOT: if (map->map_type != BPF_MAP_TYPE_HASH && map->map_type != BPF_MAP_TYPE_LRU_HASH && map->map_type != BPF_MAP_TYPE_ARRAY) { ret = -EOPNOTSUPP; goto free_map_tab; } break; default: /* Fail if map_type checks are missing for a field type */ ret = -EOPNOTSUPP; goto free_map_tab; } } } ret = btf_check_and_fixup_fields(btf, map->record); if (ret < 0) goto free_map_tab; if (map->ops->map_check_btf) { ret = map->ops->map_check_btf(map, btf, key_type, value_type); if (ret < 0) goto free_map_tab; } return ret; free_map_tab: bpf_map_free_record(map); return ret; } static bool bpf_net_capable(void) { return capable(CAP_NET_ADMIN) || capable(CAP_SYS_ADMIN); } #define BPF_MAP_CREATE_LAST_FIELD map_token_fd /* called via syscall */ static int map_create(union bpf_attr *attr) { const struct bpf_map_ops *ops; struct bpf_token *token = NULL; int numa_node = bpf_map_attr_numa_node(attr); u32 map_type = attr->map_type; struct bpf_map *map; bool token_flag; int f_flags; int err; err = CHECK_ATTR(BPF_MAP_CREATE); if (err) return -EINVAL; /* check BPF_F_TOKEN_FD flag, remember if it's set, and then clear it * to avoid per-map type checks tripping on unknown flag */ token_flag = attr->map_flags & BPF_F_TOKEN_FD; attr->map_flags &= ~BPF_F_TOKEN_FD; if (attr->btf_vmlinux_value_type_id) { if (attr->map_type != BPF_MAP_TYPE_STRUCT_OPS || attr->btf_key_type_id || 
attr->btf_value_type_id) return -EINVAL; } else if (attr->btf_key_type_id && !attr->btf_value_type_id) { return -EINVAL; } if (attr->map_type != BPF_MAP_TYPE_BLOOM_FILTER && attr->map_type != BPF_MAP_TYPE_ARENA && attr->map_extra != 0) return -EINVAL; f_flags = bpf_get_file_flag(attr->map_flags); if (f_flags < 0) return f_flags; if (numa_node != NUMA_NO_NODE && ((unsigned int)numa_node >= nr_node_ids || !node_online(numa_node))) return -EINVAL; /* find map type and init map: hashtable vs rbtree vs bloom vs ... */ map_type = attr->map_type; if (map_type >= ARRAY_SIZE(bpf_map_types)) return -EINVAL; map_type = array_index_nospec(map_type, ARRAY_SIZE(bpf_map_types)); ops = bpf_map_types[map_type]; if (!ops) return -EINVAL; if (ops->map_alloc_check) { err = ops->map_alloc_check(attr); if (err) return err; } if (attr->map_ifindex) ops = &bpf_map_offload_ops; if (!ops->map_mem_usage) return -EINVAL; if (token_flag) { token = bpf_token_get_from_fd(attr->map_token_fd); if (IS_ERR(token)) return PTR_ERR(token); /* if current token doesn't grant map creation permissions, * then we can't use this token, so ignore it and rely on * system-wide capabilities checks */ if (!bpf_token_allow_cmd(token, BPF_MAP_CREATE) || !bpf_token_allow_map_type(token, attr->map_type)) { bpf_token_put(token); token = NULL; } } err = -EPERM; /* Intent here is for unprivileged_bpf_disabled to block BPF map * creation for unprivileged users; other actions depend * on fd availability and access to bpffs, so are dependent on * object creation success. Even with unprivileged BPF disabled, * capability checks are still carried out. */ if (sysctl_unprivileged_bpf_disabled && !bpf_token_capable(token, CAP_BPF)) goto put_token; /* check privileged map type permissions */ switch (map_type) { case BPF_MAP_TYPE_ARRAY: case BPF_MAP_TYPE_PERCPU_ARRAY: case BPF_MAP_TYPE_PROG_ARRAY: case BPF_MAP_TYPE_PERF_EVENT_ARRAY: case BPF_MAP_TYPE_CGROUP_ARRAY: case BPF_MAP_TYPE_ARRAY_OF_MAPS: case BPF_MAP_TYPE_HASH: case BPF_MAP_TYPE_PERCPU_HASH: case BPF_MAP_TYPE_HASH_OF_MAPS: case BPF_MAP_TYPE_RINGBUF: case BPF_MAP_TYPE_USER_RINGBUF: case BPF_MAP_TYPE_CGROUP_STORAGE: case BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE: /* unprivileged */ break; case BPF_MAP_TYPE_SK_STORAGE: case BPF_MAP_TYPE_INODE_STORAGE: case BPF_MAP_TYPE_TASK_STORAGE: case BPF_MAP_TYPE_CGRP_STORAGE: case BPF_MAP_TYPE_BLOOM_FILTER: case BPF_MAP_TYPE_LPM_TRIE: case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY: case BPF_MAP_TYPE_STACK_TRACE: case BPF_MAP_TYPE_QUEUE: case BPF_MAP_TYPE_STACK: case BPF_MAP_TYPE_LRU_HASH: case BPF_MAP_TYPE_LRU_PERCPU_HASH: case BPF_MAP_TYPE_STRUCT_OPS: case BPF_MAP_TYPE_CPUMAP: case BPF_MAP_TYPE_ARENA: if (!bpf_token_capable(token, CAP_BPF)) goto put_token; break; case BPF_MAP_TYPE_SOCKMAP: case BPF_MAP_TYPE_SOCKHASH: case BPF_MAP_TYPE_DEVMAP: case BPF_MAP_TYPE_DEVMAP_HASH: case BPF_MAP_TYPE_XSKMAP: if (!bpf_token_capable(token, CAP_NET_ADMIN)) goto put_token; break; default: WARN(1, "unsupported map type %d", map_type); goto put_token; } map = ops->map_alloc(attr); if (IS_ERR(map)) { err = PTR_ERR(map); goto put_token; } map->ops = ops; map->map_type = map_type; err = bpf_obj_name_cpy(map->name, attr->map_name, sizeof(attr->map_name)); if (err < 0) goto free_map; atomic64_set(&map->refcnt, 1); atomic64_set(&map->usercnt, 1); mutex_init(&map->freeze_mutex); spin_lock_init(&map->owner.lock); if (attr->btf_key_type_id || attr->btf_value_type_id || /* Even the map's value is a kernel's struct, * the bpf_prog.o must have BTF to begin with * to figure out the corresponding 
kernel's * counter part. Thus, attr->btf_fd has * to be valid also. */ attr->btf_vmlinux_value_type_id) { struct btf *btf; btf = btf_get_by_fd(attr->btf_fd); if (IS_ERR(btf)) { err = PTR_ERR(btf); goto free_map; } if (btf_is_kernel(btf)) { btf_put(btf); err = -EACCES; goto free_map; } map->btf = btf; if (attr->btf_value_type_id) { err = map_check_btf(map, token, btf, attr->btf_key_type_id, attr->btf_value_type_id); if (err) goto free_map; } map->btf_key_type_id = attr->btf_key_type_id; map->btf_value_type_id = attr->btf_value_type_id; map->btf_vmlinux_value_type_id = attr->btf_vmlinux_value_type_id; } err = security_bpf_map_create(map, attr, token); if (err) goto free_map_sec; err = bpf_map_alloc_id(map); if (err) goto free_map_sec; bpf_map_save_memcg(map); bpf_token_put(token); err = bpf_map_new_fd(map, f_flags); if (err < 0) { /* failed to allocate fd. * bpf_map_put_with_uref() is needed because the above * bpf_map_alloc_id() has published the map * to the userspace and the userspace may * have refcnt-ed it through BPF_MAP_GET_FD_BY_ID. */ bpf_map_put_with_uref(map); return err; } return err; free_map_sec: security_bpf_map_free(map); free_map: bpf_map_free(map); put_token: bpf_token_put(token); return err; } void bpf_map_inc(struct bpf_map *map) { atomic64_inc(&map->refcnt); } EXPORT_SYMBOL_GPL(bpf_map_inc); void bpf_map_inc_with_uref(struct bpf_map *map) { atomic64_inc(&map->refcnt); atomic64_inc(&map->usercnt); } EXPORT_SYMBOL_GPL(bpf_map_inc_with_uref); struct bpf_map *bpf_map_get(u32 ufd) { CLASS(fd, f)(ufd); struct bpf_map *map = __bpf_map_get(f); if (!IS_ERR(map)) bpf_map_inc(map); return map; } EXPORT_SYMBOL(bpf_map_get); struct bpf_map *bpf_map_get_with_uref(u32 ufd) { CLASS(fd, f)(ufd); struct bpf_map *map = __bpf_map_get(f); if (!IS_ERR(map)) bpf_map_inc_with_uref(map); return map; } /* map_idr_lock should have been held or the map should have been * protected by rcu read lock. 
*/ struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref) { int refold; refold = atomic64_fetch_add_unless(&map->refcnt, 1, 0); if (!refold) return ERR_PTR(-ENOENT); if (uref) atomic64_inc(&map->usercnt); return map; } struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map) { spin_lock_bh(&map_idr_lock); map = __bpf_map_inc_not_zero(map, false); spin_unlock_bh(&map_idr_lock); return map; } EXPORT_SYMBOL_GPL(bpf_map_inc_not_zero); int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) { return -ENOTSUPP; } static void *__bpf_copy_key(void __user *ukey, u64 key_size) { if (key_size) return vmemdup_user(ukey, key_size); if (ukey) return ERR_PTR(-EINVAL); return NULL; } static void *___bpf_copy_key(bpfptr_t ukey, u64 key_size) { if (key_size) return kvmemdup_bpfptr(ukey, key_size); if (!bpfptr_is_null(ukey)) return ERR_PTR(-EINVAL); return NULL; } /* last field in 'union bpf_attr' used by this command */ #define BPF_MAP_LOOKUP_ELEM_LAST_FIELD flags static int map_lookup_elem(union bpf_attr *attr) { void __user *ukey = u64_to_user_ptr(attr->key); void __user *uvalue = u64_to_user_ptr(attr->value); struct bpf_map *map; void *key, *value; u32 value_size; int err; if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM)) return -EINVAL; if (attr->flags & ~BPF_F_LOCK) return -EINVAL; CLASS(fd, f)(attr->map_fd); map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) return -EPERM; if ((attr->flags & BPF_F_LOCK) && !btf_record_has_field(map->record, BPF_SPIN_LOCK)) return -EINVAL; key = __bpf_copy_key(ukey, map->key_size); if (IS_ERR(key)) return PTR_ERR(key); value_size = bpf_map_value_size(map); err = -ENOMEM; value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN); if (!value) goto free_key; if (map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) { if (copy_from_user(value, uvalue, value_size)) err = -EFAULT; else err = bpf_map_copy_value(map, key, value, attr->flags); goto free_value; } err = bpf_map_copy_value(map, key, value, attr->flags); if (err) goto free_value; err = -EFAULT; if (copy_to_user(uvalue, value, value_size) != 0) goto free_value; err = 0; free_value: kvfree(value); free_key: kvfree(key); return err; } #define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr) { bpfptr_t ukey = make_bpfptr(attr->key, uattr.is_kernel); bpfptr_t uvalue = make_bpfptr(attr->value, uattr.is_kernel); struct bpf_map *map; void *key, *value; u32 value_size; int err; if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM)) return -EINVAL; CLASS(fd, f)(attr->map_fd); map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); bpf_map_write_active_inc(map); if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { err = -EPERM; goto err_put; } if ((attr->flags & BPF_F_LOCK) && !btf_record_has_field(map->record, BPF_SPIN_LOCK)) { err = -EINVAL; goto err_put; } key = ___bpf_copy_key(ukey, map->key_size); if (IS_ERR(key)) { err = PTR_ERR(key); goto err_put; } value_size = bpf_map_value_size(map); value = kvmemdup_bpfptr(uvalue, value_size); if (IS_ERR(value)) { err = PTR_ERR(value); goto free_key; } err = bpf_map_update_value(map, fd_file(f), key, value, attr->flags); if (!err) maybe_wait_bpf_programs(map); kvfree(value); free_key: kvfree(key); err_put: bpf_map_write_active_dec(map); return err; } #define BPF_MAP_DELETE_ELEM_LAST_FIELD key static int map_delete_elem(union bpf_attr *attr, bpfptr_t uattr) { bpfptr_t ukey = make_bpfptr(attr->key, uattr.is_kernel); struct bpf_map *map; void *key; int err; if 
(CHECK_ATTR(BPF_MAP_DELETE_ELEM)) return -EINVAL; CLASS(fd, f)(attr->map_fd); map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); bpf_map_write_active_inc(map); if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { err = -EPERM; goto err_put; } key = ___bpf_copy_key(ukey, map->key_size); if (IS_ERR(key)) { err = PTR_ERR(key); goto err_put; } if (bpf_map_is_offloaded(map)) { err = bpf_map_offload_delete_elem(map, key); goto out; } else if (IS_FD_PROG_ARRAY(map) || map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { /* These maps require sleepable context */ err = map->ops->map_delete_elem(map, key); goto out; } bpf_disable_instrumentation(); rcu_read_lock(); err = map->ops->map_delete_elem(map, key); rcu_read_unlock(); bpf_enable_instrumentation(); if (!err) maybe_wait_bpf_programs(map); out: kvfree(key); err_put: bpf_map_write_active_dec(map); return err; } /* last field in 'union bpf_attr' used by this command */ #define BPF_MAP_GET_NEXT_KEY_LAST_FIELD next_key static int map_get_next_key(union bpf_attr *attr) { void __user *ukey = u64_to_user_ptr(attr->key); void __user *unext_key = u64_to_user_ptr(attr->next_key); struct bpf_map *map; void *key, *next_key; int err; if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY)) return -EINVAL; CLASS(fd, f)(attr->map_fd); map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) return -EPERM; if (ukey) { key = __bpf_copy_key(ukey, map->key_size); if (IS_ERR(key)) return PTR_ERR(key); } else { key = NULL; } err = -ENOMEM; next_key = kvmalloc(map->key_size, GFP_USER); if (!next_key) goto free_key; if (bpf_map_is_offloaded(map)) { err = bpf_map_offload_get_next_key(map, key, next_key); goto out; } rcu_read_lock(); err = map->ops->map_get_next_key(map, key, next_key); rcu_read_unlock(); out: if (err) goto free_next_key; err = -EFAULT; if (copy_to_user(unext_key, next_key, map->key_size) != 0) goto free_next_key; err = 0; free_next_key: kvfree(next_key); free_key: kvfree(key); return err; } int generic_map_delete_batch(struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr) { void __user *keys = u64_to_user_ptr(attr->batch.keys); u32 cp, max_count; int err = 0; void *key; if (attr->batch.elem_flags & ~BPF_F_LOCK) return -EINVAL; if ((attr->batch.elem_flags & BPF_F_LOCK) && !btf_record_has_field(map->record, BPF_SPIN_LOCK)) { return -EINVAL; } max_count = attr->batch.count; if (!max_count) return 0; if (put_user(0, &uattr->batch.count)) return -EFAULT; key = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN); if (!key) return -ENOMEM; for (cp = 0; cp < max_count; cp++) { err = -EFAULT; if (copy_from_user(key, keys + cp * map->key_size, map->key_size)) break; if (bpf_map_is_offloaded(map)) { err = bpf_map_offload_delete_elem(map, key); break; } bpf_disable_instrumentation(); rcu_read_lock(); err = map->ops->map_delete_elem(map, key); rcu_read_unlock(); bpf_enable_instrumentation(); if (err) break; cond_resched(); } if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp))) err = -EFAULT; kvfree(key); return err; } int generic_map_update_batch(struct bpf_map *map, struct file *map_file, const union bpf_attr *attr, union bpf_attr __user *uattr) { void __user *values = u64_to_user_ptr(attr->batch.values); void __user *keys = u64_to_user_ptr(attr->batch.keys); u32 value_size, cp, max_count; void *key, *value; int err = 0; if (attr->batch.elem_flags & ~BPF_F_LOCK) return -EINVAL; if ((attr->batch.elem_flags & BPF_F_LOCK) && !btf_record_has_field(map->record, BPF_SPIN_LOCK)) { return -EINVAL; } 
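	/*
	 * Layout expected from userspace (a sketch; map_fd stands for a file
	 * descriptor obtained earlier via BPF_MAP_CREATE): attr->batch.keys
	 * and attr->batch.values point at arrays of attr->batch.count
	 * elements, key i at keys + i * key_size and value i at
	 * values + i * value_size, where value_size is the syscall-side size
	 * returned by bpf_map_value_size() (for per-cpu maps that is
	 * round_up(value_size, 8) * num_possible_cpus()). The number of
	 * elements actually processed is reported back via
	 * uattr->batch.count. For example, for a map with __u32 keys and
	 * __u64 values:
	 *
	 *	union bpf_attr attr = {};
	 *	__u32 keys[4] = { 0, 1, 2, 3 };
	 *	__u64 vals[4] = { 10, 20, 30, 40 };
	 *
	 *	attr.batch.map_fd = map_fd;
	 *	attr.batch.keys   = (__u64)(unsigned long)keys;
	 *	attr.batch.values = (__u64)(unsigned long)vals;
	 *	attr.batch.count  = 4;
	 *	syscall(__NR_bpf, BPF_MAP_UPDATE_BATCH, &attr, sizeof(attr));
	 */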
value_size = bpf_map_value_size(map); max_count = attr->batch.count; if (!max_count) return 0; if (put_user(0, &uattr->batch.count)) return -EFAULT; key = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN); if (!key) return -ENOMEM; value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN); if (!value) { kvfree(key); return -ENOMEM; } for (cp = 0; cp < max_count; cp++) { err = -EFAULT; if (copy_from_user(key, keys + cp * map->key_size, map->key_size) || copy_from_user(value, values + cp * value_size, value_size)) break; err = bpf_map_update_value(map, map_file, key, value, attr->batch.elem_flags); if (err) break; cond_resched(); } if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp))) err = -EFAULT; kvfree(value); kvfree(key); return err; } #define MAP_LOOKUP_RETRIES 3 int generic_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr) { void __user *uobatch = u64_to_user_ptr(attr->batch.out_batch); void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch); void __user *values = u64_to_user_ptr(attr->batch.values); void __user *keys = u64_to_user_ptr(attr->batch.keys); void *buf, *buf_prevkey, *prev_key, *key, *value; int err, retry = MAP_LOOKUP_RETRIES; u32 value_size, cp, max_count; if (attr->batch.elem_flags & ~BPF_F_LOCK) return -EINVAL; if ((attr->batch.elem_flags & BPF_F_LOCK) && !btf_record_has_field(map->record, BPF_SPIN_LOCK)) return -EINVAL; value_size = bpf_map_value_size(map); max_count = attr->batch.count; if (!max_count) return 0; if (put_user(0, &uattr->batch.count)) return -EFAULT; buf_prevkey = kvmalloc(map->key_size, GFP_USER | __GFP_NOWARN); if (!buf_prevkey) return -ENOMEM; buf = kvmalloc(map->key_size + value_size, GFP_USER | __GFP_NOWARN); if (!buf) { kvfree(buf_prevkey); return -ENOMEM; } err = -EFAULT; prev_key = NULL; if (ubatch && copy_from_user(buf_prevkey, ubatch, map->key_size)) goto free_buf; key = buf; value = key + map->key_size; if (ubatch) prev_key = buf_prevkey; for (cp = 0; cp < max_count;) { rcu_read_lock(); err = map->ops->map_get_next_key(map, prev_key, key); rcu_read_unlock(); if (err) break; err = bpf_map_copy_value(map, key, value, attr->batch.elem_flags); if (err == -ENOENT) { if (retry) { retry--; continue; } err = -EINTR; break; } if (err) goto free_buf; if (copy_to_user(keys + cp * map->key_size, key, map->key_size)) { err = -EFAULT; goto free_buf; } if (copy_to_user(values + cp * value_size, value, value_size)) { err = -EFAULT; goto free_buf; } if (!prev_key) prev_key = buf_prevkey; swap(prev_key, key); retry = MAP_LOOKUP_RETRIES; cp++; cond_resched(); } if (err == -EFAULT) goto free_buf; if ((copy_to_user(&uattr->batch.count, &cp, sizeof(cp)) || (cp && copy_to_user(uobatch, prev_key, map->key_size)))) err = -EFAULT; free_buf: kvfree(buf_prevkey); kvfree(buf); return err; } #define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD flags static int map_lookup_and_delete_elem(union bpf_attr *attr) { void __user *ukey = u64_to_user_ptr(attr->key); void __user *uvalue = u64_to_user_ptr(attr->value); struct bpf_map *map; void *key, *value; u32 value_size; int err; if (CHECK_ATTR(BPF_MAP_LOOKUP_AND_DELETE_ELEM)) return -EINVAL; if (attr->flags & ~BPF_F_LOCK) return -EINVAL; CLASS(fd, f)(attr->map_fd); map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); bpf_map_write_active_inc(map); if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ) || !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { err = -EPERM; goto err_put; } if (attr->flags && (map->map_type == BPF_MAP_TYPE_QUEUE || map->map_type == BPF_MAP_TYPE_STACK)) { 
err = -EINVAL; goto err_put; } if ((attr->flags & BPF_F_LOCK) && !btf_record_has_field(map->record, BPF_SPIN_LOCK)) { err = -EINVAL; goto err_put; } key = __bpf_copy_key(ukey, map->key_size); if (IS_ERR(key)) { err = PTR_ERR(key); goto err_put; } value_size = bpf_map_value_size(map); err = -ENOMEM; value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN); if (!value) goto free_key; err = -ENOTSUPP; if (map->map_type == BPF_MAP_TYPE_QUEUE || map->map_type == BPF_MAP_TYPE_STACK) { err = map->ops->map_pop_elem(map, value); } else if (map->map_type == BPF_MAP_TYPE_HASH || map->map_type == BPF_MAP_TYPE_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_LRU_HASH || map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { if (!bpf_map_is_offloaded(map)) { bpf_disable_instrumentation(); rcu_read_lock(); err = map->ops->map_lookup_and_delete_elem(map, key, value, attr->flags); rcu_read_unlock(); bpf_enable_instrumentation(); } } if (err) goto free_value; if (copy_to_user(uvalue, value, value_size) != 0) { err = -EFAULT; goto free_value; } err = 0; free_value: kvfree(value); free_key: kvfree(key); err_put: bpf_map_write_active_dec(map); return err; } #define BPF_MAP_FREEZE_LAST_FIELD map_fd static int map_freeze(const union bpf_attr *attr) { int err = 0; struct bpf_map *map; if (CHECK_ATTR(BPF_MAP_FREEZE)) return -EINVAL; CLASS(fd, f)(attr->map_fd); map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS || !IS_ERR_OR_NULL(map->record)) return -ENOTSUPP; if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) return -EPERM; mutex_lock(&map->freeze_mutex); if (bpf_map_write_active(map)) { err = -EBUSY; goto err_put; } if (READ_ONCE(map->frozen)) { err = -EBUSY; goto err_put; } WRITE_ONCE(map->frozen, true); err_put: mutex_unlock(&map->freeze_mutex); return err; } static const struct bpf_prog_ops * const bpf_prog_types[] = { #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \ [_id] = & _name ## _prog_ops, #define BPF_MAP_TYPE(_id, _ops) #define BPF_LINK_TYPE(_id, _name) #include <linux/bpf_types.h> #undef BPF_PROG_TYPE #undef BPF_MAP_TYPE #undef BPF_LINK_TYPE }; static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) { const struct bpf_prog_ops *ops; if (type >= ARRAY_SIZE(bpf_prog_types)) return -EINVAL; type = array_index_nospec(type, ARRAY_SIZE(bpf_prog_types)); ops = bpf_prog_types[type]; if (!ops) return -EINVAL; if (!bpf_prog_is_offloaded(prog->aux)) prog->aux->ops = ops; else prog->aux->ops = &bpf_offload_prog_ops; prog->type = type; return 0; } enum bpf_audit { BPF_AUDIT_LOAD, BPF_AUDIT_UNLOAD, BPF_AUDIT_MAX, }; static const char * const bpf_audit_str[BPF_AUDIT_MAX] = { [BPF_AUDIT_LOAD] = "LOAD", [BPF_AUDIT_UNLOAD] = "UNLOAD", }; static void bpf_audit_prog(const struct bpf_prog *prog, unsigned int op) { struct audit_context *ctx = NULL; struct audit_buffer *ab; if (WARN_ON_ONCE(op >= BPF_AUDIT_MAX)) return; if (audit_enabled == AUDIT_OFF) return; if (!in_irq() && !irqs_disabled()) ctx = audit_context(); ab = audit_log_start(ctx, GFP_ATOMIC, AUDIT_BPF); if (unlikely(!ab)) return; audit_log_format(ab, "prog-id=%u op=%s", prog->aux->id, bpf_audit_str[op]); audit_log_end(ab); } static int bpf_prog_alloc_id(struct bpf_prog *prog) { int id; idr_preload(GFP_KERNEL); spin_lock_bh(&prog_idr_lock); id = idr_alloc_cyclic(&prog_idr, prog, 1, INT_MAX, GFP_ATOMIC); if (id > 0) prog->aux->id = id; spin_unlock_bh(&prog_idr_lock); idr_preload_end(); /* id is in [1, INT_MAX) */ if (WARN_ON_ONCE(!id)) return -ENOSPC; return id > 0 ? 
0 : id; } void bpf_prog_free_id(struct bpf_prog *prog) { unsigned long flags; /* cBPF to eBPF migrations are currently not in the idr store. * Offloaded programs are removed from the store when their device * disappears - even if someone grabs an fd to them they are unusable, * simply waiting for refcnt to drop to be freed. */ if (!prog->aux->id) return; spin_lock_irqsave(&prog_idr_lock, flags); idr_remove(&prog_idr, prog->aux->id); prog->aux->id = 0; spin_unlock_irqrestore(&prog_idr_lock, flags); } static void __bpf_prog_put_rcu(struct rcu_head *rcu) { struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu); kvfree(aux->func_info); kfree(aux->func_info_aux); free_uid(aux->user); security_bpf_prog_free(aux->prog); bpf_prog_free(aux->prog); } static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred) { bpf_prog_kallsyms_del_all(prog); btf_put(prog->aux->btf); module_put(prog->aux->mod); kvfree(prog->aux->jited_linfo); kvfree(prog->aux->linfo); kfree(prog->aux->kfunc_tab); if (prog->aux->attach_btf) btf_put(prog->aux->attach_btf); if (deferred) { if (prog->sleepable) call_rcu_tasks_trace(&prog->aux->rcu, __bpf_prog_put_rcu); else call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); } else { __bpf_prog_put_rcu(&prog->aux->rcu); } } static void bpf_prog_put_deferred(struct work_struct *work) { struct bpf_prog_aux *aux; struct bpf_prog *prog; aux = container_of(work, struct bpf_prog_aux, work); prog = aux->prog; perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0); bpf_audit_prog(prog, BPF_AUDIT_UNLOAD); bpf_prog_free_id(prog); __bpf_prog_put_noref(prog, true); } static void __bpf_prog_put(struct bpf_prog *prog) { struct bpf_prog_aux *aux = prog->aux; if (atomic64_dec_and_test(&aux->refcnt)) { if (in_irq() || irqs_disabled()) { INIT_WORK(&aux->work, bpf_prog_put_deferred); schedule_work(&aux->work); } else { bpf_prog_put_deferred(&aux->work); } } } void bpf_prog_put(struct bpf_prog *prog) { __bpf_prog_put(prog); } EXPORT_SYMBOL_GPL(bpf_prog_put); static int bpf_prog_release(struct inode *inode, struct file *filp) { struct bpf_prog *prog = filp->private_data; bpf_prog_put(prog); return 0; } struct bpf_prog_kstats { u64 nsecs; u64 cnt; u64 misses; }; void notrace bpf_prog_inc_misses_counter(struct bpf_prog *prog) { struct bpf_prog_stats *stats; unsigned int flags; stats = this_cpu_ptr(prog->stats); flags = u64_stats_update_begin_irqsave(&stats->syncp); u64_stats_inc(&stats->misses); u64_stats_update_end_irqrestore(&stats->syncp, flags); } static void bpf_prog_get_stats(const struct bpf_prog *prog, struct bpf_prog_kstats *stats) { u64 nsecs = 0, cnt = 0, misses = 0; int cpu; for_each_possible_cpu(cpu) { const struct bpf_prog_stats *st; unsigned int start; u64 tnsecs, tcnt, tmisses; st = per_cpu_ptr(prog->stats, cpu); do { start = u64_stats_fetch_begin(&st->syncp); tnsecs = u64_stats_read(&st->nsecs); tcnt = u64_stats_read(&st->cnt); tmisses = u64_stats_read(&st->misses); } while (u64_stats_fetch_retry(&st->syncp, start)); nsecs += tnsecs; cnt += tcnt; misses += tmisses; } stats->nsecs = nsecs; stats->cnt = cnt; stats->misses = misses; } #ifdef CONFIG_PROC_FS static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp) { const struct bpf_prog *prog = filp->private_data; char prog_tag[sizeof(prog->tag) * 2 + 1] = { }; struct bpf_prog_kstats stats; bpf_prog_get_stats(prog, &stats); bin2hex(prog_tag, prog->tag, sizeof(prog->tag)); seq_printf(m, "prog_type:\t%u\n" "prog_jited:\t%u\n" "prog_tag:\t%s\n" "memlock:\t%llu\n" "prog_id:\t%u\n" "run_time_ns:\t%llu\n" 
"run_cnt:\t%llu\n" "recursion_misses:\t%llu\n" "verified_insns:\t%u\n", prog->type, prog->jited, prog_tag, prog->pages * 1ULL << PAGE_SHIFT, prog->aux->id, stats.nsecs, stats.cnt, stats.misses, prog->aux->verified_insns); } #endif const struct file_operations bpf_prog_fops = { #ifdef CONFIG_PROC_FS .show_fdinfo = bpf_prog_show_fdinfo, #endif .release = bpf_prog_release, .read = bpf_dummy_read, .write = bpf_dummy_write, }; int bpf_prog_new_fd(struct bpf_prog *prog) { int ret; ret = security_bpf_prog(prog); if (ret < 0) return ret; return anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC); } void bpf_prog_add(struct bpf_prog *prog, int i) { atomic64_add(i, &prog->aux->refcnt); } EXPORT_SYMBOL_GPL(bpf_prog_add); void bpf_prog_sub(struct bpf_prog *prog, int i) { /* Only to be used for undoing previous bpf_prog_add() in some * error path. We still know that another entity in our call * path holds a reference to the program, thus atomic_sub() can * be safely used in such cases! */ WARN_ON(atomic64_sub_return(i, &prog->aux->refcnt) == 0); } EXPORT_SYMBOL_GPL(bpf_prog_sub); void bpf_prog_inc(struct bpf_prog *prog) { atomic64_inc(&prog->aux->refcnt); } EXPORT_SYMBOL_GPL(bpf_prog_inc); /* prog_idr_lock should have been held */ struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog) { int refold; refold = atomic64_fetch_add_unless(&prog->aux->refcnt, 1, 0); if (!refold) return ERR_PTR(-ENOENT); return prog; } EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero); bool bpf_prog_get_ok(struct bpf_prog *prog, enum bpf_prog_type *attach_type, bool attach_drv) { /* not an attachment, just a refcount inc, always allow */ if (!attach_type) return true; if (prog->type != *attach_type) return false; if (bpf_prog_is_offloaded(prog->aux) && !attach_drv) return false; return true; } static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type, bool attach_drv) { CLASS(fd, f)(ufd); struct bpf_prog *prog; if (fd_empty(f)) return ERR_PTR(-EBADF); if (fd_file(f)->f_op != &bpf_prog_fops) return ERR_PTR(-EINVAL); prog = fd_file(f)->private_data; if (!bpf_prog_get_ok(prog, attach_type, attach_drv)) return ERR_PTR(-EINVAL); bpf_prog_inc(prog); return prog; } struct bpf_prog *bpf_prog_get(u32 ufd) { return __bpf_prog_get(ufd, NULL, false); } struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, bool attach_drv) { return __bpf_prog_get(ufd, &type, attach_drv); } EXPORT_SYMBOL_GPL(bpf_prog_get_type_dev); /* Initially all BPF programs could be loaded w/o specifying * expected_attach_type. Later for some of them specifying expected_attach_type * at load time became required so that program could be validated properly. * Programs of types that are allowed to be loaded both w/ and w/o (for * backward compatibility) expected_attach_type, should have the default attach * type assigned to expected_attach_type for the latter case, so that it can be * validated later at attach time. * * bpf_prog_load_fixup_attach_type() sets expected_attach_type in @attr if * prog type requires it but has some attach types that have to be backward * compatible. */ static void bpf_prog_load_fixup_attach_type(union bpf_attr *attr) { switch (attr->prog_type) { case BPF_PROG_TYPE_CGROUP_SOCK: /* Unfortunately BPF_ATTACH_TYPE_UNSPEC enumeration doesn't * exist so checking for non-zero is the way to go here. 
*/ if (!attr->expected_attach_type) attr->expected_attach_type = BPF_CGROUP_INET_SOCK_CREATE; break; case BPF_PROG_TYPE_SK_REUSEPORT: if (!attr->expected_attach_type) attr->expected_attach_type = BPF_SK_REUSEPORT_SELECT; break; } } static int bpf_prog_load_check_attach(enum bpf_prog_type prog_type, enum bpf_attach_type expected_attach_type, struct btf *attach_btf, u32 btf_id, struct bpf_prog *dst_prog) { if (btf_id) { if (btf_id > BTF_MAX_TYPE) return -EINVAL; if (!attach_btf && !dst_prog) return -EINVAL; switch (prog_type) { case BPF_PROG_TYPE_TRACING: case BPF_PROG_TYPE_LSM: case BPF_PROG_TYPE_STRUCT_OPS: case BPF_PROG_TYPE_EXT: break; default: return -EINVAL; } } if (attach_btf && (!btf_id || dst_prog)) return -EINVAL; if (dst_prog && prog_type != BPF_PROG_TYPE_TRACING && prog_type != BPF_PROG_TYPE_EXT) return -EINVAL; switch (prog_type) { case BPF_PROG_TYPE_CGROUP_SOCK: switch (expected_attach_type) { case BPF_CGROUP_INET_SOCK_CREATE: case BPF_CGROUP_INET_SOCK_RELEASE: case BPF_CGROUP_INET4_POST_BIND: case BPF_CGROUP_INET6_POST_BIND: return 0; default: return -EINVAL; } case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: switch (expected_attach_type) { case BPF_CGROUP_INET4_BIND: case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: case BPF_CGROUP_UNIX_CONNECT: case BPF_CGROUP_INET4_GETPEERNAME: case BPF_CGROUP_INET6_GETPEERNAME: case BPF_CGROUP_UNIX_GETPEERNAME: case BPF_CGROUP_INET4_GETSOCKNAME: case BPF_CGROUP_INET6_GETSOCKNAME: case BPF_CGROUP_UNIX_GETSOCKNAME: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_UNIX_SENDMSG: case BPF_CGROUP_UDP4_RECVMSG: case BPF_CGROUP_UDP6_RECVMSG: case BPF_CGROUP_UNIX_RECVMSG: return 0; default: return -EINVAL; } case BPF_PROG_TYPE_CGROUP_SKB: switch (expected_attach_type) { case BPF_CGROUP_INET_INGRESS: case BPF_CGROUP_INET_EGRESS: return 0; default: return -EINVAL; } case BPF_PROG_TYPE_CGROUP_SOCKOPT: switch (expected_attach_type) { case BPF_CGROUP_SETSOCKOPT: case BPF_CGROUP_GETSOCKOPT: return 0; default: return -EINVAL; } case BPF_PROG_TYPE_SK_LOOKUP: if (expected_attach_type == BPF_SK_LOOKUP) return 0; return -EINVAL; case BPF_PROG_TYPE_SK_REUSEPORT: switch (expected_attach_type) { case BPF_SK_REUSEPORT_SELECT: case BPF_SK_REUSEPORT_SELECT_OR_MIGRATE: return 0; default: return -EINVAL; } case BPF_PROG_TYPE_NETFILTER: if (expected_attach_type == BPF_NETFILTER) return 0; return -EINVAL; case BPF_PROG_TYPE_SYSCALL: case BPF_PROG_TYPE_EXT: if (expected_attach_type) return -EINVAL; fallthrough; default: return 0; } } static bool is_net_admin_prog_type(enum bpf_prog_type prog_type) { switch (prog_type) { case BPF_PROG_TYPE_SCHED_CLS: case BPF_PROG_TYPE_SCHED_ACT: case BPF_PROG_TYPE_XDP: case BPF_PROG_TYPE_LWT_IN: case BPF_PROG_TYPE_LWT_OUT: case BPF_PROG_TYPE_LWT_XMIT: case BPF_PROG_TYPE_LWT_SEG6LOCAL: case BPF_PROG_TYPE_SK_SKB: case BPF_PROG_TYPE_SK_MSG: case BPF_PROG_TYPE_FLOW_DISSECTOR: case BPF_PROG_TYPE_CGROUP_DEVICE: case BPF_PROG_TYPE_CGROUP_SOCK: case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: case BPF_PROG_TYPE_CGROUP_SOCKOPT: case BPF_PROG_TYPE_CGROUP_SYSCTL: case BPF_PROG_TYPE_SOCK_OPS: case BPF_PROG_TYPE_EXT: /* extends any prog */ case BPF_PROG_TYPE_NETFILTER: return true; case BPF_PROG_TYPE_CGROUP_SKB: /* always unpriv */ case BPF_PROG_TYPE_SK_REUSEPORT: /* equivalent to SOCKET_FILTER. 
need CAP_BPF only */ default: return false; } } static bool is_perfmon_prog_type(enum bpf_prog_type prog_type) { switch (prog_type) { case BPF_PROG_TYPE_KPROBE: case BPF_PROG_TYPE_TRACEPOINT: case BPF_PROG_TYPE_PERF_EVENT: case BPF_PROG_TYPE_RAW_TRACEPOINT: case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE: case BPF_PROG_TYPE_TRACING: case BPF_PROG_TYPE_LSM: case BPF_PROG_TYPE_STRUCT_OPS: /* has access to struct sock */ case BPF_PROG_TYPE_EXT: /* extends any prog */ return true; default: return false; } } /* last field in 'union bpf_attr' used by this command */ #define BPF_PROG_LOAD_LAST_FIELD prog_token_fd static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) { enum bpf_prog_type type = attr->prog_type; struct bpf_prog *prog, *dst_prog = NULL; struct btf *attach_btf = NULL; struct bpf_token *token = NULL; bool bpf_cap; int err; char license[128]; if (CHECK_ATTR(BPF_PROG_LOAD)) return -EINVAL; if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT | BPF_F_ANY_ALIGNMENT | BPF_F_TEST_STATE_FREQ | BPF_F_SLEEPABLE | BPF_F_TEST_RND_HI32 | BPF_F_XDP_HAS_FRAGS | BPF_F_XDP_DEV_BOUND_ONLY | BPF_F_TEST_REG_INVARIANTS | BPF_F_TOKEN_FD)) return -EINVAL; bpf_prog_load_fixup_attach_type(attr); if (attr->prog_flags & BPF_F_TOKEN_FD) { token = bpf_token_get_from_fd(attr->prog_token_fd); if (IS_ERR(token)) return PTR_ERR(token); /* if current token doesn't grant prog loading permissions, * then we can't use this token, so ignore it and rely on * system-wide capabilities checks */ if (!bpf_token_allow_cmd(token, BPF_PROG_LOAD) || !bpf_token_allow_prog_type(token, attr->prog_type, attr->expected_attach_type)) { bpf_token_put(token); token = NULL; } } bpf_cap = bpf_token_capable(token, CAP_BPF); err = -EPERM; if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && (attr->prog_flags & BPF_F_ANY_ALIGNMENT) && !bpf_cap) goto put_token; /* Intent here is for unprivileged_bpf_disabled to block BPF program * creation for unprivileged users; other actions depend * on fd availability and access to bpffs, so are dependent on * object creation success. Even with unprivileged BPF disabled, * capability checks are still carried out for these * and other operations. */ if (sysctl_unprivileged_bpf_disabled && !bpf_cap) goto put_token; if (attr->insn_cnt == 0 || attr->insn_cnt > (bpf_cap ? 
BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS)) { err = -E2BIG; goto put_token; } if (type != BPF_PROG_TYPE_SOCKET_FILTER && type != BPF_PROG_TYPE_CGROUP_SKB && !bpf_cap) goto put_token; if (is_net_admin_prog_type(type) && !bpf_token_capable(token, CAP_NET_ADMIN)) goto put_token; if (is_perfmon_prog_type(type) && !bpf_token_capable(token, CAP_PERFMON)) goto put_token; /* attach_prog_fd/attach_btf_obj_fd can specify fd of either bpf_prog * or btf, we need to check which one it is */ if (attr->attach_prog_fd) { dst_prog = bpf_prog_get(attr->attach_prog_fd); if (IS_ERR(dst_prog)) { dst_prog = NULL; attach_btf = btf_get_by_fd(attr->attach_btf_obj_fd); if (IS_ERR(attach_btf)) { err = -EINVAL; goto put_token; } if (!btf_is_kernel(attach_btf)) { /* attaching through specifying bpf_prog's BTF * objects directly might be supported eventually */ btf_put(attach_btf); err = -ENOTSUPP; goto put_token; } } } else if (attr->attach_btf_id) { /* fall back to vmlinux BTF, if BTF type ID is specified */ attach_btf = bpf_get_btf_vmlinux(); if (IS_ERR(attach_btf)) { err = PTR_ERR(attach_btf); goto put_token; } if (!attach_btf) { err = -EINVAL; goto put_token; } btf_get(attach_btf); } if (bpf_prog_load_check_attach(type, attr->expected_attach_type, attach_btf, attr->attach_btf_id, dst_prog)) { if (dst_prog) bpf_prog_put(dst_prog); if (attach_btf) btf_put(attach_btf); err = -EINVAL; goto put_token; } /* plain bpf_prog allocation */ prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER); if (!prog) { if (dst_prog) bpf_prog_put(dst_prog); if (attach_btf) btf_put(attach_btf); err = -EINVAL; goto put_token; } prog->expected_attach_type = attr->expected_attach_type; prog->sleepable = !!(attr->prog_flags & BPF_F_SLEEPABLE); prog->aux->attach_btf = attach_btf; prog->aux->attach_btf_id = attr->attach_btf_id; prog->aux->dst_prog = dst_prog; prog->aux->dev_bound = !!attr->prog_ifindex; prog->aux->xdp_has_frags = attr->prog_flags & BPF_F_XDP_HAS_FRAGS; /* move token into prog->aux, reuse taken refcnt */ prog->aux->token = token; token = NULL; prog->aux->user = get_current_user(); prog->len = attr->insn_cnt; err = -EFAULT; if (copy_from_bpfptr(prog->insns, make_bpfptr(attr->insns, uattr.is_kernel), bpf_prog_insn_size(prog)) != 0) goto free_prog; /* copy eBPF program license from user space */ if (strncpy_from_bpfptr(license, make_bpfptr(attr->license, uattr.is_kernel), sizeof(license) - 1) < 0) goto free_prog; license[sizeof(license) - 1] = 0; /* eBPF programs must be GPL compatible to use GPL-ed functions */ prog->gpl_compatible = license_is_gpl_compatible(license) ? 1 : 0; prog->orig_prog = NULL; prog->jited = 0; atomic64_set(&prog->aux->refcnt, 1); if (bpf_prog_is_dev_bound(prog->aux)) { err = bpf_prog_dev_bound_init(prog, attr); if (err) goto free_prog; } if (type == BPF_PROG_TYPE_EXT && dst_prog && bpf_prog_is_dev_bound(dst_prog->aux)) { err = bpf_prog_dev_bound_inherit(prog, dst_prog); if (err) goto free_prog; } /* * Bookkeeping for managing the program attachment chain. * * It might be tempting to set attach_tracing_prog flag at the attachment * time, but this will not prevent from loading bunch of tracing prog * first, then attach them one to another. * * The flag attach_tracing_prog is set for the whole program lifecycle, and * doesn't have to be cleared in bpf_tracing_link_release, since tracing * programs cannot change attachment target. 
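 *
 * Hence the flag is only set here, at load time, when a tracing program
 * names another tracing program (dst_prog) as its attach target.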
*/ if (type == BPF_PROG_TYPE_TRACING && dst_prog && dst_prog->type == BPF_PROG_TYPE_TRACING) { prog->aux->attach_tracing_prog = true; } /* find program type: socket_filter vs tracing_filter */ err = find_prog_type(type, prog); if (err < 0) goto free_prog; prog->aux->load_time = ktime_get_boottime_ns(); err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name, sizeof(attr->prog_name)); if (err < 0) goto free_prog; err = security_bpf_prog_load(prog, attr, token); if (err) goto free_prog_sec; /* run eBPF verifier */ err = bpf_check(&prog, attr, uattr, uattr_size); if (err < 0) goto free_used_maps; prog = bpf_prog_select_runtime(prog, &err); if (err < 0) goto free_used_maps; err = bpf_prog_alloc_id(prog); if (err) goto free_used_maps; /* Upon success of bpf_prog_alloc_id(), the BPF prog is * effectively publicly exposed. However, retrieving via * bpf_prog_get_fd_by_id() will take another reference, * therefore it cannot be gone underneath us. * * Only for the time /after/ successful bpf_prog_new_fd() * and before returning to userspace, we might just hold * one reference and any parallel close on that fd could * rip everything out. Hence, below notifications must * happen before bpf_prog_new_fd(). * * Also, any failure handling from this point onwards must * be using bpf_prog_put() given the program is exposed. */ bpf_prog_kallsyms_add(prog); perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_LOAD, 0); bpf_audit_prog(prog, BPF_AUDIT_LOAD); err = bpf_prog_new_fd(prog); if (err < 0) bpf_prog_put(prog); return err; free_used_maps: /* In case we have subprogs, we need to wait for a grace * period before we can tear down JIT memory since symbols * are already exposed under kallsyms. */ __bpf_prog_put_noref(prog, prog->aux->real_func_cnt); return err; free_prog_sec: security_bpf_prog_free(prog); free_prog: free_uid(prog->aux->user); if (prog->aux->attach_btf) btf_put(prog->aux->attach_btf); bpf_prog_free(prog); put_token: bpf_token_put(token); return err; } #define BPF_OBJ_LAST_FIELD path_fd static int bpf_obj_pin(const union bpf_attr *attr) { int path_fd; if (CHECK_ATTR(BPF_OBJ) || attr->file_flags & ~BPF_F_PATH_FD) return -EINVAL; /* path_fd has to be accompanied by BPF_F_PATH_FD flag */ if (!(attr->file_flags & BPF_F_PATH_FD) && attr->path_fd) return -EINVAL; path_fd = attr->file_flags & BPF_F_PATH_FD ? attr->path_fd : AT_FDCWD; return bpf_obj_pin_user(attr->bpf_fd, path_fd, u64_to_user_ptr(attr->pathname)); } static int bpf_obj_get(const union bpf_attr *attr) { int path_fd; if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0 || attr->file_flags & ~(BPF_OBJ_FLAG_MASK | BPF_F_PATH_FD)) return -EINVAL; /* path_fd has to be accompanied by BPF_F_PATH_FD flag */ if (!(attr->file_flags & BPF_F_PATH_FD) && attr->path_fd) return -EINVAL; path_fd = attr->file_flags & BPF_F_PATH_FD ? attr->path_fd : AT_FDCWD; return bpf_obj_get_user(path_fd, u64_to_user_ptr(attr->pathname), attr->file_flags); } /* bpf_link_init_sleepable() allows to specify whether BPF link itself has * "sleepable" semantics, which normally would mean that BPF link's attach * hook can dereference link or link's underlying program for some time after * detachment due to RCU Tasks Trace-based lifetime protection scheme. * BPF program itself can be non-sleepable, yet, because it's transitively * reachable through BPF link, its freeing has to be delayed until after RCU * Tasks Trace GP. 
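 *
 * For example, bpf_raw_tp_link_attach() below marks the link sleepable
 * whenever the underlying tracepoint may fault:
 *
 *	bpf_link_init_sleepable(&link->link, BPF_LINK_TYPE_RAW_TRACEPOINT,
 *				&bpf_raw_tp_link_lops, prog,
 *				tracepoint_is_faultable(btp->tp));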
*/ void bpf_link_init_sleepable(struct bpf_link *link, enum bpf_link_type type, const struct bpf_link_ops *ops, struct bpf_prog *prog, bool sleepable) { WARN_ON(ops->dealloc && ops->dealloc_deferred); atomic64_set(&link->refcnt, 1); link->type = type; link->sleepable = sleepable; link->id = 0; link->ops = ops; link->prog = prog; } void bpf_link_init(struct bpf_link *link, enum bpf_link_type type, const struct bpf_link_ops *ops, struct bpf_prog *prog) { bpf_link_init_sleepable(link, type, ops, prog, false); } static void bpf_link_free_id(int id) { if (!id) return; spin_lock_bh(&link_idr_lock); idr_remove(&link_idr, id); spin_unlock_bh(&link_idr_lock); } /* Clean up bpf_link and corresponding anon_inode file and FD. After * anon_inode is created, bpf_link can't be just kfree()'d due to deferred * anon_inode's release() call. This helper marks bpf_link as * defunct, releases anon_inode file and puts reserved FD. bpf_prog's refcnt * is not decremented, it's the responsibility of a calling code that failed * to complete bpf_link initialization. * This helper eventually calls link's dealloc callback, but does not call * link's release callback. */ void bpf_link_cleanup(struct bpf_link_primer *primer) { primer->link->prog = NULL; bpf_link_free_id(primer->id); fput(primer->file); put_unused_fd(primer->fd); } void bpf_link_inc(struct bpf_link *link) { atomic64_inc(&link->refcnt); } static void bpf_link_dealloc(struct bpf_link *link) { /* now that we know that bpf_link itself can't be reached, put underlying BPF program */ if (link->prog) bpf_prog_put(link->prog); /* free bpf_link and its containing memory */ if (link->ops->dealloc_deferred) link->ops->dealloc_deferred(link); else link->ops->dealloc(link); } static void bpf_link_defer_dealloc_rcu_gp(struct rcu_head *rcu) { struct bpf_link *link = container_of(rcu, struct bpf_link, rcu); bpf_link_dealloc(link); } static void bpf_link_defer_dealloc_mult_rcu_gp(struct rcu_head *rcu) { if (rcu_trace_implies_rcu_gp()) bpf_link_defer_dealloc_rcu_gp(rcu); else call_rcu(rcu, bpf_link_defer_dealloc_rcu_gp); } /* bpf_link_free is guaranteed to be called from process context */ static void bpf_link_free(struct bpf_link *link) { const struct bpf_link_ops *ops = link->ops; bpf_link_free_id(link->id); /* detach BPF program, clean up used resources */ if (link->prog) ops->release(link); if (ops->dealloc_deferred) { /* Schedule BPF link deallocation, which will only then * trigger putting BPF program refcount. * If underlying BPF program is sleepable or BPF link's target * attach hookpoint is sleepable or otherwise requires RCU GPs * to ensure link and its underlying BPF program is not * reachable anymore, we need to first wait for RCU tasks * trace sync, and then go through "classic" RCU grace period */ if (link->sleepable || (link->prog && link->prog->sleepable)) call_rcu_tasks_trace(&link->rcu, bpf_link_defer_dealloc_mult_rcu_gp); else call_rcu(&link->rcu, bpf_link_defer_dealloc_rcu_gp); } else if (ops->dealloc) { bpf_link_dealloc(link); } } static void bpf_link_put_deferred(struct work_struct *work) { struct bpf_link *link = container_of(work, struct bpf_link, work); bpf_link_free(link); } /* bpf_link_put might be called from atomic context. It needs to be called * from sleepable context in order to acquire sleeping locks during the process. 
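 *
 * That is why bpf_link_put() below always defers the actual freeing to a
 * workqueue, while bpf_link_put_direct() is reserved for callers known to
 * run in process context (e.g. the file release path).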
*/ void bpf_link_put(struct bpf_link *link) { if (!atomic64_dec_and_test(&link->refcnt)) return; INIT_WORK(&link->work, bpf_link_put_deferred); schedule_work(&link->work); } EXPORT_SYMBOL(bpf_link_put); static void bpf_link_put_direct(struct bpf_link *link) { if (!atomic64_dec_and_test(&link->refcnt)) return; bpf_link_free(link); } static int bpf_link_release(struct inode *inode, struct file *filp) { struct bpf_link *link = filp->private_data; bpf_link_put_direct(link); return 0; } #ifdef CONFIG_PROC_FS #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) #define BPF_MAP_TYPE(_id, _ops) #define BPF_LINK_TYPE(_id, _name) [_id] = #_name, static const char *bpf_link_type_strs[] = { [BPF_LINK_TYPE_UNSPEC] = "<invalid>", #include <linux/bpf_types.h> }; #undef BPF_PROG_TYPE #undef BPF_MAP_TYPE #undef BPF_LINK_TYPE static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp) { const struct bpf_link *link = filp->private_data; const struct bpf_prog *prog = link->prog; enum bpf_link_type type = link->type; char prog_tag[sizeof(prog->tag) * 2 + 1] = { }; if (type < ARRAY_SIZE(bpf_link_type_strs) && bpf_link_type_strs[type]) { seq_printf(m, "link_type:\t%s\n", bpf_link_type_strs[type]); } else { WARN_ONCE(1, "missing BPF_LINK_TYPE(...) for link type %u\n", type); seq_printf(m, "link_type:\t<%u>\n", type); } seq_printf(m, "link_id:\t%u\n", link->id); if (prog) { bin2hex(prog_tag, prog->tag, sizeof(prog->tag)); seq_printf(m, "prog_tag:\t%s\n" "prog_id:\t%u\n", prog_tag, prog->aux->id); } if (link->ops->show_fdinfo) link->ops->show_fdinfo(link, m); } #endif static __poll_t bpf_link_poll(struct file *file, struct poll_table_struct *pts) { struct bpf_link *link = file->private_data; return link->ops->poll(file, pts); } static const struct file_operations bpf_link_fops = { #ifdef CONFIG_PROC_FS .show_fdinfo = bpf_link_show_fdinfo, #endif .release = bpf_link_release, .read = bpf_dummy_read, .write = bpf_dummy_write, }; static const struct file_operations bpf_link_fops_poll = { #ifdef CONFIG_PROC_FS .show_fdinfo = bpf_link_show_fdinfo, #endif .release = bpf_link_release, .read = bpf_dummy_read, .write = bpf_dummy_write, .poll = bpf_link_poll, }; static int bpf_link_alloc_id(struct bpf_link *link) { int id; idr_preload(GFP_KERNEL); spin_lock_bh(&link_idr_lock); id = idr_alloc_cyclic(&link_idr, link, 1, INT_MAX, GFP_ATOMIC); spin_unlock_bh(&link_idr_lock); idr_preload_end(); return id; } /* Prepare bpf_link to be exposed to user-space by allocating anon_inode file, * reserving unused FD and allocating ID from link_idr. This is to be paired * with bpf_link_settle() to install FD and ID and expose bpf_link to * user-space, if bpf_link is successfully attached. If not, bpf_link and * pre-allocated resources are to be freed with bpf_cleanup() call. All the * transient state is passed around in struct bpf_link_primer. * This is preferred way to create and initialize bpf_link, especially when * there are complicated and expensive operations in between creating bpf_link * itself and attaching it to BPF hook. By using bpf_link_prime() and * bpf_link_settle() kernel code using bpf_link doesn't have to perform * expensive (and potentially failing) roll back operations in a rare case * that file, FD, or ID can't be allocated. 
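 *
 * A minimal caller looks roughly like this (hypothetical sketch; "my_link"
 * is a placeholder struct embedding a struct bpf_link as "link", and
 * "my_link_lops"/"my_attach_to_hook" are placeholder names; the pattern
 * mirrors e.g. bpf_raw_tp_link_attach() below):
 *
 *	struct bpf_link_primer primer;
 *	struct my_link *link;
 *	int err;
 *
 *	link = kzalloc(sizeof(*link), GFP_USER);
 *	if (!link)
 *		return -ENOMEM;
 *	bpf_link_init(&link->link, BPF_LINK_TYPE_RAW_TRACEPOINT,
 *		      &my_link_lops, prog);
 *	err = bpf_link_prime(&link->link, &primer);
 *	if (err) {
 *		kfree(link);
 *		return err;
 *	}
 *	err = my_attach_to_hook(link);
 *	if (err) {
 *		bpf_link_cleanup(&primer);
 *		return err;
 *	}
 *	return bpf_link_settle(&primer);
 *
 * Note that on failure the caller still owns its program reference, as
 * described for bpf_link_cleanup() above.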
*/ int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer) { struct file *file; int fd, id; fd = get_unused_fd_flags(O_CLOEXEC); if (fd < 0) return fd; id = bpf_link_alloc_id(link); if (id < 0) { put_unused_fd(fd); return id; } file = anon_inode_getfile("bpf_link", link->ops->poll ? &bpf_link_fops_poll : &bpf_link_fops, link, O_CLOEXEC); if (IS_ERR(file)) { bpf_link_free_id(id); put_unused_fd(fd); return PTR_ERR(file); } primer->link = link; primer->file = file; primer->fd = fd; primer->id = id; return 0; } int bpf_link_settle(struct bpf_link_primer *primer) { /* make bpf_link fetchable by ID */ spin_lock_bh(&link_idr_lock); primer->link->id = primer->id; spin_unlock_bh(&link_idr_lock); /* make bpf_link fetchable by FD */ fd_install(primer->fd, primer->file); /* pass through installed FD */ return primer->fd; } int bpf_link_new_fd(struct bpf_link *link) { return anon_inode_getfd("bpf-link", link->ops->poll ? &bpf_link_fops_poll : &bpf_link_fops, link, O_CLOEXEC); } struct bpf_link *bpf_link_get_from_fd(u32 ufd) { CLASS(fd, f)(ufd); struct bpf_link *link; if (fd_empty(f)) return ERR_PTR(-EBADF); if (fd_file(f)->f_op != &bpf_link_fops && fd_file(f)->f_op != &bpf_link_fops_poll) return ERR_PTR(-EINVAL); link = fd_file(f)->private_data; bpf_link_inc(link); return link; } EXPORT_SYMBOL(bpf_link_get_from_fd); static void bpf_tracing_link_release(struct bpf_link *link) { struct bpf_tracing_link *tr_link = container_of(link, struct bpf_tracing_link, link.link); WARN_ON_ONCE(bpf_trampoline_unlink_prog(&tr_link->link, tr_link->trampoline, tr_link->tgt_prog)); bpf_trampoline_put(tr_link->trampoline); /* tgt_prog is NULL if target is a kernel function */ if (tr_link->tgt_prog) bpf_prog_put(tr_link->tgt_prog); } static void bpf_tracing_link_dealloc(struct bpf_link *link) { struct bpf_tracing_link *tr_link = container_of(link, struct bpf_tracing_link, link.link); kfree(tr_link); } static void bpf_tracing_link_show_fdinfo(const struct bpf_link *link, struct seq_file *seq) { struct bpf_tracing_link *tr_link = container_of(link, struct bpf_tracing_link, link.link); u32 target_btf_id, target_obj_id; bpf_trampoline_unpack_key(tr_link->trampoline->key, &target_obj_id, &target_btf_id); seq_printf(seq, "attach_type:\t%d\n" "target_obj_id:\t%u\n" "target_btf_id:\t%u\n", tr_link->attach_type, target_obj_id, target_btf_id); } static int bpf_tracing_link_fill_link_info(const struct bpf_link *link, struct bpf_link_info *info) { struct bpf_tracing_link *tr_link = container_of(link, struct bpf_tracing_link, link.link); info->tracing.attach_type = tr_link->attach_type; bpf_trampoline_unpack_key(tr_link->trampoline->key, &info->tracing.target_obj_id, &info->tracing.target_btf_id); return 0; } static const struct bpf_link_ops bpf_tracing_link_lops = { .release = bpf_tracing_link_release, .dealloc = bpf_tracing_link_dealloc, .show_fdinfo = bpf_tracing_link_show_fdinfo, .fill_link_info = bpf_tracing_link_fill_link_info, }; static int bpf_tracing_prog_attach(struct bpf_prog *prog, int tgt_prog_fd, u32 btf_id, u64 bpf_cookie) { struct bpf_link_primer link_primer; struct bpf_prog *tgt_prog = NULL; struct bpf_trampoline *tr = NULL; struct bpf_tracing_link *link; u64 key = 0; int err; switch (prog->type) { case BPF_PROG_TYPE_TRACING: if (prog->expected_attach_type != BPF_TRACE_FENTRY && prog->expected_attach_type != BPF_TRACE_FEXIT && prog->expected_attach_type != BPF_MODIFY_RETURN) { err = -EINVAL; goto out_put_prog; } break; case BPF_PROG_TYPE_EXT: if (prog->expected_attach_type != 0) { err = -EINVAL; goto 
out_put_prog; } break; case BPF_PROG_TYPE_LSM: if (prog->expected_attach_type != BPF_LSM_MAC) { err = -EINVAL; goto out_put_prog; } break; default: err = -EINVAL; goto out_put_prog; } if (!!tgt_prog_fd != !!btf_id) { err = -EINVAL; goto out_put_prog; } if (tgt_prog_fd) { /* * For now we only allow new targets for BPF_PROG_TYPE_EXT. If this * part would be changed to implement the same for * BPF_PROG_TYPE_TRACING, do not forget to update the way how * attach_tracing_prog flag is set. */ if (prog->type != BPF_PROG_TYPE_EXT) { err = -EINVAL; goto out_put_prog; } tgt_prog = bpf_prog_get(tgt_prog_fd); if (IS_ERR(tgt_prog)) { err = PTR_ERR(tgt_prog); tgt_prog = NULL; goto out_put_prog; } key = bpf_trampoline_compute_key(tgt_prog, NULL, btf_id); } link = kzalloc(sizeof(*link), GFP_USER); if (!link) { err = -ENOMEM; goto out_put_prog; } bpf_link_init(&link->link.link, BPF_LINK_TYPE_TRACING, &bpf_tracing_link_lops, prog); link->attach_type = prog->expected_attach_type; link->link.cookie = bpf_cookie; mutex_lock(&prog->aux->dst_mutex); /* There are a few possible cases here: * * - if prog->aux->dst_trampoline is set, the program was just loaded * and not yet attached to anything, so we can use the values stored * in prog->aux * * - if prog->aux->dst_trampoline is NULL, the program has already been * attached to a target and its initial target was cleared (below) * * - if tgt_prog != NULL, the caller specified tgt_prog_fd + * target_btf_id using the link_create API. * * - if tgt_prog == NULL when this function was called using the old * raw_tracepoint_open API, and we need a target from prog->aux * * - if prog->aux->dst_trampoline and tgt_prog is NULL, the program * was detached and is going for re-attachment. * * - if prog->aux->dst_trampoline is NULL and tgt_prog and prog->aux->attach_btf * are NULL, then program was already attached and user did not provide * tgt_prog_fd so we have no way to find out or create trampoline */ if (!prog->aux->dst_trampoline && !tgt_prog) { /* * Allow re-attach for TRACING and LSM programs. If it's * currently linked, bpf_trampoline_link_prog will fail. * EXT programs need to specify tgt_prog_fd, so they * re-attach in separate code path. */ if (prog->type != BPF_PROG_TYPE_TRACING && prog->type != BPF_PROG_TYPE_LSM) { err = -EINVAL; goto out_unlock; } /* We can allow re-attach only if we have valid attach_btf. */ if (!prog->aux->attach_btf) { err = -EINVAL; goto out_unlock; } btf_id = prog->aux->attach_btf_id; key = bpf_trampoline_compute_key(NULL, prog->aux->attach_btf, btf_id); } if (!prog->aux->dst_trampoline || (key && key != prog->aux->dst_trampoline->key)) { /* If there is no saved target, or the specified target is * different from the destination specified at load time, we * need a new trampoline and a check for compatibility */ struct bpf_attach_target_info tgt_info = {}; err = bpf_check_attach_target(NULL, prog, tgt_prog, btf_id, &tgt_info); if (err) goto out_unlock; if (tgt_info.tgt_mod) { module_put(prog->aux->mod); prog->aux->mod = tgt_info.tgt_mod; } tr = bpf_trampoline_get(key, &tgt_info); if (!tr) { err = -ENOMEM; goto out_unlock; } } else { /* The caller didn't specify a target, or the target was the * same as the destination supplied during program load. This * means we can reuse the trampoline and reference from program * load time, and there is no need to allocate a new one. This * can only happen once for any program, as the saved values in * prog->aux are cleared below. 
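 *
 * (The clearing happens further down, still under dst_mutex, once
 * bpf_trampoline_link_prog() has succeeded: dst_prog and dst_trampoline
 * are put if no longer needed and then set to NULL.)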
*/ tr = prog->aux->dst_trampoline; tgt_prog = prog->aux->dst_prog; } err = bpf_link_prime(&link->link.link, &link_primer); if (err) goto out_unlock; err = bpf_trampoline_link_prog(&link->link, tr, tgt_prog); if (err) { bpf_link_cleanup(&link_primer); link = NULL; goto out_unlock; } link->tgt_prog = tgt_prog; link->trampoline = tr; /* Always clear the trampoline and target prog from prog->aux to make * sure the original attach destination is not kept alive after a * program is (re-)attached to another target. */ if (prog->aux->dst_prog && (tgt_prog_fd || tr != prog->aux->dst_trampoline)) /* got extra prog ref from syscall, or attaching to different prog */ bpf_prog_put(prog->aux->dst_prog); if (prog->aux->dst_trampoline && tr != prog->aux->dst_trampoline) /* we allocated a new trampoline, so free the old one */ bpf_trampoline_put(prog->aux->dst_trampoline); prog->aux->dst_prog = NULL; prog->aux->dst_trampoline = NULL; mutex_unlock(&prog->aux->dst_mutex); return bpf_link_settle(&link_primer); out_unlock: if (tr && tr != prog->aux->dst_trampoline) bpf_trampoline_put(tr); mutex_unlock(&prog->aux->dst_mutex); kfree(link); out_put_prog: if (tgt_prog_fd && tgt_prog) bpf_prog_put(tgt_prog); return err; } static void bpf_raw_tp_link_release(struct bpf_link *link) { struct bpf_raw_tp_link *raw_tp = container_of(link, struct bpf_raw_tp_link, link); bpf_probe_unregister(raw_tp->btp, raw_tp); bpf_put_raw_tracepoint(raw_tp->btp); } static void bpf_raw_tp_link_dealloc(struct bpf_link *link) { struct bpf_raw_tp_link *raw_tp = container_of(link, struct bpf_raw_tp_link, link); kfree(raw_tp); } static void bpf_raw_tp_link_show_fdinfo(const struct bpf_link *link, struct seq_file *seq) { struct bpf_raw_tp_link *raw_tp_link = container_of(link, struct bpf_raw_tp_link, link); seq_printf(seq, "tp_name:\t%s\n", raw_tp_link->btp->tp->name); } static int bpf_copy_to_user(char __user *ubuf, const char *buf, u32 ulen, u32 len) { if (ulen >= len + 1) { if (copy_to_user(ubuf, buf, len + 1)) return -EFAULT; } else { char zero = '\0'; if (copy_to_user(ubuf, buf, ulen - 1)) return -EFAULT; if (put_user(zero, ubuf + ulen - 1)) return -EFAULT; return -ENOSPC; } return 0; } static int bpf_raw_tp_link_fill_link_info(const struct bpf_link *link, struct bpf_link_info *info) { struct bpf_raw_tp_link *raw_tp_link = container_of(link, struct bpf_raw_tp_link, link); char __user *ubuf = u64_to_user_ptr(info->raw_tracepoint.tp_name); const char *tp_name = raw_tp_link->btp->tp->name; u32 ulen = info->raw_tracepoint.tp_name_len; size_t tp_len = strlen(tp_name); if (!ulen ^ !ubuf) return -EINVAL; info->raw_tracepoint.tp_name_len = tp_len + 1; if (!ubuf) return 0; return bpf_copy_to_user(ubuf, tp_name, ulen, tp_len); } static const struct bpf_link_ops bpf_raw_tp_link_lops = { .release = bpf_raw_tp_link_release, .dealloc_deferred = bpf_raw_tp_link_dealloc, .show_fdinfo = bpf_raw_tp_link_show_fdinfo, .fill_link_info = bpf_raw_tp_link_fill_link_info, }; #ifdef CONFIG_PERF_EVENTS struct bpf_perf_link { struct bpf_link link; struct file *perf_file; }; static void bpf_perf_link_release(struct bpf_link *link) { struct bpf_perf_link *perf_link = container_of(link, struct bpf_perf_link, link); struct perf_event *event = perf_link->perf_file->private_data; perf_event_free_bpf_prog(event); fput(perf_link->perf_file); } static void bpf_perf_link_dealloc(struct bpf_link *link) { struct bpf_perf_link *perf_link = container_of(link, struct bpf_perf_link, link); kfree(perf_link); } static int bpf_perf_link_fill_common(const struct perf_event *event, char 
__user *uname, u32 *ulenp, u64 *probe_offset, u64 *probe_addr, u32 *fd_type, unsigned long *missed) { const char *buf; u32 prog_id, ulen; size_t len; int err; ulen = *ulenp; if (!ulen ^ !uname) return -EINVAL; err = bpf_get_perf_event_info(event, &prog_id, fd_type, &buf, probe_offset, probe_addr, missed); if (err) return err; if (buf) { len = strlen(buf); *ulenp = len + 1; } else { *ulenp = 1; } if (!uname) return 0; if (buf) { err = bpf_copy_to_user(uname, buf, ulen, len); if (err) return err; } else { char zero = '\0'; if (put_user(zero, uname)) return -EFAULT; } return 0; } #ifdef CONFIG_KPROBE_EVENTS static int bpf_perf_link_fill_kprobe(const struct perf_event *event, struct bpf_link_info *info) { unsigned long missed; char __user *uname; u64 addr, offset; u32 ulen, type; int err; uname = u64_to_user_ptr(info->perf_event.kprobe.func_name); ulen = info->perf_event.kprobe.name_len; err = bpf_perf_link_fill_common(event, uname, &ulen, &offset, &addr, &type, &missed); if (err) return err; if (type == BPF_FD_TYPE_KRETPROBE) info->perf_event.type = BPF_PERF_EVENT_KRETPROBE; else info->perf_event.type = BPF_PERF_EVENT_KPROBE; info->perf_event.kprobe.name_len = ulen; info->perf_event.kprobe.offset = offset; info->perf_event.kprobe.missed = missed; if (!kallsyms_show_value(current_cred())) addr = 0; info->perf_event.kprobe.addr = addr; info->perf_event.kprobe.cookie = event->bpf_cookie; return 0; } #endif #ifdef CONFIG_UPROBE_EVENTS static int bpf_perf_link_fill_uprobe(const struct perf_event *event, struct bpf_link_info *info) { char __user *uname; u64 addr, offset; u32 ulen, type; int err; uname = u64_to_user_ptr(info->perf_event.uprobe.file_name); ulen = info->perf_event.uprobe.name_len; err = bpf_perf_link_fill_common(event, uname, &ulen, &offset, &addr, &type, NULL); if (err) return err; if (type == BPF_FD_TYPE_URETPROBE) info->perf_event.type = BPF_PERF_EVENT_URETPROBE; else info->perf_event.type = BPF_PERF_EVENT_UPROBE; info->perf_event.uprobe.name_len = ulen; info->perf_event.uprobe.offset = offset; info->perf_event.uprobe.cookie = event->bpf_cookie; return 0; } #endif static int bpf_perf_link_fill_probe(const struct perf_event *event, struct bpf_link_info *info) { #ifdef CONFIG_KPROBE_EVENTS if (event->tp_event->flags & TRACE_EVENT_FL_KPROBE) return bpf_perf_link_fill_kprobe(event, info); #endif #ifdef CONFIG_UPROBE_EVENTS if (event->tp_event->flags & TRACE_EVENT_FL_UPROBE) return bpf_perf_link_fill_uprobe(event, info); #endif return -EOPNOTSUPP; } static int bpf_perf_link_fill_tracepoint(const struct perf_event *event, struct bpf_link_info *info) { char __user *uname; u32 ulen; int err; uname = u64_to_user_ptr(info->perf_event.tracepoint.tp_name); ulen = info->perf_event.tracepoint.name_len; err = bpf_perf_link_fill_common(event, uname, &ulen, NULL, NULL, NULL, NULL); if (err) return err; info->perf_event.type = BPF_PERF_EVENT_TRACEPOINT; info->perf_event.tracepoint.name_len = ulen; info->perf_event.tracepoint.cookie = event->bpf_cookie; return 0; } static int bpf_perf_link_fill_perf_event(const struct perf_event *event, struct bpf_link_info *info) { info->perf_event.event.type = event->attr.type; info->perf_event.event.config = event->attr.config; info->perf_event.event.cookie = event->bpf_cookie; info->perf_event.type = BPF_PERF_EVENT_EVENT; return 0; } static int bpf_perf_link_fill_link_info(const struct bpf_link *link, struct bpf_link_info *info) { struct bpf_perf_link *perf_link; const struct perf_event *event; perf_link = container_of(link, struct bpf_perf_link, link); event = 
perf_get_event(perf_link->perf_file); if (IS_ERR(event)) return PTR_ERR(event); switch (event->prog->type) { case BPF_PROG_TYPE_PERF_EVENT: return bpf_perf_link_fill_perf_event(event, info); case BPF_PROG_TYPE_TRACEPOINT: return bpf_perf_link_fill_tracepoint(event, info); case BPF_PROG_TYPE_KPROBE: return bpf_perf_link_fill_probe(event, info); default: return -EOPNOTSUPP; } } static const struct bpf_link_ops bpf_perf_link_lops = { .release = bpf_perf_link_release, .dealloc = bpf_perf_link_dealloc, .fill_link_info = bpf_perf_link_fill_link_info, }; static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { struct bpf_link_primer link_primer; struct bpf_perf_link *link; struct perf_event *event; struct file *perf_file; int err; if (attr->link_create.flags) return -EINVAL; perf_file = perf_event_get(attr->link_create.target_fd); if (IS_ERR(perf_file)) return PTR_ERR(perf_file); link = kzalloc(sizeof(*link), GFP_USER); if (!link) { err = -ENOMEM; goto out_put_file; } bpf_link_init(&link->link, BPF_LINK_TYPE_PERF_EVENT, &bpf_perf_link_lops, prog); link->perf_file = perf_file; err = bpf_link_prime(&link->link, &link_primer); if (err) { kfree(link); goto out_put_file; } event = perf_file->private_data; err = perf_event_set_bpf_prog(event, prog, attr->link_create.perf_event.bpf_cookie); if (err) { bpf_link_cleanup(&link_primer); goto out_put_file; } /* perf_event_set_bpf_prog() doesn't take its own refcnt on prog */ bpf_prog_inc(prog); return bpf_link_settle(&link_primer); out_put_file: fput(perf_file); return err; } #else static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { return -EOPNOTSUPP; } #endif /* CONFIG_PERF_EVENTS */ static int bpf_raw_tp_link_attach(struct bpf_prog *prog, const char __user *user_tp_name, u64 cookie) { struct bpf_link_primer link_primer; struct bpf_raw_tp_link *link; struct bpf_raw_event_map *btp; const char *tp_name; char buf[128]; int err; switch (prog->type) { case BPF_PROG_TYPE_TRACING: case BPF_PROG_TYPE_EXT: case BPF_PROG_TYPE_LSM: if (user_tp_name) /* The attach point for this category of programs * should be specified via btf_id during program load. 
*/ return -EINVAL; if (prog->type == BPF_PROG_TYPE_TRACING && prog->expected_attach_type == BPF_TRACE_RAW_TP) { tp_name = prog->aux->attach_func_name; break; } return bpf_tracing_prog_attach(prog, 0, 0, 0); case BPF_PROG_TYPE_RAW_TRACEPOINT: case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE: if (strncpy_from_user(buf, user_tp_name, sizeof(buf) - 1) < 0) return -EFAULT; buf[sizeof(buf) - 1] = 0; tp_name = buf; break; default: return -EINVAL; } btp = bpf_get_raw_tracepoint(tp_name); if (!btp) return -ENOENT; link = kzalloc(sizeof(*link), GFP_USER); if (!link) { err = -ENOMEM; goto out_put_btp; } bpf_link_init_sleepable(&link->link, BPF_LINK_TYPE_RAW_TRACEPOINT, &bpf_raw_tp_link_lops, prog, tracepoint_is_faultable(btp->tp)); link->btp = btp; link->cookie = cookie; err = bpf_link_prime(&link->link, &link_primer); if (err) { kfree(link); goto out_put_btp; } err = bpf_probe_register(link->btp, link); if (err) { bpf_link_cleanup(&link_primer); goto out_put_btp; } return bpf_link_settle(&link_primer); out_put_btp: bpf_put_raw_tracepoint(btp); return err; } #define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.cookie static int bpf_raw_tracepoint_open(const union bpf_attr *attr) { struct bpf_prog *prog; void __user *tp_name; __u64 cookie; int fd; if (CHECK_ATTR(BPF_RAW_TRACEPOINT_OPEN)) return -EINVAL; prog = bpf_prog_get(attr->raw_tracepoint.prog_fd); if (IS_ERR(prog)) return PTR_ERR(prog); tp_name = u64_to_user_ptr(attr->raw_tracepoint.name); cookie = attr->raw_tracepoint.cookie; fd = bpf_raw_tp_link_attach(prog, tp_name, cookie); if (fd < 0) bpf_prog_put(prog); return fd; } static enum bpf_prog_type attach_type_to_prog_type(enum bpf_attach_type attach_type) { switch (attach_type) { case BPF_CGROUP_INET_INGRESS: case BPF_CGROUP_INET_EGRESS: return BPF_PROG_TYPE_CGROUP_SKB; case BPF_CGROUP_INET_SOCK_CREATE: case BPF_CGROUP_INET_SOCK_RELEASE: case BPF_CGROUP_INET4_POST_BIND: case BPF_CGROUP_INET6_POST_BIND: return BPF_PROG_TYPE_CGROUP_SOCK; case BPF_CGROUP_INET4_BIND: case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: case BPF_CGROUP_UNIX_CONNECT: case BPF_CGROUP_INET4_GETPEERNAME: case BPF_CGROUP_INET6_GETPEERNAME: case BPF_CGROUP_UNIX_GETPEERNAME: case BPF_CGROUP_INET4_GETSOCKNAME: case BPF_CGROUP_INET6_GETSOCKNAME: case BPF_CGROUP_UNIX_GETSOCKNAME: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_UNIX_SENDMSG: case BPF_CGROUP_UDP4_RECVMSG: case BPF_CGROUP_UDP6_RECVMSG: case BPF_CGROUP_UNIX_RECVMSG: return BPF_PROG_TYPE_CGROUP_SOCK_ADDR; case BPF_CGROUP_SOCK_OPS: return BPF_PROG_TYPE_SOCK_OPS; case BPF_CGROUP_DEVICE: return BPF_PROG_TYPE_CGROUP_DEVICE; case BPF_SK_MSG_VERDICT: return BPF_PROG_TYPE_SK_MSG; case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: case BPF_SK_SKB_VERDICT: return BPF_PROG_TYPE_SK_SKB; case BPF_LIRC_MODE2: return BPF_PROG_TYPE_LIRC_MODE2; case BPF_FLOW_DISSECTOR: return BPF_PROG_TYPE_FLOW_DISSECTOR; case BPF_CGROUP_SYSCTL: return BPF_PROG_TYPE_CGROUP_SYSCTL; case BPF_CGROUP_GETSOCKOPT: case BPF_CGROUP_SETSOCKOPT: return BPF_PROG_TYPE_CGROUP_SOCKOPT; case BPF_TRACE_ITER: case BPF_TRACE_RAW_TP: case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: case BPF_MODIFY_RETURN: return BPF_PROG_TYPE_TRACING; case BPF_LSM_MAC: return BPF_PROG_TYPE_LSM; case BPF_SK_LOOKUP: return BPF_PROG_TYPE_SK_LOOKUP; case BPF_XDP: return BPF_PROG_TYPE_XDP; case BPF_LSM_CGROUP: return BPF_PROG_TYPE_LSM; case BPF_TCX_INGRESS: case BPF_TCX_EGRESS: case BPF_NETKIT_PRIMARY: case BPF_NETKIT_PEER: return BPF_PROG_TYPE_SCHED_CLS; default: 
return BPF_PROG_TYPE_UNSPEC; } } static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, enum bpf_attach_type attach_type) { enum bpf_prog_type ptype; switch (prog->type) { case BPF_PROG_TYPE_CGROUP_SOCK: case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: case BPF_PROG_TYPE_CGROUP_SOCKOPT: case BPF_PROG_TYPE_SK_LOOKUP: return attach_type == prog->expected_attach_type ? 0 : -EINVAL; case BPF_PROG_TYPE_CGROUP_SKB: if (!bpf_token_capable(prog->aux->token, CAP_NET_ADMIN)) /* cg-skb progs can be loaded by unpriv user. * check permissions at attach time. */ return -EPERM; ptype = attach_type_to_prog_type(attach_type); if (prog->type != ptype) return -EINVAL; return prog->enforce_expected_attach_type && prog->expected_attach_type != attach_type ? -EINVAL : 0; case BPF_PROG_TYPE_EXT: return 0; case BPF_PROG_TYPE_NETFILTER: if (attach_type != BPF_NETFILTER) return -EINVAL; return 0; case BPF_PROG_TYPE_PERF_EVENT: case BPF_PROG_TYPE_TRACEPOINT: if (attach_type != BPF_PERF_EVENT) return -EINVAL; return 0; case BPF_PROG_TYPE_KPROBE: if (prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI && attach_type != BPF_TRACE_KPROBE_MULTI) return -EINVAL; if (prog->expected_attach_type == BPF_TRACE_KPROBE_SESSION && attach_type != BPF_TRACE_KPROBE_SESSION) return -EINVAL; if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI && attach_type != BPF_TRACE_UPROBE_MULTI) return -EINVAL; if (prog->expected_attach_type == BPF_TRACE_UPROBE_SESSION && attach_type != BPF_TRACE_UPROBE_SESSION) return -EINVAL; if (attach_type != BPF_PERF_EVENT && attach_type != BPF_TRACE_KPROBE_MULTI && attach_type != BPF_TRACE_KPROBE_SESSION && attach_type != BPF_TRACE_UPROBE_MULTI && attach_type != BPF_TRACE_UPROBE_SESSION) return -EINVAL; return 0; case BPF_PROG_TYPE_SCHED_CLS: if (attach_type != BPF_TCX_INGRESS && attach_type != BPF_TCX_EGRESS && attach_type != BPF_NETKIT_PRIMARY && attach_type != BPF_NETKIT_PEER) return -EINVAL; return 0; default: ptype = attach_type_to_prog_type(attach_type); if (ptype == BPF_PROG_TYPE_UNSPEC || ptype != prog->type) return -EINVAL; return 0; } } #define BPF_PROG_ATTACH_LAST_FIELD expected_revision #define BPF_F_ATTACH_MASK_BASE \ (BPF_F_ALLOW_OVERRIDE | \ BPF_F_ALLOW_MULTI | \ BPF_F_REPLACE) #define BPF_F_ATTACH_MASK_MPROG \ (BPF_F_REPLACE | \ BPF_F_BEFORE | \ BPF_F_AFTER | \ BPF_F_ID | \ BPF_F_LINK) static int bpf_prog_attach(const union bpf_attr *attr) { enum bpf_prog_type ptype; struct bpf_prog *prog; int ret; if (CHECK_ATTR(BPF_PROG_ATTACH)) return -EINVAL; ptype = attach_type_to_prog_type(attr->attach_type); if (ptype == BPF_PROG_TYPE_UNSPEC) return -EINVAL; if (bpf_mprog_supported(ptype)) { if (attr->attach_flags & ~BPF_F_ATTACH_MASK_MPROG) return -EINVAL; } else { if (attr->attach_flags & ~BPF_F_ATTACH_MASK_BASE) return -EINVAL; if (attr->relative_fd || attr->expected_revision) return -EINVAL; } prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); if (IS_ERR(prog)) return PTR_ERR(prog); if (bpf_prog_attach_check_attach_type(prog, attr->attach_type)) { bpf_prog_put(prog); return -EINVAL; } switch (ptype) { case BPF_PROG_TYPE_SK_SKB: case BPF_PROG_TYPE_SK_MSG: ret = sock_map_get_from_fd(attr, prog); break; case BPF_PROG_TYPE_LIRC_MODE2: ret = lirc_prog_attach(attr, prog); break; case BPF_PROG_TYPE_FLOW_DISSECTOR: ret = netns_bpf_prog_attach(attr, prog); break; case BPF_PROG_TYPE_CGROUP_DEVICE: case BPF_PROG_TYPE_CGROUP_SKB: case BPF_PROG_TYPE_CGROUP_SOCK: case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: case BPF_PROG_TYPE_CGROUP_SOCKOPT: case BPF_PROG_TYPE_CGROUP_SYSCTL: case BPF_PROG_TYPE_SOCK_OPS: 
case BPF_PROG_TYPE_LSM: if (ptype == BPF_PROG_TYPE_LSM && prog->expected_attach_type != BPF_LSM_CGROUP) ret = -EINVAL; else ret = cgroup_bpf_prog_attach(attr, ptype, prog); break; case BPF_PROG_TYPE_SCHED_CLS: if (attr->attach_type == BPF_TCX_INGRESS || attr->attach_type == BPF_TCX_EGRESS) ret = tcx_prog_attach(attr, prog); else ret = netkit_prog_attach(attr, prog); break; default: ret = -EINVAL; } if (ret) bpf_prog_put(prog); return ret; } #define BPF_PROG_DETACH_LAST_FIELD expected_revision static int bpf_prog_detach(const union bpf_attr *attr) { struct bpf_prog *prog = NULL; enum bpf_prog_type ptype; int ret; if (CHECK_ATTR(BPF_PROG_DETACH)) return -EINVAL; ptype = attach_type_to_prog_type(attr->attach_type); if (bpf_mprog_supported(ptype)) { if (ptype == BPF_PROG_TYPE_UNSPEC) return -EINVAL; if (attr->attach_flags & ~BPF_F_ATTACH_MASK_MPROG) return -EINVAL; if (attr->attach_bpf_fd) { prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); if (IS_ERR(prog)) return PTR_ERR(prog); } } else if (attr->attach_flags || attr->relative_fd || attr->expected_revision) { return -EINVAL; } switch (ptype) { case BPF_PROG_TYPE_SK_MSG: case BPF_PROG_TYPE_SK_SKB: ret = sock_map_prog_detach(attr, ptype); break; case BPF_PROG_TYPE_LIRC_MODE2: ret = lirc_prog_detach(attr); break; case BPF_PROG_TYPE_FLOW_DISSECTOR: ret = netns_bpf_prog_detach(attr, ptype); break; case BPF_PROG_TYPE_CGROUP_DEVICE: case BPF_PROG_TYPE_CGROUP_SKB: case BPF_PROG_TYPE_CGROUP_SOCK: case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: case BPF_PROG_TYPE_CGROUP_SOCKOPT: case BPF_PROG_TYPE_CGROUP_SYSCTL: case BPF_PROG_TYPE_SOCK_OPS: case BPF_PROG_TYPE_LSM: ret = cgroup_bpf_prog_detach(attr, ptype); break; case BPF_PROG_TYPE_SCHED_CLS: if (attr->attach_type == BPF_TCX_INGRESS || attr->attach_type == BPF_TCX_EGRESS) ret = tcx_prog_detach(attr, prog); else ret = netkit_prog_detach(attr, prog); break; default: ret = -EINVAL; } if (prog) bpf_prog_put(prog); return ret; } #define BPF_PROG_QUERY_LAST_FIELD query.revision static int bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) { if (!bpf_net_capable()) return -EPERM; if (CHECK_ATTR(BPF_PROG_QUERY)) return -EINVAL; if (attr->query.query_flags & ~BPF_F_QUERY_EFFECTIVE) return -EINVAL; switch (attr->query.attach_type) { case BPF_CGROUP_INET_INGRESS: case BPF_CGROUP_INET_EGRESS: case BPF_CGROUP_INET_SOCK_CREATE: case BPF_CGROUP_INET_SOCK_RELEASE: case BPF_CGROUP_INET4_BIND: case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_POST_BIND: case BPF_CGROUP_INET6_POST_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: case BPF_CGROUP_UNIX_CONNECT: case BPF_CGROUP_INET4_GETPEERNAME: case BPF_CGROUP_INET6_GETPEERNAME: case BPF_CGROUP_UNIX_GETPEERNAME: case BPF_CGROUP_INET4_GETSOCKNAME: case BPF_CGROUP_INET6_GETSOCKNAME: case BPF_CGROUP_UNIX_GETSOCKNAME: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_UNIX_SENDMSG: case BPF_CGROUP_UDP4_RECVMSG: case BPF_CGROUP_UDP6_RECVMSG: case BPF_CGROUP_UNIX_RECVMSG: case BPF_CGROUP_SOCK_OPS: case BPF_CGROUP_DEVICE: case BPF_CGROUP_SYSCTL: case BPF_CGROUP_GETSOCKOPT: case BPF_CGROUP_SETSOCKOPT: case BPF_LSM_CGROUP: return cgroup_bpf_prog_query(attr, uattr); case BPF_LIRC_MODE2: return lirc_prog_query(attr, uattr); case BPF_FLOW_DISSECTOR: case BPF_SK_LOOKUP: return netns_bpf_prog_query(attr, uattr); case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: case BPF_SK_MSG_VERDICT: case BPF_SK_SKB_VERDICT: return sock_map_bpf_prog_query(attr, uattr); case BPF_TCX_INGRESS: case BPF_TCX_EGRESS: return 
tcx_prog_query(attr, uattr); case BPF_NETKIT_PRIMARY: case BPF_NETKIT_PEER: return netkit_prog_query(attr, uattr); default: return -EINVAL; } } #define BPF_PROG_TEST_RUN_LAST_FIELD test.batch_size static int bpf_prog_test_run(const union bpf_attr *attr, union bpf_attr __user *uattr) { struct bpf_prog *prog; int ret = -ENOTSUPP; if (CHECK_ATTR(BPF_PROG_TEST_RUN)) return -EINVAL; if ((attr->test.ctx_size_in && !attr->test.ctx_in) || (!attr->test.ctx_size_in && attr->test.ctx_in)) return -EINVAL; if ((attr->test.ctx_size_out && !attr->test.ctx_out) || (!attr->test.ctx_size_out && attr->test.ctx_out)) return -EINVAL; prog = bpf_prog_get(attr->test.prog_fd); if (IS_ERR(prog)) return PTR_ERR(prog); if (prog->aux->ops->test_run) ret = prog->aux->ops->test_run(prog, attr, uattr); bpf_prog_put(prog); return ret; } #define BPF_OBJ_GET_NEXT_ID_LAST_FIELD next_id static int bpf_obj_get_next_id(const union bpf_attr *attr, union bpf_attr __user *uattr, struct idr *idr, spinlock_t *lock) { u32 next_id = attr->start_id; int err = 0; if (CHECK_ATTR(BPF_OBJ_GET_NEXT_ID) || next_id >= INT_MAX) return -EINVAL; if (!capable(CAP_SYS_ADMIN)) return -EPERM; next_id++; spin_lock_bh(lock); if (!idr_get_next(idr, &next_id)) err = -ENOENT; spin_unlock_bh(lock); if (!err) err = put_user(next_id, &uattr->next_id); return err; } struct bpf_map *bpf_map_get_curr_or_next(u32 *id) { struct bpf_map *map; spin_lock_bh(&map_idr_lock); again: map = idr_get_next(&map_idr, id); if (map) { map = __bpf_map_inc_not_zero(map, false); if (IS_ERR(map)) { (*id)++; goto again; } } spin_unlock_bh(&map_idr_lock); return map; } struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id) { struct bpf_prog *prog; spin_lock_bh(&prog_idr_lock); again: prog = idr_get_next(&prog_idr, id); if (prog) { prog = bpf_prog_inc_not_zero(prog); if (IS_ERR(prog)) { (*id)++; goto again; } } spin_unlock_bh(&prog_idr_lock); return prog; } #define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id struct bpf_prog *bpf_prog_by_id(u32 id) { struct bpf_prog *prog; if (!id) return ERR_PTR(-ENOENT); spin_lock_bh(&prog_idr_lock); prog = idr_find(&prog_idr, id); if (prog) prog = bpf_prog_inc_not_zero(prog); else prog = ERR_PTR(-ENOENT); spin_unlock_bh(&prog_idr_lock); return prog; } static int bpf_prog_get_fd_by_id(const union bpf_attr *attr) { struct bpf_prog *prog; u32 id = attr->prog_id; int fd; if (CHECK_ATTR(BPF_PROG_GET_FD_BY_ID)) return -EINVAL; if (!capable(CAP_SYS_ADMIN)) return -EPERM; prog = bpf_prog_by_id(id); if (IS_ERR(prog)) return PTR_ERR(prog); fd = bpf_prog_new_fd(prog); if (fd < 0) bpf_prog_put(prog); return fd; } #define BPF_MAP_GET_FD_BY_ID_LAST_FIELD open_flags static int bpf_map_get_fd_by_id(const union bpf_attr *attr) { struct bpf_map *map; u32 id = attr->map_id; int f_flags; int fd; if (CHECK_ATTR(BPF_MAP_GET_FD_BY_ID) || attr->open_flags & ~BPF_OBJ_FLAG_MASK) return -EINVAL; if (!capable(CAP_SYS_ADMIN)) return -EPERM; f_flags = bpf_get_file_flag(attr->open_flags); if (f_flags < 0) return f_flags; spin_lock_bh(&map_idr_lock); map = idr_find(&map_idr, id); if (map) map = __bpf_map_inc_not_zero(map, true); else map = ERR_PTR(-ENOENT); spin_unlock_bh(&map_idr_lock); if (IS_ERR(map)) return PTR_ERR(map); fd = bpf_map_new_fd(map, f_flags); if (fd < 0) bpf_map_put_with_uref(map); return fd; } static const struct bpf_map *bpf_map_from_imm(const struct bpf_prog *prog, unsigned long addr, u32 *off, u32 *type) { const struct bpf_map *map; int i; mutex_lock(&prog->aux->used_maps_mutex); for (i = 0, *off = 0; i < prog->aux->used_map_cnt; i++) { map = 
prog->aux->used_maps[i]; if (map == (void *)addr) { *type = BPF_PSEUDO_MAP_FD; goto out; } if (!map->ops->map_direct_value_meta) continue; if (!map->ops->map_direct_value_meta(map, addr, off)) { *type = BPF_PSEUDO_MAP_VALUE; goto out; } } map = NULL; out: mutex_unlock(&prog->aux->used_maps_mutex); return map; } static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog, const struct cred *f_cred) { const struct bpf_map *map; struct bpf_insn *insns; u32 off, type; u64 imm; u8 code; int i; insns = kmemdup(prog->insnsi, bpf_prog_insn_size(prog), GFP_USER); if (!insns) return insns; for (i = 0; i < prog->len; i++) { code = insns[i].code; if (code == (BPF_JMP | BPF_TAIL_CALL)) { insns[i].code = BPF_JMP | BPF_CALL; insns[i].imm = BPF_FUNC_tail_call; /* fall-through */ } if (code == (BPF_JMP | BPF_CALL) || code == (BPF_JMP | BPF_CALL_ARGS)) { if (code == (BPF_JMP | BPF_CALL_ARGS)) insns[i].code = BPF_JMP | BPF_CALL; if (!bpf_dump_raw_ok(f_cred)) insns[i].imm = 0; continue; } if (BPF_CLASS(code) == BPF_LDX && BPF_MODE(code) == BPF_PROBE_MEM) { insns[i].code = BPF_LDX | BPF_SIZE(code) | BPF_MEM; continue; } if ((BPF_CLASS(code) == BPF_LDX || BPF_CLASS(code) == BPF_STX || BPF_CLASS(code) == BPF_ST) && BPF_MODE(code) == BPF_PROBE_MEM32) { insns[i].code = BPF_CLASS(code) | BPF_SIZE(code) | BPF_MEM; continue; } if (code != (BPF_LD | BPF_IMM | BPF_DW)) continue; imm = ((u64)insns[i + 1].imm << 32) | (u32)insns[i].imm; map = bpf_map_from_imm(prog, imm, &off, &type); if (map) { insns[i].src_reg = type; insns[i].imm = map->id; insns[i + 1].imm = off; continue; } } return insns; } static int set_info_rec_size(struct bpf_prog_info *info) { /* * Ensure info.*_rec_size is the same as kernel expected size * * or * * Only allow zero *_rec_size if both _rec_size and _cnt are * zero. In this case, the kernel will set the expected * _rec_size back to the info. 
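 *
 * A typical two-step query from user space (hypothetical sketch, assuming
 * libbpf's bpf_obj_get_info_by_fd()) therefore passes zeroed counts and
 * record sizes first, then reuses what the kernel reported:
 *
 *	struct bpf_prog_info info = {};
 *	__u32 len = sizeof(info);
 *
 *	bpf_obj_get_info_by_fd(prog_fd, &info, &len);
 *	... allocate info.nr_func_info * info.func_info_rec_size bytes,
 *	point info.func_info at the buffer, keep the reported rec_size,
 *	and call bpf_obj_get_info_by_fd() again ...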
*/ if ((info->nr_func_info || info->func_info_rec_size) && info->func_info_rec_size != sizeof(struct bpf_func_info)) return -EINVAL; if ((info->nr_line_info || info->line_info_rec_size) && info->line_info_rec_size != sizeof(struct bpf_line_info)) return -EINVAL; if ((info->nr_jited_line_info || info->jited_line_info_rec_size) && info->jited_line_info_rec_size != sizeof(__u64)) return -EINVAL; info->func_info_rec_size = sizeof(struct bpf_func_info); info->line_info_rec_size = sizeof(struct bpf_line_info); info->jited_line_info_rec_size = sizeof(__u64); return 0; } static int bpf_prog_get_info_by_fd(struct file *file, struct bpf_prog *prog, const union bpf_attr *attr, union bpf_attr __user *uattr) { struct bpf_prog_info __user *uinfo = u64_to_user_ptr(attr->info.info); struct btf *attach_btf = bpf_prog_get_target_btf(prog); struct bpf_prog_info info; u32 info_len = attr->info.info_len; struct bpf_prog_kstats stats; char __user *uinsns; u32 ulen; int err; err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len); if (err) return err; info_len = min_t(u32, sizeof(info), info_len); memset(&info, 0, sizeof(info)); if (copy_from_user(&info, uinfo, info_len)) return -EFAULT; info.type = prog->type; info.id = prog->aux->id; info.load_time = prog->aux->load_time; info.created_by_uid = from_kuid_munged(current_user_ns(), prog->aux->user->uid); info.gpl_compatible = prog->gpl_compatible; memcpy(info.tag, prog->tag, sizeof(prog->tag)); memcpy(info.name, prog->aux->name, sizeof(prog->aux->name)); mutex_lock(&prog->aux->used_maps_mutex); ulen = info.nr_map_ids; info.nr_map_ids = prog->aux->used_map_cnt; ulen = min_t(u32, info.nr_map_ids, ulen); if (ulen) { u32 __user *user_map_ids = u64_to_user_ptr(info.map_ids); u32 i; for (i = 0; i < ulen; i++) if (put_user(prog->aux->used_maps[i]->id, &user_map_ids[i])) { mutex_unlock(&prog->aux->used_maps_mutex); return -EFAULT; } } mutex_unlock(&prog->aux->used_maps_mutex); err = set_info_rec_size(&info); if (err) return err; bpf_prog_get_stats(prog, &stats); info.run_time_ns = stats.nsecs; info.run_cnt = stats.cnt; info.recursion_misses = stats.misses; info.verified_insns = prog->aux->verified_insns; if (!bpf_capable()) { info.jited_prog_len = 0; info.xlated_prog_len = 0; info.nr_jited_ksyms = 0; info.nr_jited_func_lens = 0; info.nr_func_info = 0; info.nr_line_info = 0; info.nr_jited_line_info = 0; goto done; } ulen = info.xlated_prog_len; info.xlated_prog_len = bpf_prog_insn_size(prog); if (info.xlated_prog_len && ulen) { struct bpf_insn *insns_sanitized; bool fault; if (prog->blinded && !bpf_dump_raw_ok(file->f_cred)) { info.xlated_prog_insns = 0; goto done; } insns_sanitized = bpf_insn_prepare_dump(prog, file->f_cred); if (!insns_sanitized) return -ENOMEM; uinsns = u64_to_user_ptr(info.xlated_prog_insns); ulen = min_t(u32, info.xlated_prog_len, ulen); fault = copy_to_user(uinsns, insns_sanitized, ulen); kfree(insns_sanitized); if (fault) return -EFAULT; } if (bpf_prog_is_offloaded(prog->aux)) { err = bpf_prog_offload_info_fill(&info, prog); if (err) return err; goto done; } /* NOTE: the following code is supposed to be skipped for offload. * bpf_prog_offload_info_fill() is the place to fill similar fields * for offload. 
*/ ulen = info.jited_prog_len; if (prog->aux->func_cnt) { u32 i; info.jited_prog_len = 0; for (i = 0; i < prog->aux->func_cnt; i++) info.jited_prog_len += prog->aux->func[i]->jited_len; } else { info.jited_prog_len = prog->jited_len; } if (info.jited_prog_len && ulen) { if (bpf_dump_raw_ok(file->f_cred)) { uinsns = u64_to_user_ptr(info.jited_prog_insns); ulen = min_t(u32, info.jited_prog_len, ulen); /* for multi-function programs, copy the JITed * instructions for all the functions */ if (prog->aux->func_cnt) { u32 len, free, i; u8 *img; free = ulen; for (i = 0; i < prog->aux->func_cnt; i++) { len = prog->aux->func[i]->jited_len; len = min_t(u32, len, free); img = (u8 *) prog->aux->func[i]->bpf_func; if (copy_to_user(uinsns, img, len)) return -EFAULT; uinsns += len; free -= len; if (!free) break; } } else { if (copy_to_user(uinsns, prog->bpf_func, ulen)) return -EFAULT; } } else { info.jited_prog_insns = 0; } } ulen = info.nr_jited_ksyms; info.nr_jited_ksyms = prog->aux->func_cnt ? : 1; if (ulen) { if (bpf_dump_raw_ok(file->f_cred)) { unsigned long ksym_addr; u64 __user *user_ksyms; u32 i; /* copy the address of the kernel symbol * corresponding to each function */ ulen = min_t(u32, info.nr_jited_ksyms, ulen); user_ksyms = u64_to_user_ptr(info.jited_ksyms); if (prog->aux->func_cnt) { for (i = 0; i < ulen; i++) { ksym_addr = (unsigned long) prog->aux->func[i]->bpf_func; if (put_user((u64) ksym_addr, &user_ksyms[i])) return -EFAULT; } } else { ksym_addr = (unsigned long) prog->bpf_func; if (put_user((u64) ksym_addr, &user_ksyms[0])) return -EFAULT; } } else { info.jited_ksyms = 0; } } ulen = info.nr_jited_func_lens; info.nr_jited_func_lens = prog->aux->func_cnt ? : 1; if (ulen) { if (bpf_dump_raw_ok(file->f_cred)) { u32 __user *user_lens; u32 func_len, i; /* copy the JITed image lengths for each function */ ulen = min_t(u32, info.nr_jited_func_lens, ulen); user_lens = u64_to_user_ptr(info.jited_func_lens); if (prog->aux->func_cnt) { for (i = 0; i < ulen; i++) { func_len = prog->aux->func[i]->jited_len; if (put_user(func_len, &user_lens[i])) return -EFAULT; } } else { func_len = prog->jited_len; if (put_user(func_len, &user_lens[0])) return -EFAULT; } } else { info.jited_func_lens = 0; } } if (prog->aux->btf) info.btf_id = btf_obj_id(prog->aux->btf); info.attach_btf_id = prog->aux->attach_btf_id; if (attach_btf) info.attach_btf_obj_id = btf_obj_id(attach_btf); ulen = info.nr_func_info; info.nr_func_info = prog->aux->func_info_cnt; if (info.nr_func_info && ulen) { char __user *user_finfo; user_finfo = u64_to_user_ptr(info.func_info); ulen = min_t(u32, info.nr_func_info, ulen); if (copy_to_user(user_finfo, prog->aux->func_info, info.func_info_rec_size * ulen)) return -EFAULT; } ulen = info.nr_line_info; info.nr_line_info = prog->aux->nr_linfo; if (info.nr_line_info && ulen) { __u8 __user *user_linfo; user_linfo = u64_to_user_ptr(info.line_info); ulen = min_t(u32, info.nr_line_info, ulen); if (copy_to_user(user_linfo, prog->aux->linfo, info.line_info_rec_size * ulen)) return -EFAULT; } ulen = info.nr_jited_line_info; if (prog->aux->jited_linfo) info.nr_jited_line_info = prog->aux->nr_linfo; else info.nr_jited_line_info = 0; if (info.nr_jited_line_info && ulen) { if (bpf_dump_raw_ok(file->f_cred)) { unsigned long line_addr; __u64 __user *user_linfo; u32 i; user_linfo = u64_to_user_ptr(info.jited_line_info); ulen = min_t(u32, info.nr_jited_line_info, ulen); for (i = 0; i < ulen; i++) { line_addr = (unsigned long)prog->aux->jited_linfo[i]; if (put_user((__u64)line_addr, &user_linfo[i])) return 
-EFAULT; } } else { info.jited_line_info = 0; } } ulen = info.nr_prog_tags; info.nr_prog_tags = prog->aux->func_cnt ? : 1; if (ulen) { __u8 __user (*user_prog_tags)[BPF_TAG_SIZE]; u32 i; user_prog_tags = u64_to_user_ptr(info.prog_tags); ulen = min_t(u32, info.nr_prog_tags, ulen); if (prog->aux->func_cnt) { for (i = 0; i < ulen; i++) { if (copy_to_user(user_prog_tags[i], prog->aux->func[i]->tag, BPF_TAG_SIZE)) return -EFAULT; } } else { if (copy_to_user(user_prog_tags[0], prog->tag, BPF_TAG_SIZE)) return -EFAULT; } } done: if (copy_to_user(uinfo, &info, info_len) || put_user(info_len, &uattr->info.info_len)) return -EFAULT; return 0; } static int bpf_map_get_info_by_fd(struct file *file, struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr) { struct bpf_map_info __user *uinfo = u64_to_user_ptr(attr->info.info); struct bpf_map_info info; u32 info_len = attr->info.info_len; int err; err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len); if (err) return err; info_len = min_t(u32, sizeof(info), info_len); memset(&info, 0, sizeof(info)); info.type = map->map_type; info.id = map->id; info.key_size = map->key_size; info.value_size = map->value_size; info.max_entries = map->max_entries; info.map_flags = map->map_flags; info.map_extra = map->map_extra; memcpy(info.name, map->name, sizeof(map->name)); if (map->btf) { info.btf_id = btf_obj_id(map->btf); info.btf_key_type_id = map->btf_key_type_id; info.btf_value_type_id = map->btf_value_type_id; } info.btf_vmlinux_value_type_id = map->btf_vmlinux_value_type_id; if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) bpf_map_struct_ops_info_fill(&info, map); if (bpf_map_is_offloaded(map)) { err = bpf_map_offload_info_fill(&info, map); if (err) return err; } if (copy_to_user(uinfo, &info, info_len) || put_user(info_len, &uattr->info.info_len)) return -EFAULT; return 0; } static int bpf_btf_get_info_by_fd(struct file *file, struct btf *btf, const union bpf_attr *attr, union bpf_attr __user *uattr) { struct bpf_btf_info __user *uinfo = u64_to_user_ptr(attr->info.info); u32 info_len = attr->info.info_len; int err; err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(*uinfo), info_len); if (err) return err; return btf_get_info_by_fd(btf, attr, uattr); } static int bpf_link_get_info_by_fd(struct file *file, struct bpf_link *link, const union bpf_attr *attr, union bpf_attr __user *uattr) { struct bpf_link_info __user *uinfo = u64_to_user_ptr(attr->info.info); struct bpf_link_info info; u32 info_len = attr->info.info_len; int err; err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len); if (err) return err; info_len = min_t(u32, sizeof(info), info_len); memset(&info, 0, sizeof(info)); if (copy_from_user(&info, uinfo, info_len)) return -EFAULT; info.type = link->type; info.id = link->id; if (link->prog) info.prog_id = link->prog->aux->id; if (link->ops->fill_link_info) { err = link->ops->fill_link_info(link, &info); if (err) return err; } if (copy_to_user(uinfo, &info, info_len) || put_user(info_len, &uattr->info.info_len)) return -EFAULT; return 0; } #define BPF_OBJ_GET_INFO_BY_FD_LAST_FIELD info.info static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, union bpf_attr __user *uattr) { if (CHECK_ATTR(BPF_OBJ_GET_INFO_BY_FD)) return -EINVAL; CLASS(fd, f)(attr->info.bpf_fd); if (fd_empty(f)) return -EBADFD; if (fd_file(f)->f_op == &bpf_prog_fops) return bpf_prog_get_info_by_fd(fd_file(f), fd_file(f)->private_data, attr, uattr); else if (fd_file(f)->f_op == &bpf_map_fops) return 
bpf_map_get_info_by_fd(fd_file(f), fd_file(f)->private_data, attr, uattr); else if (fd_file(f)->f_op == &btf_fops) return bpf_btf_get_info_by_fd(fd_file(f), fd_file(f)->private_data, attr, uattr); else if (fd_file(f)->f_op == &bpf_link_fops || fd_file(f)->f_op == &bpf_link_fops_poll) return bpf_link_get_info_by_fd(fd_file(f), fd_file(f)->private_data, attr, uattr); return -EINVAL; } #define BPF_BTF_LOAD_LAST_FIELD btf_token_fd static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size) { struct bpf_token *token = NULL; if (CHECK_ATTR(BPF_BTF_LOAD)) return -EINVAL; if (attr->btf_flags & ~BPF_F_TOKEN_FD) return -EINVAL; if (attr->btf_flags & BPF_F_TOKEN_FD) { token = bpf_token_get_from_fd(attr->btf_token_fd); if (IS_ERR(token)) return PTR_ERR(token); if (!bpf_token_allow_cmd(token, BPF_BTF_LOAD)) { bpf_token_put(token); token = NULL; } } if (!bpf_token_capable(token, CAP_BPF)) { bpf_token_put(token); return -EPERM; } bpf_token_put(token); return btf_new_fd(attr, uattr, uattr_size); } #define BPF_BTF_GET_FD_BY_ID_LAST_FIELD btf_id static int bpf_btf_get_fd_by_id(const union bpf_attr *attr) { if (CHECK_ATTR(BPF_BTF_GET_FD_BY_ID)) return -EINVAL; if (!capable(CAP_SYS_ADMIN)) return -EPERM; return btf_get_fd_by_id(attr->btf_id); } static int bpf_task_fd_query_copy(const union bpf_attr *attr, union bpf_attr __user *uattr, u32 prog_id, u32 fd_type, const char *buf, u64 probe_offset, u64 probe_addr) { char __user *ubuf = u64_to_user_ptr(attr->task_fd_query.buf); u32 len = buf ? strlen(buf) : 0, input_len; int err = 0; if (put_user(len, &uattr->task_fd_query.buf_len)) return -EFAULT; input_len = attr->task_fd_query.buf_len; if (input_len && ubuf) { if (!len) { /* nothing to copy, just make ubuf NULL terminated */ char zero = '\0'; if (put_user(zero, ubuf)) return -EFAULT; } else if (input_len >= len + 1) { /* ubuf can hold the string with NULL terminator */ if (copy_to_user(ubuf, buf, len + 1)) return -EFAULT; } else { /* ubuf cannot hold the string with NULL terminator, * do a partial copy with NULL terminator. 
*/ char zero = '\0'; err = -ENOSPC; if (copy_to_user(ubuf, buf, input_len - 1)) return -EFAULT; if (put_user(zero, ubuf + input_len - 1)) return -EFAULT; } } if (put_user(prog_id, &uattr->task_fd_query.prog_id) || put_user(fd_type, &uattr->task_fd_query.fd_type) || put_user(probe_offset, &uattr->task_fd_query.probe_offset) || put_user(probe_addr, &uattr->task_fd_query.probe_addr)) return -EFAULT; return err; } #define BPF_TASK_FD_QUERY_LAST_FIELD task_fd_query.probe_addr static int bpf_task_fd_query(const union bpf_attr *attr, union bpf_attr __user *uattr) { pid_t pid = attr->task_fd_query.pid; u32 fd = attr->task_fd_query.fd; const struct perf_event *event; struct task_struct *task; struct file *file; int err; if (CHECK_ATTR(BPF_TASK_FD_QUERY)) return -EINVAL; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (attr->task_fd_query.flags != 0) return -EINVAL; rcu_read_lock(); task = get_pid_task(find_vpid(pid), PIDTYPE_PID); rcu_read_unlock(); if (!task) return -ENOENT; err = 0; file = fget_task(task, fd); put_task_struct(task); if (!file) return -EBADF; if (file->f_op == &bpf_link_fops || file->f_op == &bpf_link_fops_poll) { struct bpf_link *link = file->private_data; if (link->ops == &bpf_raw_tp_link_lops) { struct bpf_raw_tp_link *raw_tp = container_of(link, struct bpf_raw_tp_link, link); struct bpf_raw_event_map *btp = raw_tp->btp; err = bpf_task_fd_query_copy(attr, uattr, raw_tp->link.prog->aux->id, BPF_FD_TYPE_RAW_TRACEPOINT, btp->tp->name, 0, 0); goto put_file; } goto out_not_supp; } event = perf_get_event(file); if (!IS_ERR(event)) { u64 probe_offset, probe_addr; u32 prog_id, fd_type; const char *buf; err = bpf_get_perf_event_info(event, &prog_id, &fd_type, &buf, &probe_offset, &probe_addr, NULL); if (!err) err = bpf_task_fd_query_copy(attr, uattr, prog_id, fd_type, buf, probe_offset, probe_addr); goto put_file; } out_not_supp: err = -ENOTSUPP; put_file: fput(file); return err; } #define BPF_MAP_BATCH_LAST_FIELD batch.flags #define BPF_DO_BATCH(fn, ...) 
\ do { \ if (!fn) { \ err = -ENOTSUPP; \ goto err_put; \ } \ err = fn(__VA_ARGS__); \ } while (0) static int bpf_map_do_batch(const union bpf_attr *attr, union bpf_attr __user *uattr, int cmd) { bool has_read = cmd == BPF_MAP_LOOKUP_BATCH || cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH; bool has_write = cmd != BPF_MAP_LOOKUP_BATCH; struct bpf_map *map; int err; if (CHECK_ATTR(BPF_MAP_BATCH)) return -EINVAL; CLASS(fd, f)(attr->batch.map_fd); map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); if (has_write) bpf_map_write_active_inc(map); if (has_read && !(map_get_sys_perms(map, f) & FMODE_CAN_READ)) { err = -EPERM; goto err_put; } if (has_write && !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { err = -EPERM; goto err_put; } if (cmd == BPF_MAP_LOOKUP_BATCH) BPF_DO_BATCH(map->ops->map_lookup_batch, map, attr, uattr); else if (cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH) BPF_DO_BATCH(map->ops->map_lookup_and_delete_batch, map, attr, uattr); else if (cmd == BPF_MAP_UPDATE_BATCH) BPF_DO_BATCH(map->ops->map_update_batch, map, fd_file(f), attr, uattr); else BPF_DO_BATCH(map->ops->map_delete_batch, map, attr, uattr); err_put: if (has_write) { maybe_wait_bpf_programs(map); bpf_map_write_active_dec(map); } return err; } #define BPF_LINK_CREATE_LAST_FIELD link_create.uprobe_multi.pid static int link_create(union bpf_attr *attr, bpfptr_t uattr) { struct bpf_prog *prog; int ret; if (CHECK_ATTR(BPF_LINK_CREATE)) return -EINVAL; if (attr->link_create.attach_type == BPF_STRUCT_OPS) return bpf_struct_ops_link_create(attr); prog = bpf_prog_get(attr->link_create.prog_fd); if (IS_ERR(prog)) return PTR_ERR(prog); ret = bpf_prog_attach_check_attach_type(prog, attr->link_create.attach_type); if (ret) goto out; switch (prog->type) { case BPF_PROG_TYPE_CGROUP_SKB: case BPF_PROG_TYPE_CGROUP_SOCK: case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: case BPF_PROG_TYPE_SOCK_OPS: case BPF_PROG_TYPE_CGROUP_DEVICE: case BPF_PROG_TYPE_CGROUP_SYSCTL: case BPF_PROG_TYPE_CGROUP_SOCKOPT: ret = cgroup_bpf_link_attach(attr, prog); break; case BPF_PROG_TYPE_EXT: ret = bpf_tracing_prog_attach(prog, attr->link_create.target_fd, attr->link_create.target_btf_id, attr->link_create.tracing.cookie); break; case BPF_PROG_TYPE_LSM: case BPF_PROG_TYPE_TRACING: if (attr->link_create.attach_type != prog->expected_attach_type) { ret = -EINVAL; goto out; } if (prog->expected_attach_type == BPF_TRACE_RAW_TP) ret = bpf_raw_tp_link_attach(prog, NULL, attr->link_create.tracing.cookie); else if (prog->expected_attach_type == BPF_TRACE_ITER) ret = bpf_iter_link_attach(attr, uattr, prog); else if (prog->expected_attach_type == BPF_LSM_CGROUP) ret = cgroup_bpf_link_attach(attr, prog); else ret = bpf_tracing_prog_attach(prog, attr->link_create.target_fd, attr->link_create.target_btf_id, attr->link_create.tracing.cookie); break; case BPF_PROG_TYPE_FLOW_DISSECTOR: case BPF_PROG_TYPE_SK_LOOKUP: ret = netns_bpf_link_create(attr, prog); break; case BPF_PROG_TYPE_SK_MSG: case BPF_PROG_TYPE_SK_SKB: ret = sock_map_link_create(attr, prog); break; #ifdef CONFIG_NET case BPF_PROG_TYPE_XDP: ret = bpf_xdp_link_attach(attr, prog); break; case BPF_PROG_TYPE_SCHED_CLS: if (attr->link_create.attach_type == BPF_TCX_INGRESS || attr->link_create.attach_type == BPF_TCX_EGRESS) ret = tcx_link_attach(attr, prog); else ret = netkit_link_attach(attr, prog); break; case BPF_PROG_TYPE_NETFILTER: ret = bpf_nf_link_attach(attr, prog); break; #endif case BPF_PROG_TYPE_PERF_EVENT: case BPF_PROG_TYPE_TRACEPOINT: ret = bpf_perf_link_attach(attr, prog); break; case BPF_PROG_TYPE_KPROBE: if 
(attr->link_create.attach_type == BPF_PERF_EVENT) ret = bpf_perf_link_attach(attr, prog); else if (attr->link_create.attach_type == BPF_TRACE_KPROBE_MULTI || attr->link_create.attach_type == BPF_TRACE_KPROBE_SESSION) ret = bpf_kprobe_multi_link_attach(attr, prog); else if (attr->link_create.attach_type == BPF_TRACE_UPROBE_MULTI || attr->link_create.attach_type == BPF_TRACE_UPROBE_SESSION) ret = bpf_uprobe_multi_link_attach(attr, prog); break; default: ret = -EINVAL; } out: if (ret < 0) bpf_prog_put(prog); return ret; } static int link_update_map(struct bpf_link *link, union bpf_attr *attr) { struct bpf_map *new_map, *old_map = NULL; int ret; new_map = bpf_map_get(attr->link_update.new_map_fd); if (IS_ERR(new_map)) return PTR_ERR(new_map); if (attr->link_update.flags & BPF_F_REPLACE) { old_map = bpf_map_get(attr->link_update.old_map_fd); if (IS_ERR(old_map)) { ret = PTR_ERR(old_map); goto out_put; } } else if (attr->link_update.old_map_fd) { ret = -EINVAL; goto out_put; } ret = link->ops->update_map(link, new_map, old_map); if (old_map) bpf_map_put(old_map); out_put: bpf_map_put(new_map); return ret; } #define BPF_LINK_UPDATE_LAST_FIELD link_update.old_prog_fd static int link_update(union bpf_attr *attr) { struct bpf_prog *old_prog = NULL, *new_prog; struct bpf_link *link; u32 flags; int ret; if (CHECK_ATTR(BPF_LINK_UPDATE)) return -EINVAL; flags = attr->link_update.flags; if (flags & ~BPF_F_REPLACE) return -EINVAL; link = bpf_link_get_from_fd(attr->link_update.link_fd); if (IS_ERR(link)) return PTR_ERR(link); if (link->ops->update_map) { ret = link_update_map(link, attr); goto out_put_link; } new_prog = bpf_prog_get(attr->link_update.new_prog_fd); if (IS_ERR(new_prog)) { ret = PTR_ERR(new_prog); goto out_put_link; } if (flags & BPF_F_REPLACE) { old_prog = bpf_prog_get(attr->link_update.old_prog_fd); if (IS_ERR(old_prog)) { ret = PTR_ERR(old_prog); old_prog = NULL; goto out_put_progs; } } else if (attr->link_update.old_prog_fd) { ret = -EINVAL; goto out_put_progs; } if (link->ops->update_prog) ret = link->ops->update_prog(link, new_prog, old_prog); else ret = -EINVAL; out_put_progs: if (old_prog) bpf_prog_put(old_prog); if (ret) bpf_prog_put(new_prog); out_put_link: bpf_link_put_direct(link); return ret; } #define BPF_LINK_DETACH_LAST_FIELD link_detach.link_fd static int link_detach(union bpf_attr *attr) { struct bpf_link *link; int ret; if (CHECK_ATTR(BPF_LINK_DETACH)) return -EINVAL; link = bpf_link_get_from_fd(attr->link_detach.link_fd); if (IS_ERR(link)) return PTR_ERR(link); if (link->ops->detach) ret = link->ops->detach(link); else ret = -EOPNOTSUPP; bpf_link_put_direct(link); return ret; } struct bpf_link *bpf_link_inc_not_zero(struct bpf_link *link) { return atomic64_fetch_add_unless(&link->refcnt, 1, 0) ? 
link : ERR_PTR(-ENOENT); } EXPORT_SYMBOL(bpf_link_inc_not_zero); struct bpf_link *bpf_link_by_id(u32 id) { struct bpf_link *link; if (!id) return ERR_PTR(-ENOENT); spin_lock_bh(&link_idr_lock); /* before link is "settled", ID is 0, pretend it doesn't exist yet */ link = idr_find(&link_idr, id); if (link) { if (link->id) link = bpf_link_inc_not_zero(link); else link = ERR_PTR(-EAGAIN); } else { link = ERR_PTR(-ENOENT); } spin_unlock_bh(&link_idr_lock); return link; } struct bpf_link *bpf_link_get_curr_or_next(u32 *id) { struct bpf_link *link; spin_lock_bh(&link_idr_lock); again: link = idr_get_next(&link_idr, id); if (link) { link = bpf_link_inc_not_zero(link); if (IS_ERR(link)) { (*id)++; goto again; } } spin_unlock_bh(&link_idr_lock); return link; } #define BPF_LINK_GET_FD_BY_ID_LAST_FIELD link_id static int bpf_link_get_fd_by_id(const union bpf_attr *attr) { struct bpf_link *link; u32 id = attr->link_id; int fd; if (CHECK_ATTR(BPF_LINK_GET_FD_BY_ID)) return -EINVAL; if (!capable(CAP_SYS_ADMIN)) return -EPERM; link = bpf_link_by_id(id); if (IS_ERR(link)) return PTR_ERR(link); fd = bpf_link_new_fd(link); if (fd < 0) bpf_link_put_direct(link); return fd; } DEFINE_MUTEX(bpf_stats_enabled_mutex); static int bpf_stats_release(struct inode *inode, struct file *file) { mutex_lock(&bpf_stats_enabled_mutex); static_key_slow_dec(&bpf_stats_enabled_key.key); mutex_unlock(&bpf_stats_enabled_mutex); return 0; } static const struct file_operations bpf_stats_fops = { .release = bpf_stats_release, }; static int bpf_enable_runtime_stats(void) { int fd; mutex_lock(&bpf_stats_enabled_mutex); /* Set a very high limit to avoid overflow */ if (static_key_count(&bpf_stats_enabled_key.key) > INT_MAX / 2) { mutex_unlock(&bpf_stats_enabled_mutex); return -EBUSY; } fd = anon_inode_getfd("bpf-stats", &bpf_stats_fops, NULL, O_CLOEXEC); if (fd >= 0) static_key_slow_inc(&bpf_stats_enabled_key.key); mutex_unlock(&bpf_stats_enabled_mutex); return fd; } #define BPF_ENABLE_STATS_LAST_FIELD enable_stats.type static int bpf_enable_stats(union bpf_attr *attr) { if (CHECK_ATTR(BPF_ENABLE_STATS)) return -EINVAL; if (!capable(CAP_SYS_ADMIN)) return -EPERM; switch (attr->enable_stats.type) { case BPF_STATS_RUN_TIME: return bpf_enable_runtime_stats(); default: break; } return -EINVAL; } #define BPF_ITER_CREATE_LAST_FIELD iter_create.flags static int bpf_iter_create(union bpf_attr *attr) { struct bpf_link *link; int err; if (CHECK_ATTR(BPF_ITER_CREATE)) return -EINVAL; if (attr->iter_create.flags) return -EINVAL; link = bpf_link_get_from_fd(attr->iter_create.link_fd); if (IS_ERR(link)) return PTR_ERR(link); err = bpf_iter_new_fd(link); bpf_link_put_direct(link); return err; } #define BPF_PROG_BIND_MAP_LAST_FIELD prog_bind_map.flags static int bpf_prog_bind_map(union bpf_attr *attr) { struct bpf_prog *prog; struct bpf_map *map; struct bpf_map **used_maps_old, **used_maps_new; int i, ret = 0; if (CHECK_ATTR(BPF_PROG_BIND_MAP)) return -EINVAL; if (attr->prog_bind_map.flags) return -EINVAL; prog = bpf_prog_get(attr->prog_bind_map.prog_fd); if (IS_ERR(prog)) return PTR_ERR(prog); map = bpf_map_get(attr->prog_bind_map.map_fd); if (IS_ERR(map)) { ret = PTR_ERR(map); goto out_prog_put; } mutex_lock(&prog->aux->used_maps_mutex); used_maps_old = prog->aux->used_maps; for (i = 0; i < prog->aux->used_map_cnt; i++) if (used_maps_old[i] == map) { bpf_map_put(map); goto out_unlock; } used_maps_new = kmalloc_array(prog->aux->used_map_cnt + 1, sizeof(used_maps_new[0]), GFP_KERNEL); if (!used_maps_new) { ret = -ENOMEM; goto out_unlock; } /* The bpf 
program will not access the bpf map, but for the sake of * simplicity, increase sleepable_refcnt for sleepable program as well. */ if (prog->sleepable) atomic64_inc(&map->sleepable_refcnt); memcpy(used_maps_new, used_maps_old, sizeof(used_maps_old[0]) * prog->aux->used_map_cnt); used_maps_new[prog->aux->used_map_cnt] = map; prog->aux->used_map_cnt++; prog->aux->used_maps = used_maps_new; kfree(used_maps_old); out_unlock: mutex_unlock(&prog->aux->used_maps_mutex); if (ret) bpf_map_put(map); out_prog_put: bpf_prog_put(prog); return ret; } #define BPF_TOKEN_CREATE_LAST_FIELD token_create.bpffs_fd static int token_create(union bpf_attr *attr) { if (CHECK_ATTR(BPF_TOKEN_CREATE)) return -EINVAL; /* no flags are supported yet */ if (attr->token_create.flags) return -EINVAL; return bpf_token_create(attr); } static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size) { union bpf_attr attr; int err; err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size); if (err) return err; size = min_t(u32, size, sizeof(attr)); /* copy attributes from user space, may be less than sizeof(bpf_attr) */ memset(&attr, 0, sizeof(attr)); if (copy_from_bpfptr(&attr, uattr, size) != 0) return -EFAULT; err = security_bpf(cmd, &attr, size); if (err < 0) return err; switch (cmd) { case BPF_MAP_CREATE: err = map_create(&attr); break; case BPF_MAP_LOOKUP_ELEM: err = map_lookup_elem(&attr); break; case BPF_MAP_UPDATE_ELEM: err = map_update_elem(&attr, uattr); break; case BPF_MAP_DELETE_ELEM: err = map_delete_elem(&attr, uattr); break; case BPF_MAP_GET_NEXT_KEY: err = map_get_next_key(&attr); break; case BPF_MAP_FREEZE: err = map_freeze(&attr); break; case BPF_PROG_LOAD: err = bpf_prog_load(&attr, uattr, size); break; case BPF_OBJ_PIN: err = bpf_obj_pin(&attr); break; case BPF_OBJ_GET: err = bpf_obj_get(&attr); break; case BPF_PROG_ATTACH: err = bpf_prog_attach(&attr); break; case BPF_PROG_DETACH: err = bpf_prog_detach(&attr); break; case BPF_PROG_QUERY: err = bpf_prog_query(&attr, uattr.user); break; case BPF_PROG_TEST_RUN: err = bpf_prog_test_run(&attr, uattr.user); break; case BPF_PROG_GET_NEXT_ID: err = bpf_obj_get_next_id(&attr, uattr.user, &prog_idr, &prog_idr_lock); break; case BPF_MAP_GET_NEXT_ID: err = bpf_obj_get_next_id(&attr, uattr.user, &map_idr, &map_idr_lock); break; case BPF_BTF_GET_NEXT_ID: err = bpf_obj_get_next_id(&attr, uattr.user, &btf_idr, &btf_idr_lock); break; case BPF_PROG_GET_FD_BY_ID: err = bpf_prog_get_fd_by_id(&attr); break; case BPF_MAP_GET_FD_BY_ID: err = bpf_map_get_fd_by_id(&attr); break; case BPF_OBJ_GET_INFO_BY_FD: err = bpf_obj_get_info_by_fd(&attr, uattr.user); break; case BPF_RAW_TRACEPOINT_OPEN: err = bpf_raw_tracepoint_open(&attr); break; case BPF_BTF_LOAD: err = bpf_btf_load(&attr, uattr, size); break; case BPF_BTF_GET_FD_BY_ID: err = bpf_btf_get_fd_by_id(&attr); break; case BPF_TASK_FD_QUERY: err = bpf_task_fd_query(&attr, uattr.user); break; case BPF_MAP_LOOKUP_AND_DELETE_ELEM: err = map_lookup_and_delete_elem(&attr); break; case BPF_MAP_LOOKUP_BATCH: err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_LOOKUP_BATCH); break; case BPF_MAP_LOOKUP_AND_DELETE_BATCH: err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_LOOKUP_AND_DELETE_BATCH); break; case BPF_MAP_UPDATE_BATCH: err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_UPDATE_BATCH); break; case BPF_MAP_DELETE_BATCH: err = bpf_map_do_batch(&attr, uattr.user, BPF_MAP_DELETE_BATCH); break; case BPF_LINK_CREATE: err = link_create(&attr, uattr); break; case BPF_LINK_UPDATE: err = link_update(&attr); break; case 
BPF_LINK_GET_FD_BY_ID: err = bpf_link_get_fd_by_id(&attr); break; case BPF_LINK_GET_NEXT_ID: err = bpf_obj_get_next_id(&attr, uattr.user, &link_idr, &link_idr_lock); break; case BPF_ENABLE_STATS: err = bpf_enable_stats(&attr); break; case BPF_ITER_CREATE: err = bpf_iter_create(&attr); break; case BPF_LINK_DETACH: err = link_detach(&attr); break; case BPF_PROG_BIND_MAP: err = bpf_prog_bind_map(&attr); break; case BPF_TOKEN_CREATE: err = token_create(&attr); break; default: err = -EINVAL; break; } return err; } SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { return __sys_bpf(cmd, USER_BPFPTR(uattr), size); } static bool syscall_prog_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (off < 0 || off >= U16_MAX) return false; if (off % size != 0) return false; return true; } BPF_CALL_3(bpf_sys_bpf, int, cmd, union bpf_attr *, attr, u32, attr_size) { switch (cmd) { case BPF_MAP_CREATE: case BPF_MAP_DELETE_ELEM: case BPF_MAP_UPDATE_ELEM: case BPF_MAP_FREEZE: case BPF_MAP_GET_FD_BY_ID: case BPF_PROG_LOAD: case BPF_BTF_LOAD: case BPF_LINK_CREATE: case BPF_RAW_TRACEPOINT_OPEN: break; default: return -EINVAL; } return __sys_bpf(cmd, KERNEL_BPFPTR(attr), attr_size); } /* To shut up -Wmissing-prototypes. * This function is used by the kernel light skeleton * to load bpf programs when modules are loaded or during kernel boot. * See tools/lib/bpf/skel_internal.h */ int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size); int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size) { struct bpf_prog * __maybe_unused prog; struct bpf_tramp_run_ctx __maybe_unused run_ctx; switch (cmd) { #ifdef CONFIG_BPF_JIT /* __bpf_prog_enter_sleepable used by trampoline and JIT */ case BPF_PROG_TEST_RUN: if (attr->test.data_in || attr->test.data_out || attr->test.ctx_out || attr->test.duration || attr->test.repeat || attr->test.flags) return -EINVAL; prog = bpf_prog_get_type(attr->test.prog_fd, BPF_PROG_TYPE_SYSCALL); if (IS_ERR(prog)) return PTR_ERR(prog); if (attr->test.ctx_size_in < prog->aux->max_ctx_offset || attr->test.ctx_size_in > U16_MAX) { bpf_prog_put(prog); return -EINVAL; } run_ctx.bpf_cookie = 0; if (!__bpf_prog_enter_sleepable_recur(prog, &run_ctx)) { /* recursion detected */ __bpf_prog_exit_sleepable_recur(prog, 0, &run_ctx); bpf_prog_put(prog); return -EBUSY; } attr->test.retval = bpf_prog_run(prog, (void *) (long) attr->test.ctx_in); __bpf_prog_exit_sleepable_recur(prog, 0 /* bpf_prog_run does runtime stats */, &run_ctx); bpf_prog_put(prog); return 0; #endif default: return ____bpf_sys_bpf(cmd, attr, size); } } EXPORT_SYMBOL(kern_sys_bpf); static const struct bpf_func_proto bpf_sys_bpf_proto = { .func = bpf_sys_bpf, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, }; const struct bpf_func_proto * __weak tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { return bpf_base_func_proto(func_id, prog); } BPF_CALL_1(bpf_sys_close, u32, fd) { /* When bpf program calls this helper there should not be * an fdget() without matching completed fdput(). 
* This helper is allowed in the following callchain only: * sys_bpf->prog_test_run->bpf_prog->bpf_sys_close */ return close_fd(fd); } static const struct bpf_func_proto bpf_sys_close_proto = { .func = bpf_sys_close, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_kallsyms_lookup_name, const char *, name, int, name_sz, int, flags, u64 *, res) { *res = 0; if (flags) return -EINVAL; if (name_sz <= 1 || name[name_sz - 1]) return -EINVAL; if (!bpf_dump_raw_ok(current_cred())) return -EPERM; *res = kallsyms_lookup_name(name); return *res ? 0 : -ENOENT; } static const struct bpf_func_proto bpf_kallsyms_lookup_name_proto = { .func = bpf_kallsyms_lookup_name, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_MEM, .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED, .arg4_size = sizeof(u64), }; static const struct bpf_func_proto * syscall_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_sys_bpf: return !bpf_token_capable(prog->aux->token, CAP_PERFMON) ? NULL : &bpf_sys_bpf_proto; case BPF_FUNC_btf_find_by_name_kind: return &bpf_btf_find_by_name_kind_proto; case BPF_FUNC_sys_close: return &bpf_sys_close_proto; case BPF_FUNC_kallsyms_lookup_name: return &bpf_kallsyms_lookup_name_proto; default: return tracing_prog_func_proto(func_id, prog); } } const struct bpf_verifier_ops bpf_syscall_verifier_ops = { .get_func_proto = syscall_prog_func_proto, .is_valid_access = syscall_prog_is_valid_access, }; const struct bpf_prog_ops bpf_syscall_prog_ops = { .test_run = bpf_prog_test_run_syscall, }; #ifdef CONFIG_SYSCTL static int bpf_stats_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct static_key *key = (struct static_key *)table->data; static int saved_val; int val, ret; struct ctl_table tmp = { .data = &val, .maxlen = sizeof(val), .mode = table->mode, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }; if (write && !capable(CAP_SYS_ADMIN)) return -EPERM; mutex_lock(&bpf_stats_enabled_mutex); val = saved_val; ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); if (write && !ret && val != saved_val) { if (val) static_key_slow_inc(key); else static_key_slow_dec(key); saved_val = val; } mutex_unlock(&bpf_stats_enabled_mutex); return ret; } void __weak unpriv_ebpf_notify(int new_state) { } static int bpf_unpriv_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret, unpriv_enable = *(int *)table->data; bool locked_state = unpriv_enable == 1; struct ctl_table tmp = *table; if (write && !capable(CAP_SYS_ADMIN)) return -EPERM; tmp.data = &unpriv_enable; ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); if (write && !ret) { if (locked_state && unpriv_enable != 1) return -EPERM; *(int *)table->data = unpriv_enable; } if (write) unpriv_ebpf_notify(unpriv_enable); return ret; } static struct ctl_table bpf_syscall_table[] = { { .procname = "unprivileged_bpf_disabled", .data = &sysctl_unprivileged_bpf_disabled, .maxlen = sizeof(sysctl_unprivileged_bpf_disabled), .mode = 0644, .proc_handler = bpf_unpriv_handler, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, { .procname = "bpf_stats_enabled", .data = &bpf_stats_enabled_key.key, .mode = 0644, .proc_handler = bpf_stats_handler, }, }; static int __init bpf_syscall_sysctl_init(void) { register_sysctl_init("kernel", bpf_syscall_table); return 0; } 
late_initcall(bpf_syscall_sysctl_init); #endif /* CONFIG_SYSCTL */
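/*
 * A minimal userspace sketch (not part of the kernel sources above) of how the
 * bpf(2) command dispatch in __sys_bpf() and the BPF_OBJ_GET_INFO_BY_FD path
 * handled by bpf_prog_get_info_by_fd() are typically exercised.  The wrapper
 * name bpf_prog_query_info() is illustrative; zeroing both structures matters
 * because bpf_check_uarg_tail_zero() rejects non-zero bytes beyond the
 * kernel's view of the attribute.
 */
#include <linux/bpf.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int bpf_prog_query_info(int prog_fd, struct bpf_prog_info *info)
{
	union bpf_attr attr;

	memset(info, 0, sizeof(*info));
	memset(&attr, 0, sizeof(attr));
	attr.info.bpf_fd = prog_fd;			/* resolved by bpf_obj_get_info_by_fd() */
	attr.info.info_len = sizeof(*info);		/* clamped to the kernel's sizeof(info) */
	attr.info.info = (__u64)(unsigned long)info;	/* read back via u64_to_user_ptr() */

	/* on success the kernel fills *info and updates attr.info.info_len */
	return syscall(__NR_bpf, BPF_OBJ_GET_INFO_BY_FD, &attr, sizeof(attr));
}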
// SPDX-License-Identifier: GPL-2.0-or-later /* * IPv6 Syncookies implementation for the Linux kernel * * Authors: * Glenn Griffin <ggriffin.kernel@gmail.com> * * Based on IPv4 implementation by Andi Kleen * linux/net/ipv4/syncookies.c */ #include <linux/tcp.h> #include <linux/random.h> #include <linux/siphash.h> #include <linux/kernel.h> #include <net/secure_seq.h> #include <net/ipv6.h> #include <net/tcp.h> #define COOKIEBITS 24 /* Upper bits store count */ #define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1) static siphash_aligned_key_t syncookie6_secret[2]; /* RFC 2460, Section 8.3: * [ipv6 tcp] MSS must be computed as the maximum packet size minus 60 [..] * * Due to IPV6_MIN_MTU=1280 the lowest possible MSS is 1220, which allows * using higher values than ipv4 tcp syncookies. * The other values are chosen based on ethernet (1500 and 9k MTU), plus * one that accounts for common encap (PPPoe) overhead. Table must be sorted.
*/ static __u16 const msstab[] = { 1280 - 60, /* IPV6_MIN_MTU - 60 */ 1480 - 60, 1500 - 60, 9000 - 60, }; static u32 cookie_hash(const struct in6_addr *saddr, const struct in6_addr *daddr, __be16 sport, __be16 dport, u32 count, int c) { const struct { struct in6_addr saddr; struct in6_addr daddr; u32 count; __be16 sport; __be16 dport; } __aligned(SIPHASH_ALIGNMENT) combined = { .saddr = *saddr, .daddr = *daddr, .count = count, .sport = sport, .dport = dport }; net_get_random_once(syncookie6_secret, sizeof(syncookie6_secret)); return siphash(&combined, offsetofend(typeof(combined), dport), &syncookie6_secret[c]); } static __u32 secure_tcp_syn_cookie(const struct in6_addr *saddr, const struct in6_addr *daddr, __be16 sport, __be16 dport, __u32 sseq, __u32 data) { u32 count = tcp_cookie_time(); return (cookie_hash(saddr, daddr, sport, dport, 0, 0) + sseq + (count << COOKIEBITS) + ((cookie_hash(saddr, daddr, sport, dport, count, 1) + data) & COOKIEMASK)); } static __u32 check_tcp_syn_cookie(__u32 cookie, const struct in6_addr *saddr, const struct in6_addr *daddr, __be16 sport, __be16 dport, __u32 sseq) { __u32 diff, count = tcp_cookie_time(); cookie -= cookie_hash(saddr, daddr, sport, dport, 0, 0) + sseq; diff = (count - (cookie >> COOKIEBITS)) & ((__u32) -1 >> COOKIEBITS); if (diff >= MAX_SYNCOOKIE_AGE) return (__u32)-1; return (cookie - cookie_hash(saddr, daddr, sport, dport, count - diff, 1)) & COOKIEMASK; } u32 __cookie_v6_init_sequence(const struct ipv6hdr *iph, const struct tcphdr *th, __u16 *mssp) { int mssind; const __u16 mss = *mssp; for (mssind = ARRAY_SIZE(msstab) - 1; mssind ; mssind--) if (mss >= msstab[mssind]) break; *mssp = msstab[mssind]; return secure_tcp_syn_cookie(&iph->saddr, &iph->daddr, th->source, th->dest, ntohl(th->seq), mssind); } EXPORT_SYMBOL_GPL(__cookie_v6_init_sequence); __u32 cookie_v6_init_sequence(const struct sk_buff *skb, __u16 *mssp) { const struct ipv6hdr *iph = ipv6_hdr(skb); const struct tcphdr *th = tcp_hdr(skb); return __cookie_v6_init_sequence(iph, th, mssp); } int __cookie_v6_check(const struct ipv6hdr *iph, const struct tcphdr *th) { __u32 cookie = ntohl(th->ack_seq) - 1; __u32 seq = ntohl(th->seq) - 1; __u32 mssind; mssind = check_tcp_syn_cookie(cookie, &iph->saddr, &iph->daddr, th->source, th->dest, seq); return mssind < ARRAY_SIZE(msstab) ? 
msstab[mssind] : 0; } EXPORT_SYMBOL_GPL(__cookie_v6_check); static struct request_sock *cookie_tcp_check(struct net *net, struct sock *sk, struct sk_buff *skb) { struct tcp_options_received tcp_opt; u32 tsoff = 0; int mss; if (tcp_synq_no_recent_overflow(sk)) goto out; mss = __cookie_v6_check(ipv6_hdr(skb), tcp_hdr(skb)); if (!mss) { __NET_INC_STATS(net, LINUX_MIB_SYNCOOKIESFAILED); goto out; } __NET_INC_STATS(net, LINUX_MIB_SYNCOOKIESRECV); /* check for timestamp cookie support */ memset(&tcp_opt, 0, sizeof(tcp_opt)); tcp_parse_options(net, skb, &tcp_opt, 0, NULL); if (tcp_opt.saw_tstamp && tcp_opt.rcv_tsecr) { tsoff = secure_tcpv6_ts_off(net, ipv6_hdr(skb)->daddr.s6_addr32, ipv6_hdr(skb)->saddr.s6_addr32); tcp_opt.rcv_tsecr -= tsoff; } if (!cookie_timestamp_decode(net, &tcp_opt)) goto out; return cookie_tcp_reqsk_alloc(&tcp6_request_sock_ops, sk, skb, &tcp_opt, mss, tsoff); out: return ERR_PTR(-EINVAL); } struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) { const struct tcphdr *th = tcp_hdr(skb); struct ipv6_pinfo *np = inet6_sk(sk); struct tcp_sock *tp = tcp_sk(sk); struct inet_request_sock *ireq; struct net *net = sock_net(sk); struct request_sock *req; struct dst_entry *dst; struct sock *ret = sk; __u8 rcv_wscale; int full_space; SKB_DR(reason); if (!READ_ONCE(net->ipv4.sysctl_tcp_syncookies) || !th->ack || th->rst) goto out; if (cookie_bpf_ok(skb)) { req = cookie_bpf_check(sk, skb); } else { req = cookie_tcp_check(net, sk, skb); if (IS_ERR(req)) goto out; } if (!req) { SKB_DR_SET(reason, NO_SOCKET); goto out_drop; } ireq = inet_rsk(req); ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr; ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr; if (security_inet_conn_request(sk, skb, req)) { SKB_DR_SET(reason, SECURITY_HOOK); goto out_free; } if (ipv6_opt_accepted(sk, skb, &TCP_SKB_CB(skb)->header.h6) || np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) { refcount_inc(&skb->users); ireq->pktopts = skb; } /* So that link locals have meaning */ if (!sk->sk_bound_dev_if && ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL) ireq->ir_iif = tcp_v6_iif(skb); tcp_ao_syncookie(sk, skb, req, AF_INET6); /* * We need to lookup the dst_entry to get the correct window size. * This is taken from tcp_v6_syn_recv_sock. Somebody please enlighten * me if there is a preferred way. */ { struct in6_addr *final_p, final; struct flowi6 fl6; memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_proto = IPPROTO_TCP; fl6.daddr = ireq->ir_v6_rmt_addr; final_p = fl6_update_dst(&fl6, rcu_dereference(np->opt), &final); fl6.saddr = ireq->ir_v6_loc_addr; fl6.flowi6_oif = ireq->ir_iif; fl6.flowi6_mark = ireq->ir_mark; fl6.fl6_dport = ireq->ir_rmt_port; fl6.fl6_sport = inet_sk(sk)->inet_sport; fl6.flowi6_uid = sk->sk_uid; security_req_classify_flow(req, flowi6_to_flowi_common(&fl6)); dst = ip6_dst_lookup_flow(net, sk, &fl6, final_p); if (IS_ERR(dst)) { SKB_DR_SET(reason, IP_OUTNOROUTES); goto out_free; } } req->rsk_window_clamp = READ_ONCE(tp->window_clamp) ? 
:dst_metric(dst, RTAX_WINDOW); /* limit the window selection if the user enforce a smaller rx buffer */ full_space = tcp_full_space(sk); if (sk->sk_userlocks & SOCK_RCVBUF_LOCK && (req->rsk_window_clamp > full_space || req->rsk_window_clamp == 0)) req->rsk_window_clamp = full_space; tcp_select_initial_window(sk, full_space, req->mss, &req->rsk_rcv_wnd, &req->rsk_window_clamp, ireq->wscale_ok, &rcv_wscale, dst_metric(dst, RTAX_INITRWND)); /* req->syncookie is set true only if ACK is validated * by BPF kfunc, then, rcv_wscale is already configured. */ if (!req->syncookie) ireq->rcv_wscale = rcv_wscale; ireq->ecn_ok &= cookie_ecn_ok(net, dst); ret = tcp_get_cookie_sock(sk, skb, req, dst); if (!ret) { SKB_DR_SET(reason, NO_SOCKET); goto out_drop; } out: return ret; out_free: reqsk_free(req); out_drop: sk_skb_reason_drop(sk, skb, reason); return NULL; }
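/*
 * A self-contained sketch of the cookie arithmetic used by
 * secure_tcp_syn_cookie() and check_tcp_syn_cookie() above, with a stub keyed
 * hash standing in for the SipHash-based cookie_hash(); all names below are
 * illustrative.  The top 8 bits carry the low bits of the tcp_cookie_time()
 * minute counter, the low COOKIEBITS carry the msstab index, and both hash
 * terms cancel on the decode side as long as the counter has aged less than
 * MAX_SYNCOOKIE_AGE.
 */
#include <stdint.h>
#include <stdio.h>

#define SKETCH_COOKIEBITS 24
#define SKETCH_COOKIEMASK (((uint32_t)1 << SKETCH_COOKIEBITS) - 1)

/* stand-in for cookie_hash(saddr, daddr, sport, dport, count, c) */
static uint32_t stub_hash(uint32_t count, int c)
{
	return 0x9e3779b9u * (count + 1) ^ (uint32_t)c;
}

static uint32_t sketch_encode(uint32_t sseq, uint32_t mssind, uint32_t count)
{
	return stub_hash(0, 0) + sseq + (count << SKETCH_COOKIEBITS) +
	       ((stub_hash(count, 1) + mssind) & SKETCH_COOKIEMASK);
}

static uint32_t sketch_decode(uint32_t cookie, uint32_t sseq, uint32_t count)
{
	uint32_t diff;

	cookie -= stub_hash(0, 0) + sseq;
	/* age in minutes, modulo 2^8; the kernel rejects diff >= MAX_SYNCOOKIE_AGE */
	diff = (count - (cookie >> SKETCH_COOKIEBITS)) & ((uint32_t)-1 >> SKETCH_COOKIEBITS);
	return (cookie - stub_hash(count - diff, 1)) & SKETCH_COOKIEMASK;
}

int main(void)
{
	uint32_t cookie = sketch_encode(12345, 3 /* msstab index */, 77 /* minute counter */);

	/* recovers index 3 even when checked one minute later */
	printf("%u\n", (unsigned)sketch_decode(cookie, 12345, 78));
	return 0;
}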
// SPDX-License-Identifier: GPL-2.0 /* * linux/ipc/msg.c * Copyright (C) 1992 Krishna Balasubramanian * * Removed all the remaining kerneld mess * Catch the -EFAULT stuff properly * Use GFP_KERNEL for messages as in 1.2 * Fixed up the unchecked user space derefs * Copyright (C) 1998 Alan Cox & Andi Kleen * * /proc/sysvipc/msg support (c) 1999 Dragos Acostachioaie <dragos@iname.com> * * mostly rewritten, threaded and wake-one semantics added * MSGMAX limit removed, sysctl's added * (c) 1999 Manfred Spraul <manfred@colorfullife.com> * * support for audit of ipc object properties and permission changes * Dustin Kirkland <dustin.kirkland@us.ibm.com> * * namespaces support * OpenVZ, SWsoft Inc.
* Pavel Emelianov <xemul@openvz.org> */ #include <linux/capability.h> #include <linux/msg.h> #include <linux/spinlock.h> #include <linux/init.h> #include <linux/mm.h> #include <linux/proc_fs.h> #include <linux/list.h> #include <linux/security.h> #include <linux/sched/wake_q.h> #include <linux/syscalls.h> #include <linux/audit.h> #include <linux/seq_file.h> #include <linux/rwsem.h> #include <linux/nsproxy.h> #include <linux/ipc_namespace.h> #include <linux/rhashtable.h> #include <linux/percpu_counter.h> #include <asm/current.h> #include <linux/uaccess.h> #include "util.h" /* one msq_queue structure for each present queue on the system */ struct msg_queue { struct kern_ipc_perm q_perm; time64_t q_stime; /* last msgsnd time */ time64_t q_rtime; /* last msgrcv time */ time64_t q_ctime; /* last change time */ unsigned long q_cbytes; /* current number of bytes on queue */ unsigned long q_qnum; /* number of messages in queue */ unsigned long q_qbytes; /* max number of bytes on queue */ struct pid *q_lspid; /* pid of last msgsnd */ struct pid *q_lrpid; /* last receive pid */ struct list_head q_messages; struct list_head q_receivers; struct list_head q_senders; } __randomize_layout; /* * MSG_BARRIER Locking: * * Similar to the optimization used in ipc/mqueue.c, one syscall return path * does not acquire any locks when it sees that a message exists in * msg_receiver.r_msg. Therefore r_msg is set using smp_store_release() * and accessed using READ_ONCE()+smp_acquire__after_ctrl_dep(). In addition, * wake_q_add_safe() is used. See ipc/mqueue.c for more details */ /* one msg_receiver structure for each sleeping receiver */ struct msg_receiver { struct list_head r_list; struct task_struct *r_tsk; int r_mode; long r_msgtype; long r_maxsize; struct msg_msg *r_msg; }; /* one msg_sender for each sleeping sender */ struct msg_sender { struct list_head list; struct task_struct *tsk; size_t msgsz; }; #define SEARCH_ANY 1 #define SEARCH_EQUAL 2 #define SEARCH_NOTEQUAL 3 #define SEARCH_LESSEQUAL 4 #define SEARCH_NUMBER 5 #define msg_ids(ns) ((ns)->ids[IPC_MSG_IDS]) static inline struct msg_queue *msq_obtain_object(struct ipc_namespace *ns, int id) { struct kern_ipc_perm *ipcp = ipc_obtain_object_idr(&msg_ids(ns), id); if (IS_ERR(ipcp)) return ERR_CAST(ipcp); return container_of(ipcp, struct msg_queue, q_perm); } static inline struct msg_queue *msq_obtain_object_check(struct ipc_namespace *ns, int id) { struct kern_ipc_perm *ipcp = ipc_obtain_object_check(&msg_ids(ns), id); if (IS_ERR(ipcp)) return ERR_CAST(ipcp); return container_of(ipcp, struct msg_queue, q_perm); } static inline void msg_rmid(struct ipc_namespace *ns, struct msg_queue *s) { ipc_rmid(&msg_ids(ns), &s->q_perm); } static void msg_rcu_free(struct rcu_head *head) { struct kern_ipc_perm *p = container_of(head, struct kern_ipc_perm, rcu); struct msg_queue *msq = container_of(p, struct msg_queue, q_perm); security_msg_queue_free(&msq->q_perm); kfree(msq); } /** * newque - Create a new msg queue * @ns: namespace * @params: ptr to the structure that contains the key and msgflg * * Called with msg_ids.rwsem held (writer) */ static int newque(struct ipc_namespace *ns, struct ipc_params *params) { struct msg_queue *msq; int retval; key_t key = params->key; int msgflg = params->flg; msq = kmalloc(sizeof(*msq), GFP_KERNEL_ACCOUNT); if (unlikely(!msq)) return -ENOMEM; msq->q_perm.mode = msgflg & S_IRWXUGO; msq->q_perm.key = key; msq->q_perm.security = NULL; retval = security_msg_queue_alloc(&msq->q_perm); if (retval) { kfree(msq); return retval; } 
msq->q_stime = msq->q_rtime = 0; msq->q_ctime = ktime_get_real_seconds(); msq->q_cbytes = msq->q_qnum = 0; msq->q_qbytes = ns->msg_ctlmnb; msq->q_lspid = msq->q_lrpid = NULL; INIT_LIST_HEAD(&msq->q_messages); INIT_LIST_HEAD(&msq->q_receivers); INIT_LIST_HEAD(&msq->q_senders); /* ipc_addid() locks msq upon success. */ retval = ipc_addid(&msg_ids(ns), &msq->q_perm, ns->msg_ctlmni); if (retval < 0) { ipc_rcu_putref(&msq->q_perm, msg_rcu_free); return retval; } ipc_unlock_object(&msq->q_perm); rcu_read_unlock(); return msq->q_perm.id; } static inline bool msg_fits_inqueue(struct msg_queue *msq, size_t msgsz) { return msgsz + msq->q_cbytes <= msq->q_qbytes && 1 + msq->q_qnum <= msq->q_qbytes; } static inline void ss_add(struct msg_queue *msq, struct msg_sender *mss, size_t msgsz) { mss->tsk = current; mss->msgsz = msgsz; /* * No memory barrier required: we did ipc_lock_object(), * and the waker obtains that lock before calling wake_q_add(). */ __set_current_state(TASK_INTERRUPTIBLE); list_add_tail(&mss->list, &msq->q_senders); } static inline void ss_del(struct msg_sender *mss) { if (mss->list.next) list_del(&mss->list); } static void ss_wakeup(struct msg_queue *msq, struct wake_q_head *wake_q, bool kill) { struct msg_sender *mss, *t; struct task_struct *stop_tsk = NULL; struct list_head *h = &msq->q_senders; list_for_each_entry_safe(mss, t, h, list) { if (kill) mss->list.next = NULL; /* * Stop at the first task we don't wakeup, * we've already iterated the original * sender queue. */ else if (stop_tsk == mss->tsk) break; /* * We are not in an EIDRM scenario here, therefore * verify that we really need to wakeup the task. * To maintain current semantics and wakeup order, * move the sender to the tail on behalf of the * blocked task. */ else if (!msg_fits_inqueue(msq, mss->msgsz)) { if (!stop_tsk) stop_tsk = mss->tsk; list_move_tail(&mss->list, &msq->q_senders); continue; } wake_q_add(wake_q, mss->tsk); } } static void expunge_all(struct msg_queue *msq, int res, struct wake_q_head *wake_q) { struct msg_receiver *msr, *t; list_for_each_entry_safe(msr, t, &msq->q_receivers, r_list) { struct task_struct *r_tsk; r_tsk = get_task_struct(msr->r_tsk); /* see MSG_BARRIER for purpose/pairing */ smp_store_release(&msr->r_msg, ERR_PTR(res)); wake_q_add_safe(wake_q, r_tsk); } } /* * freeque() wakes up waiters on the sender and receiver waiting queue, * removes the message queue from message queue ID IDR, and cleans up all the * messages associated with this queue. * * msg_ids.rwsem (writer) and the spinlock for this message queue are held * before freeque() is called. msg_ids.rwsem remains locked on exit. 
*/ static void freeque(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) __releases(RCU) __releases(&msq->q_perm) { struct msg_msg *msg, *t; struct msg_queue *msq = container_of(ipcp, struct msg_queue, q_perm); DEFINE_WAKE_Q(wake_q); expunge_all(msq, -EIDRM, &wake_q); ss_wakeup(msq, &wake_q, true); msg_rmid(ns, msq); ipc_unlock_object(&msq->q_perm); wake_up_q(&wake_q); rcu_read_unlock(); list_for_each_entry_safe(msg, t, &msq->q_messages, m_list) { percpu_counter_sub_local(&ns->percpu_msg_hdrs, 1); free_msg(msg); } percpu_counter_sub_local(&ns->percpu_msg_bytes, msq->q_cbytes); ipc_update_pid(&msq->q_lspid, NULL); ipc_update_pid(&msq->q_lrpid, NULL); ipc_rcu_putref(&msq->q_perm, msg_rcu_free); } long ksys_msgget(key_t key, int msgflg) { struct ipc_namespace *ns; static const struct ipc_ops msg_ops = { .getnew = newque, .associate = security_msg_queue_associate, }; struct ipc_params msg_params; ns = current->nsproxy->ipc_ns; msg_params.key = key; msg_params.flg = msgflg; return ipcget(ns, &msg_ids(ns), &msg_ops, &msg_params); } SYSCALL_DEFINE2(msgget, key_t, key, int, msgflg) { return ksys_msgget(key, msgflg); } static inline unsigned long copy_msqid_to_user(void __user *buf, struct msqid64_ds *in, int version) { switch (version) { case IPC_64: return copy_to_user(buf, in, sizeof(*in)); case IPC_OLD: { struct msqid_ds out; memset(&out, 0, sizeof(out)); ipc64_perm_to_ipc_perm(&in->msg_perm, &out.msg_perm); out.msg_stime = in->msg_stime; out.msg_rtime = in->msg_rtime; out.msg_ctime = in->msg_ctime; if (in->msg_cbytes > USHRT_MAX) out.msg_cbytes = USHRT_MAX; else out.msg_cbytes = in->msg_cbytes; out.msg_lcbytes = in->msg_cbytes; if (in->msg_qnum > USHRT_MAX) out.msg_qnum = USHRT_MAX; else out.msg_qnum = in->msg_qnum; if (in->msg_qbytes > USHRT_MAX) out.msg_qbytes = USHRT_MAX; else out.msg_qbytes = in->msg_qbytes; out.msg_lqbytes = in->msg_qbytes; out.msg_lspid = in->msg_lspid; out.msg_lrpid = in->msg_lrpid; return copy_to_user(buf, &out, sizeof(out)); } default: return -EINVAL; } } static inline unsigned long copy_msqid_from_user(struct msqid64_ds *out, void __user *buf, int version) { switch (version) { case IPC_64: if (copy_from_user(out, buf, sizeof(*out))) return -EFAULT; return 0; case IPC_OLD: { struct msqid_ds tbuf_old; if (copy_from_user(&tbuf_old, buf, sizeof(tbuf_old))) return -EFAULT; out->msg_perm.uid = tbuf_old.msg_perm.uid; out->msg_perm.gid = tbuf_old.msg_perm.gid; out->msg_perm.mode = tbuf_old.msg_perm.mode; if (tbuf_old.msg_qbytes == 0) out->msg_qbytes = tbuf_old.msg_lqbytes; else out->msg_qbytes = tbuf_old.msg_qbytes; return 0; } default: return -EINVAL; } } /* * This function handles some msgctl commands which require the rwsem * to be held in write mode. * NOTE: no locks must be held, the rwsem is taken inside this function. 
*/ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd, struct ipc64_perm *perm, int msg_qbytes) { struct kern_ipc_perm *ipcp; struct msg_queue *msq; int err; down_write(&msg_ids(ns).rwsem); rcu_read_lock(); ipcp = ipcctl_obtain_check(ns, &msg_ids(ns), msqid, cmd, perm, msg_qbytes); if (IS_ERR(ipcp)) { err = PTR_ERR(ipcp); goto out_unlock1; } msq = container_of(ipcp, struct msg_queue, q_perm); err = security_msg_queue_msgctl(&msq->q_perm, cmd); if (err) goto out_unlock1; switch (cmd) { case IPC_RMID: ipc_lock_object(&msq->q_perm); /* freeque unlocks the ipc object and rcu */ freeque(ns, ipcp); goto out_up; case IPC_SET: { DEFINE_WAKE_Q(wake_q); if (msg_qbytes > ns->msg_ctlmnb && !capable(CAP_SYS_RESOURCE)) { err = -EPERM; goto out_unlock1; } ipc_lock_object(&msq->q_perm); err = ipc_update_perm(perm, ipcp); if (err) goto out_unlock0; msq->q_qbytes = msg_qbytes; msq->q_ctime = ktime_get_real_seconds(); /* * Sleeping receivers might be excluded by * stricter permissions. */ expunge_all(msq, -EAGAIN, &wake_q); /* * Sleeping senders might be able to send * due to a larger queue size. */ ss_wakeup(msq, &wake_q, false); ipc_unlock_object(&msq->q_perm); wake_up_q(&wake_q); goto out_unlock1; } default: err = -EINVAL; goto out_unlock1; } out_unlock0: ipc_unlock_object(&msq->q_perm); out_unlock1: rcu_read_unlock(); out_up: up_write(&msg_ids(ns).rwsem); return err; } static int msgctl_info(struct ipc_namespace *ns, int msqid, int cmd, struct msginfo *msginfo) { int err; int max_idx; /* * We must not return kernel stack data. * due to padding, it's not enough * to set all member fields. */ err = security_msg_queue_msgctl(NULL, cmd); if (err) return err; memset(msginfo, 0, sizeof(*msginfo)); msginfo->msgmni = ns->msg_ctlmni; msginfo->msgmax = ns->msg_ctlmax; msginfo->msgmnb = ns->msg_ctlmnb; msginfo->msgssz = MSGSSZ; msginfo->msgseg = MSGSEG; down_read(&msg_ids(ns).rwsem); if (cmd == MSG_INFO) msginfo->msgpool = msg_ids(ns).in_use; max_idx = ipc_get_maxidx(&msg_ids(ns)); up_read(&msg_ids(ns).rwsem); if (cmd == MSG_INFO) { msginfo->msgmap = min_t(int, percpu_counter_sum(&ns->percpu_msg_hdrs), INT_MAX); msginfo->msgtql = min_t(int, percpu_counter_sum(&ns->percpu_msg_bytes), INT_MAX); } else { msginfo->msgmap = MSGMAP; msginfo->msgpool = MSGPOOL; msginfo->msgtql = MSGTQL; } return (max_idx < 0) ? 
0 : max_idx; } static int msgctl_stat(struct ipc_namespace *ns, int msqid, int cmd, struct msqid64_ds *p) { struct msg_queue *msq; int err; memset(p, 0, sizeof(*p)); rcu_read_lock(); if (cmd == MSG_STAT || cmd == MSG_STAT_ANY) { msq = msq_obtain_object(ns, msqid); if (IS_ERR(msq)) { err = PTR_ERR(msq); goto out_unlock; } } else { /* IPC_STAT */ msq = msq_obtain_object_check(ns, msqid); if (IS_ERR(msq)) { err = PTR_ERR(msq); goto out_unlock; } } /* see comment for SHM_STAT_ANY */ if (cmd == MSG_STAT_ANY) audit_ipc_obj(&msq->q_perm); else { err = -EACCES; if (ipcperms(ns, &msq->q_perm, S_IRUGO)) goto out_unlock; } err = security_msg_queue_msgctl(&msq->q_perm, cmd); if (err) goto out_unlock; ipc_lock_object(&msq->q_perm); if (!ipc_valid_object(&msq->q_perm)) { ipc_unlock_object(&msq->q_perm); err = -EIDRM; goto out_unlock; } kernel_to_ipc64_perm(&msq->q_perm, &p->msg_perm); p->msg_stime = msq->q_stime; p->msg_rtime = msq->q_rtime; p->msg_ctime = msq->q_ctime; #ifndef CONFIG_64BIT p->msg_stime_high = msq->q_stime >> 32; p->msg_rtime_high = msq->q_rtime >> 32; p->msg_ctime_high = msq->q_ctime >> 32; #endif p->msg_cbytes = msq->q_cbytes; p->msg_qnum = msq->q_qnum; p->msg_qbytes = msq->q_qbytes; p->msg_lspid = pid_vnr(msq->q_lspid); p->msg_lrpid = pid_vnr(msq->q_lrpid); if (cmd == IPC_STAT) { /* * As defined in SUS: * Return 0 on success */ err = 0; } else { /* * MSG_STAT and MSG_STAT_ANY (both Linux specific) * Return the full id, including the sequence number */ err = msq->q_perm.id; } ipc_unlock_object(&msq->q_perm); out_unlock: rcu_read_unlock(); return err; } static long ksys_msgctl(int msqid, int cmd, struct msqid_ds __user *buf, int version) { struct ipc_namespace *ns; struct msqid64_ds msqid64; int err; if (msqid < 0 || cmd < 0) return -EINVAL; ns = current->nsproxy->ipc_ns; switch (cmd) { case IPC_INFO: case MSG_INFO: { struct msginfo msginfo; err = msgctl_info(ns, msqid, cmd, &msginfo); if (err < 0) return err; if (copy_to_user(buf, &msginfo, sizeof(struct msginfo))) err = -EFAULT; return err; } case MSG_STAT: /* msqid is an index rather than a msg queue id */ case MSG_STAT_ANY: case IPC_STAT: err = msgctl_stat(ns, msqid, cmd, &msqid64); if (err < 0) return err; if (copy_msqid_to_user(buf, &msqid64, version)) err = -EFAULT; return err; case IPC_SET: if (copy_msqid_from_user(&msqid64, buf, version)) return -EFAULT; return msgctl_down(ns, msqid, cmd, &msqid64.msg_perm, msqid64.msg_qbytes); case IPC_RMID: return msgctl_down(ns, msqid, cmd, NULL, 0); default: return -EINVAL; } } SYSCALL_DEFINE3(msgctl, int, msqid, int, cmd, struct msqid_ds __user *, buf) { return ksys_msgctl(msqid, cmd, buf, IPC_64); } #ifdef CONFIG_ARCH_WANT_IPC_PARSE_VERSION long ksys_old_msgctl(int msqid, int cmd, struct msqid_ds __user *buf) { int version = ipc_parse_version(&cmd); return ksys_msgctl(msqid, cmd, buf, version); } SYSCALL_DEFINE3(old_msgctl, int, msqid, int, cmd, struct msqid_ds __user *, buf) { return ksys_old_msgctl(msqid, cmd, buf); } #endif #ifdef CONFIG_COMPAT struct compat_msqid_ds { struct compat_ipc_perm msg_perm; compat_uptr_t msg_first; compat_uptr_t msg_last; old_time32_t msg_stime; old_time32_t msg_rtime; old_time32_t msg_ctime; compat_ulong_t msg_lcbytes; compat_ulong_t msg_lqbytes; unsigned short msg_cbytes; unsigned short msg_qnum; unsigned short msg_qbytes; compat_ipc_pid_t msg_lspid; compat_ipc_pid_t msg_lrpid; }; static int copy_compat_msqid_from_user(struct msqid64_ds *out, void __user *buf, int version) { memset(out, 0, sizeof(*out)); if (version == IPC_64) { struct compat_msqid64_ds 
__user *p = buf; if (get_compat_ipc64_perm(&out->msg_perm, &p->msg_perm)) return -EFAULT; if (get_user(out->msg_qbytes, &p->msg_qbytes)) return -EFAULT; } else { struct compat_msqid_ds __user *p = buf; if (get_compat_ipc_perm(&out->msg_perm, &p->msg_perm)) return -EFAULT; if (get_user(out->msg_qbytes, &p->msg_qbytes)) return -EFAULT; } return 0; } static int copy_compat_msqid_to_user(void __user *buf, struct msqid64_ds *in, int version) { if (version == IPC_64) { struct compat_msqid64_ds v; memset(&v, 0, sizeof(v)); to_compat_ipc64_perm(&v.msg_perm, &in->msg_perm); v.msg_stime = lower_32_bits(in->msg_stime); v.msg_stime_high = upper_32_bits(in->msg_stime); v.msg_rtime = lower_32_bits(in->msg_rtime); v.msg_rtime_high = upper_32_bits(in->msg_rtime); v.msg_ctime = lower_32_bits(in->msg_ctime); v.msg_ctime_high = upper_32_bits(in->msg_ctime); v.msg_cbytes = in->msg_cbytes; v.msg_qnum = in->msg_qnum; v.msg_qbytes = in->msg_qbytes; v.msg_lspid = in->msg_lspid; v.msg_lrpid = in->msg_lrpid; return copy_to_user(buf, &v, sizeof(v)); } else { struct compat_msqid_ds v; memset(&v, 0, sizeof(v)); to_compat_ipc_perm(&v.msg_perm, &in->msg_perm); v.msg_stime = in->msg_stime; v.msg_rtime = in->msg_rtime; v.msg_ctime = in->msg_ctime; v.msg_cbytes = in->msg_cbytes; v.msg_qnum = in->msg_qnum; v.msg_qbytes = in->msg_qbytes; v.msg_lspid = in->msg_lspid; v.msg_lrpid = in->msg_lrpid; return copy_to_user(buf, &v, sizeof(v)); } } static long compat_ksys_msgctl(int msqid, int cmd, void __user *uptr, int version) { struct ipc_namespace *ns; int err; struct msqid64_ds msqid64; ns = current->nsproxy->ipc_ns; if (msqid < 0 || cmd < 0) return -EINVAL; switch (cmd & (~IPC_64)) { case IPC_INFO: case MSG_INFO: { struct msginfo msginfo; err = msgctl_info(ns, msqid, cmd, &msginfo); if (err < 0) return err; if (copy_to_user(uptr, &msginfo, sizeof(struct msginfo))) err = -EFAULT; return err; } case IPC_STAT: case MSG_STAT: case MSG_STAT_ANY: err = msgctl_stat(ns, msqid, cmd, &msqid64); if (err < 0) return err; if (copy_compat_msqid_to_user(uptr, &msqid64, version)) err = -EFAULT; return err; case IPC_SET: if (copy_compat_msqid_from_user(&msqid64, uptr, version)) return -EFAULT; return msgctl_down(ns, msqid, cmd, &msqid64.msg_perm, msqid64.msg_qbytes); case IPC_RMID: return msgctl_down(ns, msqid, cmd, NULL, 0); default: return -EINVAL; } } COMPAT_SYSCALL_DEFINE3(msgctl, int, msqid, int, cmd, void __user *, uptr) { return compat_ksys_msgctl(msqid, cmd, uptr, IPC_64); } #ifdef CONFIG_ARCH_WANT_COMPAT_IPC_PARSE_VERSION long compat_ksys_old_msgctl(int msqid, int cmd, void __user *uptr) { int version = compat_ipc_parse_version(&cmd); return compat_ksys_msgctl(msqid, cmd, uptr, version); } COMPAT_SYSCALL_DEFINE3(old_msgctl, int, msqid, int, cmd, void __user *, uptr) { return compat_ksys_old_msgctl(msqid, cmd, uptr); } #endif #endif static int testmsg(struct msg_msg *msg, long type, int mode) { switch (mode) { case SEARCH_ANY: case SEARCH_NUMBER: return 1; case SEARCH_LESSEQUAL: if (msg->m_type <= type) return 1; break; case SEARCH_EQUAL: if (msg->m_type == type) return 1; break; case SEARCH_NOTEQUAL: if (msg->m_type != type) return 1; break; } return 0; } static inline int pipelined_send(struct msg_queue *msq, struct msg_msg *msg, struct wake_q_head *wake_q) { struct msg_receiver *msr, *t; list_for_each_entry_safe(msr, t, &msq->q_receivers, r_list) { if (testmsg(msg, msr->r_msgtype, msr->r_mode) && !security_msg_queue_msgrcv(&msq->q_perm, msg, msr->r_tsk, msr->r_msgtype, msr->r_mode)) { list_del(&msr->r_list); if (msr->r_maxsize < 
msg->m_ts) { wake_q_add(wake_q, msr->r_tsk); /* See expunge_all regarding memory barrier */ smp_store_release(&msr->r_msg, ERR_PTR(-E2BIG)); } else { ipc_update_pid(&msq->q_lrpid, task_pid(msr->r_tsk)); msq->q_rtime = ktime_get_real_seconds(); wake_q_add(wake_q, msr->r_tsk); /* See expunge_all regarding memory barrier */ smp_store_release(&msr->r_msg, msg); return 1; } } } return 0; } static long do_msgsnd(int msqid, long mtype, void __user *mtext, size_t msgsz, int msgflg) { struct msg_queue *msq; struct msg_msg *msg; int err; struct ipc_namespace *ns; DEFINE_WAKE_Q(wake_q); ns = current->nsproxy->ipc_ns; if (msgsz > ns->msg_ctlmax || (long) msgsz < 0 || msqid < 0) return -EINVAL; if (mtype < 1) return -EINVAL; msg = load_msg(mtext, msgsz); if (IS_ERR(msg)) return PTR_ERR(msg); msg->m_type = mtype; msg->m_ts = msgsz; rcu_read_lock(); msq = msq_obtain_object_check(ns, msqid); if (IS_ERR(msq)) { err = PTR_ERR(msq); goto out_unlock1; } ipc_lock_object(&msq->q_perm); for (;;) { struct msg_sender s; err = -EACCES; if (ipcperms(ns, &msq->q_perm, S_IWUGO)) goto out_unlock0; /* raced with RMID? */ if (!ipc_valid_object(&msq->q_perm)) { err = -EIDRM; goto out_unlock0; } err = security_msg_queue_msgsnd(&msq->q_perm, msg, msgflg); if (err) goto out_unlock0; if (msg_fits_inqueue(msq, msgsz)) break; /* queue full, wait: */ if (msgflg & IPC_NOWAIT) { err = -EAGAIN; goto out_unlock0; } /* enqueue the sender and prepare to block */ ss_add(msq, &s, msgsz); if (!ipc_rcu_getref(&msq->q_perm)) { err = -EIDRM; goto out_unlock0; } ipc_unlock_object(&msq->q_perm); rcu_read_unlock(); schedule(); rcu_read_lock(); ipc_lock_object(&msq->q_perm); ipc_rcu_putref(&msq->q_perm, msg_rcu_free); /* raced with RMID? */ if (!ipc_valid_object(&msq->q_perm)) { err = -EIDRM; goto out_unlock0; } ss_del(&s); if (signal_pending(current)) { err = -ERESTARTNOHAND; goto out_unlock0; } } ipc_update_pid(&msq->q_lspid, task_tgid(current)); msq->q_stime = ktime_get_real_seconds(); if (!pipelined_send(msq, msg, &wake_q)) { /* no one is waiting for this message, enqueue it */ list_add_tail(&msg->m_list, &msq->q_messages); msq->q_cbytes += msgsz; msq->q_qnum++; percpu_counter_add_local(&ns->percpu_msg_bytes, msgsz); percpu_counter_add_local(&ns->percpu_msg_hdrs, 1); } err = 0; msg = NULL; out_unlock0: ipc_unlock_object(&msq->q_perm); wake_up_q(&wake_q); out_unlock1: rcu_read_unlock(); if (msg != NULL) free_msg(msg); return err; } long ksys_msgsnd(int msqid, struct msgbuf __user *msgp, size_t msgsz, int msgflg) { long mtype; if (get_user(mtype, &msgp->mtype)) return -EFAULT; return do_msgsnd(msqid, mtype, msgp->mtext, msgsz, msgflg); } SYSCALL_DEFINE4(msgsnd, int, msqid, struct msgbuf __user *, msgp, size_t, msgsz, int, msgflg) { return ksys_msgsnd(msqid, msgp, msgsz, msgflg); } #ifdef CONFIG_COMPAT struct compat_msgbuf { compat_long_t mtype; char mtext[]; }; long compat_ksys_msgsnd(int msqid, compat_uptr_t msgp, compat_ssize_t msgsz, int msgflg) { struct compat_msgbuf __user *up = compat_ptr(msgp); compat_long_t mtype; if (get_user(mtype, &up->mtype)) return -EFAULT; return do_msgsnd(msqid, mtype, up->mtext, (ssize_t)msgsz, msgflg); } COMPAT_SYSCALL_DEFINE4(msgsnd, int, msqid, compat_uptr_t, msgp, compat_ssize_t, msgsz, int, msgflg) { return compat_ksys_msgsnd(msqid, msgp, msgsz, msgflg); } #endif static inline int convert_mode(long *msgtyp, int msgflg) { if (msgflg & MSG_COPY) return SEARCH_NUMBER; /* * find message of correct type. * msgtyp = 0 => get first. * msgtyp > 0 => get first message of matching type. 
* msgtyp < 0 => get the message with the lowest type that is <= abs(msgtyp). */ if (*msgtyp == 0) return SEARCH_ANY; if (*msgtyp < 0) { if (*msgtyp == LONG_MIN) /* -LONG_MIN is undefined */ *msgtyp = LONG_MAX; else *msgtyp = -*msgtyp; return SEARCH_LESSEQUAL; } if (msgflg & MSG_EXCEPT) return SEARCH_NOTEQUAL; return SEARCH_EQUAL; } static long do_msg_fill(void __user *dest, struct msg_msg *msg, size_t bufsz) { struct msgbuf __user *msgp = dest; size_t msgsz; if (put_user(msg->m_type, &msgp->mtype)) return -EFAULT; msgsz = (bufsz > msg->m_ts) ? msg->m_ts : bufsz; if (store_msg(msgp->mtext, msg, msgsz)) return -EFAULT; return msgsz; } #ifdef CONFIG_CHECKPOINT_RESTORE /* * This function creates a new kernel message structure, large enough to store * bufsz message bytes. */ static inline struct msg_msg *prepare_copy(void __user *buf, size_t bufsz) { struct msg_msg *copy; /* * Create a dummy message to copy the real message to. */ copy = load_msg(buf, bufsz); if (!IS_ERR(copy)) copy->m_ts = bufsz; return copy; } static inline void free_copy(struct msg_msg *copy) { if (copy) free_msg(copy); } #else static inline struct msg_msg *prepare_copy(void __user *buf, size_t bufsz) { return ERR_PTR(-ENOSYS); } static inline void free_copy(struct msg_msg *copy) { } #endif static struct msg_msg *find_msg(struct msg_queue *msq, long *msgtyp, int mode) { struct msg_msg *msg, *found = NULL; long count = 0; list_for_each_entry(msg, &msq->q_messages, m_list) { if (testmsg(msg, *msgtyp, mode) && !security_msg_queue_msgrcv(&msq->q_perm, msg, current, *msgtyp, mode)) { if (mode == SEARCH_LESSEQUAL && msg->m_type != 1) { *msgtyp = msg->m_type - 1; found = msg; } else if (mode == SEARCH_NUMBER) { if (*msgtyp == count) return msg; } else return msg; count++; } } return found ?: ERR_PTR(-EAGAIN); } static long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, int msgflg, long (*msg_handler)(void __user *, struct msg_msg *, size_t)) { int mode; struct msg_queue *msq; struct ipc_namespace *ns; struct msg_msg *msg, *copy = NULL; DEFINE_WAKE_Q(wake_q); ns = current->nsproxy->ipc_ns; if (msqid < 0 || (long) bufsz < 0) return -EINVAL; if (msgflg & MSG_COPY) { if ((msgflg & MSG_EXCEPT) || !(msgflg & IPC_NOWAIT)) return -EINVAL; copy = prepare_copy(buf, min_t(size_t, bufsz, ns->msg_ctlmax)); if (IS_ERR(copy)) return PTR_ERR(copy); } mode = convert_mode(&msgtyp, msgflg); rcu_read_lock(); msq = msq_obtain_object_check(ns, msqid); if (IS_ERR(msq)) { rcu_read_unlock(); free_copy(copy); return PTR_ERR(msq); } for (;;) { struct msg_receiver msr_d; msg = ERR_PTR(-EACCES); if (ipcperms(ns, &msq->q_perm, S_IRUGO)) goto out_unlock1; ipc_lock_object(&msq->q_perm); /* raced with RMID? */ if (!ipc_valid_object(&msq->q_perm)) { msg = ERR_PTR(-EIDRM); goto out_unlock0; } msg = find_msg(msq, &msgtyp, mode); if (!IS_ERR(msg)) { /* * Found a suitable message. * Unlink it from the queue. */ if ((bufsz < msg->m_ts) && !(msgflg & MSG_NOERROR)) { msg = ERR_PTR(-E2BIG); goto out_unlock0; } /* * If we are copying, then do not unlink message and do * not update queue parameters. */ if (msgflg & MSG_COPY) { msg = copy_msg(msg, copy); goto out_unlock0; } list_del(&msg->m_list); msq->q_qnum--; msq->q_rtime = ktime_get_real_seconds(); ipc_update_pid(&msq->q_lrpid, task_tgid(current)); msq->q_cbytes -= msg->m_ts; percpu_counter_sub_local(&ns->percpu_msg_bytes, msg->m_ts); percpu_counter_sub_local(&ns->percpu_msg_hdrs, 1); ss_wakeup(msq, &wake_q, false); goto out_unlock0; } /* No message waiting.
Wait for a message */ if (msgflg & IPC_NOWAIT) { msg = ERR_PTR(-ENOMSG); goto out_unlock0; } list_add_tail(&msr_d.r_list, &msq->q_receivers); msr_d.r_tsk = current; msr_d.r_msgtype = msgtyp; msr_d.r_mode = mode; if (msgflg & MSG_NOERROR) msr_d.r_maxsize = INT_MAX; else msr_d.r_maxsize = bufsz; /* memory barrier not required due to ipc_lock_object() */ WRITE_ONCE(msr_d.r_msg, ERR_PTR(-EAGAIN)); /* memory barrier not required, we own ipc_lock_object() */ __set_current_state(TASK_INTERRUPTIBLE); ipc_unlock_object(&msq->q_perm); rcu_read_unlock(); schedule(); /* * Lockless receive, part 1: * We don't hold a reference to the queue and getting a * reference would defeat the idea of a lockless operation, * thus the code relies on rcu to guarantee the existence of * msq: * Prior to destruction, expunge_all(-EIDRM) changes r_msg. * Thus if r_msg is -EAGAIN, then the queue is not yet destroyed. */ rcu_read_lock(); /* * Lockless receive, part 2: * The work in pipelined_send() and expunge_all(): * - Set pointer to message * - Queue the receiver task for later wakeup * - Wake up the process after the lock is dropped. * * Should the process wake up before this wakeup (due to a * signal) it will either see the message and continue ... */ msg = READ_ONCE(msr_d.r_msg); if (msg != ERR_PTR(-EAGAIN)) { /* see MSG_BARRIER for purpose/pairing */ smp_acquire__after_ctrl_dep(); goto out_unlock1; } /* * ... or see -EAGAIN, acquire the lock to check the message * again. */ ipc_lock_object(&msq->q_perm); msg = READ_ONCE(msr_d.r_msg); if (msg != ERR_PTR(-EAGAIN)) goto out_unlock0; list_del(&msr_d.r_list); if (signal_pending(current)) { msg = ERR_PTR(-ERESTARTNOHAND); goto out_unlock0; } ipc_unlock_object(&msq->q_perm); } out_unlock0: ipc_unlock_object(&msq->q_perm); wake_up_q(&wake_q); out_unlock1: rcu_read_unlock(); if (IS_ERR(msg)) { free_copy(copy); return PTR_ERR(msg); } bufsz = msg_handler(buf, msg, bufsz); free_msg(msg); return bufsz; } long ksys_msgrcv(int msqid, struct msgbuf __user *msgp, size_t msgsz, long msgtyp, int msgflg) { return do_msgrcv(msqid, msgp, msgsz, msgtyp, msgflg, do_msg_fill); } SYSCALL_DEFINE5(msgrcv, int, msqid, struct msgbuf __user *, msgp, size_t, msgsz, long, msgtyp, int, msgflg) { return ksys_msgrcv(msqid, msgp, msgsz, msgtyp, msgflg); } #ifdef CONFIG_COMPAT static long compat_do_msg_fill(void __user *dest, struct msg_msg *msg, size_t bufsz) { struct compat_msgbuf __user *msgp = dest; size_t msgsz; if (put_user(msg->m_type, &msgp->mtype)) return -EFAULT; msgsz = (bufsz > msg->m_ts) ?
msg->m_ts : bufsz; if (store_msg(msgp->mtext, msg, msgsz)) return -EFAULT; return msgsz; } long compat_ksys_msgrcv(int msqid, compat_uptr_t msgp, compat_ssize_t msgsz, compat_long_t msgtyp, int msgflg) { return do_msgrcv(msqid, compat_ptr(msgp), (ssize_t)msgsz, (long)msgtyp, msgflg, compat_do_msg_fill); } COMPAT_SYSCALL_DEFINE5(msgrcv, int, msqid, compat_uptr_t, msgp, compat_ssize_t, msgsz, compat_long_t, msgtyp, int, msgflg) { return compat_ksys_msgrcv(msqid, msgp, msgsz, msgtyp, msgflg); } #endif int msg_init_ns(struct ipc_namespace *ns) { int ret; ns->msg_ctlmax = MSGMAX; ns->msg_ctlmnb = MSGMNB; ns->msg_ctlmni = MSGMNI; ret = percpu_counter_init(&ns->percpu_msg_bytes, 0, GFP_KERNEL); if (ret) goto fail_msg_bytes; ret = percpu_counter_init(&ns->percpu_msg_hdrs, 0, GFP_KERNEL); if (ret) goto fail_msg_hdrs; ipc_init_ids(&ns->ids[IPC_MSG_IDS]); return 0; fail_msg_hdrs: percpu_counter_destroy(&ns->percpu_msg_bytes); fail_msg_bytes: return ret; } #ifdef CONFIG_IPC_NS void msg_exit_ns(struct ipc_namespace *ns) { free_ipcs(ns, &msg_ids(ns), freeque); idr_destroy(&ns->ids[IPC_MSG_IDS].ipcs_idr); rhashtable_destroy(&ns->ids[IPC_MSG_IDS].key_ht); percpu_counter_destroy(&ns->percpu_msg_bytes); percpu_counter_destroy(&ns->percpu_msg_hdrs); } #endif #ifdef CONFIG_PROC_FS static int sysvipc_msg_proc_show(struct seq_file *s, void *it) { struct pid_namespace *pid_ns = ipc_seq_pid_ns(s); struct user_namespace *user_ns = seq_user_ns(s); struct kern_ipc_perm *ipcp = it; struct msg_queue *msq = container_of(ipcp, struct msg_queue, q_perm); seq_printf(s, "%10d %10d %4o %10lu %10lu %5u %5u %5u %5u %5u %5u %10llu %10llu %10llu\n", msq->q_perm.key, msq->q_perm.id, msq->q_perm.mode, msq->q_cbytes, msq->q_qnum, pid_nr_ns(msq->q_lspid, pid_ns), pid_nr_ns(msq->q_lrpid, pid_ns), from_kuid_munged(user_ns, msq->q_perm.uid), from_kgid_munged(user_ns, msq->q_perm.gid), from_kuid_munged(user_ns, msq->q_perm.cuid), from_kgid_munged(user_ns, msq->q_perm.cgid), msq->q_stime, msq->q_rtime, msq->q_ctime); return 0; } #endif void __init msg_init(void) { msg_init_ns(&init_ipc_ns); ipc_init_proc_interface("sysvipc/msg", " key msqid perms cbytes qnum lspid lrpid uid gid cuid cgid stime rtime ctime\n", IPC_MSG_IDS, sysvipc_msg_proc_show); }
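/*
 * Illustrative sketch (not part of the kernel sources above): a minimal
 * userspace program that exercises the paths implemented by do_msgsnd() and
 * do_msgrcv(). The key, message type and text below are arbitrary example
 * values chosen for the demonstration.
 */
#include <stdio.h>
#include <string.h>
#include <sys/ipc.h>
#include <sys/msg.h>

struct example_msg {
	long mtype;		/* must be > 0, see the mtype check in do_msgsnd() */
	char mtext[64];
};

int main(void)
{
	struct example_msg snd = { .mtype = 1 }, rcv;
	int msqid;

	/* IPC_PRIVATE always creates a new queue in the caller's ipc namespace */
	msqid = msgget(IPC_PRIVATE, IPC_CREAT | 0600);
	if (msqid < 0) {
		perror("msgget");
		return 1;
	}

	strcpy(snd.mtext, "hello");
	/* enqueued via do_msgsnd(); only blocks if the queue is full */
	if (msgsnd(msqid, &snd, sizeof(snd.mtext), 0) < 0)
		perror("msgsnd");

	/* msgtyp == 1 selects SEARCH_EQUAL in convert_mode() */
	if (msgrcv(msqid, &rcv, sizeof(rcv.mtext), 1, 0) < 0)
		perror("msgrcv");
	else
		printf("received type %ld: %s\n", rcv.mtype, rcv.mtext);

	/* IPC_RMID takes the freeque() path in msgctl_down() */
	msgctl(msqid, IPC_RMID, NULL);
	return 0;
}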
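/*
 * A second illustrative sketch (again not part of the kernel sources):
 * querying and resizing a queue with msgctl(). IPC_STAT goes through
 * msgctl_stat() and IPC_SET through msgctl_down(); raising msg_qbytes above
 * the msg_ctlmnb limit requires CAP_SYS_RESOURCE, as checked in msgctl_down().
 */
#include <stdio.h>
#include <sys/ipc.h>
#include <sys/msg.h>

int main(void)
{
	struct msqid_ds ds;
	int msqid = msgget(IPC_PRIVATE, IPC_CREAT | 0600);

	if (msqid < 0 || msgctl(msqid, IPC_STAT, &ds) < 0) {
		perror("msgget/IPC_STAT");
		return 1;
	}
	printf("qbytes=%lu qnum=%lu\n",
	       (unsigned long)ds.msg_qbytes, (unsigned long)ds.msg_qnum);

	/* lowering msg_qbytes below msg_ctlmnb needs no CAP_SYS_RESOURCE */
	ds.msg_qbytes /= 2;
	if (msgctl(msqid, IPC_SET, &ds) < 0)
		perror("IPC_SET");

	msgctl(msqid, IPC_RMID, NULL);
	return 0;
}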
/* BlueZ - Bluetooth protocol stack for Linux Copyright (C) 2011 Nokia Corporation and/or its subsidiary(-ies). This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License version 2 as published by the Free Software Foundation; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS SOFTWARE IS DISCLAIMED. */ #include <linux/debugfs.h> #include <linux/scatterlist.h> #include <crypto/aes.h> #include <crypto/hash.h> #include <crypto/kpp.h> #include <crypto/utils.h> #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/hci_core.h> #include <net/bluetooth/l2cap.h> #include <net/bluetooth/mgmt.h> #include "ecdh_helper.h" #include "smp.h" #define SMP_DEV(hdev) \ ((struct smp_dev *)((struct l2cap_chan *)((hdev)->smp_data))->data) /* Low-level debug macros to be used for stuff that we don't want * accidentally in dmesg, i.e. the values of the various crypto keys * and the inputs & outputs of crypto functions. */ #ifdef DEBUG #define SMP_DBG(fmt, ...) printk(KERN_DEBUG "%s: " fmt, __func__, \ ##__VA_ARGS__) #else #define SMP_DBG(fmt, ...) no_printk(KERN_DEBUG "%s: " fmt, __func__, \ ##__VA_ARGS__) #endif #define SMP_ALLOW_CMD(smp, code) set_bit(code, &smp->allow_cmd) /* Keys which are not distributed with Secure Connections */ #define SMP_SC_NO_DIST (SMP_DIST_ENC_KEY | SMP_DIST_LINK_KEY) #define SMP_TIMEOUT msecs_to_jiffies(30000) #define ID_ADDR_TIMEOUT msecs_to_jiffies(200) #define AUTH_REQ_MASK(dev) (hci_dev_test_flag(dev, HCI_SC_ENABLED) ?
\ 0x3f : 0x07) #define KEY_DIST_MASK 0x07 /* Maximum message length that can be passed to aes_cmac */ #define CMAC_MSG_MAX 80 enum { SMP_FLAG_TK_VALID, SMP_FLAG_CFM_PENDING, SMP_FLAG_MITM_AUTH, SMP_FLAG_COMPLETE, SMP_FLAG_INITIATOR, SMP_FLAG_SC, SMP_FLAG_REMOTE_PK, SMP_FLAG_DEBUG_KEY, SMP_FLAG_WAIT_USER, SMP_FLAG_DHKEY_PENDING, SMP_FLAG_REMOTE_OOB, SMP_FLAG_LOCAL_OOB, SMP_FLAG_CT2, }; struct smp_dev { /* Secure Connections OOB data */ bool local_oob; u8 local_pk[64]; u8 local_rand[16]; bool debug_key; struct crypto_shash *tfm_cmac; struct crypto_kpp *tfm_ecdh; }; struct smp_chan { struct l2cap_conn *conn; struct delayed_work security_timer; unsigned long allow_cmd; /* Bitmask of allowed commands */ u8 preq[7]; /* SMP Pairing Request */ u8 prsp[7]; /* SMP Pairing Response */ u8 prnd[16]; /* SMP Pairing Random (local) */ u8 rrnd[16]; /* SMP Pairing Random (remote) */ u8 pcnf[16]; /* SMP Pairing Confirm */ u8 tk[16]; /* SMP Temporary Key */ u8 rr[16]; /* Remote OOB ra/rb value */ u8 lr[16]; /* Local OOB ra/rb value */ u8 enc_key_size; u8 remote_key_dist; bdaddr_t id_addr; u8 id_addr_type; u8 irk[16]; struct smp_csrk *csrk; struct smp_csrk *responder_csrk; struct smp_ltk *ltk; struct smp_ltk *responder_ltk; struct smp_irk *remote_irk; u8 *link_key; unsigned long flags; u8 method; u8 passkey_round; /* Secure Connections variables */ u8 local_pk[64]; u8 remote_pk[64]; u8 dhkey[32]; u8 mackey[16]; struct crypto_shash *tfm_cmac; struct crypto_kpp *tfm_ecdh; }; /* These debug key values are defined in the SMP section of the core * specification. debug_pk is the public debug key and debug_sk the * private debug key. */ static const u8 debug_pk[64] = { 0xe6, 0x9d, 0x35, 0x0e, 0x48, 0x01, 0x03, 0xcc, 0xdb, 0xfd, 0xf4, 0xac, 0x11, 0x91, 0xf4, 0xef, 0xb9, 0xa5, 0xf9, 0xe9, 0xa7, 0x83, 0x2c, 0x5e, 0x2c, 0xbe, 0x97, 0xf2, 0xd2, 0x03, 0xb0, 0x20, 0x8b, 0xd2, 0x89, 0x15, 0xd0, 0x8e, 0x1c, 0x74, 0x24, 0x30, 0xed, 0x8f, 0xc2, 0x45, 0x63, 0x76, 0x5c, 0x15, 0x52, 0x5a, 0xbf, 0x9a, 0x32, 0x63, 0x6d, 0xeb, 0x2a, 0x65, 0x49, 0x9c, 0x80, 0xdc, }; static const u8 debug_sk[32] = { 0xbd, 0x1a, 0x3c, 0xcd, 0xa6, 0xb8, 0x99, 0x58, 0x99, 0xb7, 0x40, 0xeb, 0x7b, 0x60, 0xff, 0x4a, 0x50, 0x3f, 0x10, 0xd2, 0xe3, 0xb3, 0xc9, 0x74, 0x38, 0x5f, 0xc5, 0xa3, 0xd4, 0xf6, 0x49, 0x3f, }; static inline void swap_buf(const u8 *src, u8 *dst, size_t len) { size_t i; for (i = 0; i < len; i++) dst[len - 1 - i] = src[i]; } /* The following functions map to the LE SC SMP crypto functions * AES-CMAC, f4, f5, f6, g2 and h6. 
*/ static int aes_cmac(struct crypto_shash *tfm, const u8 k[16], const u8 *m, size_t len, u8 mac[16]) { uint8_t tmp[16], mac_msb[16], msg_msb[CMAC_MSG_MAX]; int err; if (len > CMAC_MSG_MAX) return -EFBIG; if (!tfm) { BT_ERR("tfm %p", tfm); return -EINVAL; } /* Swap key and message from LSB to MSB */ swap_buf(k, tmp, 16); swap_buf(m, msg_msb, len); SMP_DBG("msg (len %zu) %*phN", len, (int) len, m); SMP_DBG("key %16phN", k); err = crypto_shash_setkey(tfm, tmp, 16); if (err) { BT_ERR("cipher setkey failed: %d", err); return err; } err = crypto_shash_tfm_digest(tfm, msg_msb, len, mac_msb); if (err) { BT_ERR("Hash computation error %d", err); return err; } swap_buf(mac_msb, mac, 16); SMP_DBG("mac %16phN", mac); return 0; } static int smp_f4(struct crypto_shash *tfm_cmac, const u8 u[32], const u8 v[32], const u8 x[16], u8 z, u8 res[16]) { u8 m[65]; int err; SMP_DBG("u %32phN", u); SMP_DBG("v %32phN", v); SMP_DBG("x %16phN z %02x", x, z); m[0] = z; memcpy(m + 1, v, 32); memcpy(m + 33, u, 32); err = aes_cmac(tfm_cmac, x, m, sizeof(m), res); if (err) return err; SMP_DBG("res %16phN", res); return err; } static int smp_f5(struct crypto_shash *tfm_cmac, const u8 w[32], const u8 n1[16], const u8 n2[16], const u8 a1[7], const u8 a2[7], u8 mackey[16], u8 ltk[16]) { /* The btle, salt and length "magic" values are as defined in * the SMP section of the Bluetooth core specification. In ASCII * the btle value ends up being 'btle'. The salt is just a * random number whereas length is the value 256 in little * endian format. */ const u8 btle[4] = { 0x65, 0x6c, 0x74, 0x62 }; const u8 salt[16] = { 0xbe, 0x83, 0x60, 0x5a, 0xdb, 0x0b, 0x37, 0x60, 0x38, 0xa5, 0xf5, 0xaa, 0x91, 0x83, 0x88, 0x6c }; const u8 length[2] = { 0x00, 0x01 }; u8 m[53], t[16]; int err; SMP_DBG("w %32phN", w); SMP_DBG("n1 %16phN n2 %16phN", n1, n2); SMP_DBG("a1 %7phN a2 %7phN", a1, a2); err = aes_cmac(tfm_cmac, salt, w, 32, t); if (err) return err; SMP_DBG("t %16phN", t); memcpy(m, length, 2); memcpy(m + 2, a2, 7); memcpy(m + 9, a1, 7); memcpy(m + 16, n2, 16); memcpy(m + 32, n1, 16); memcpy(m + 48, btle, 4); m[52] = 0; /* Counter */ err = aes_cmac(tfm_cmac, t, m, sizeof(m), mackey); if (err) return err; SMP_DBG("mackey %16phN", mackey); m[52] = 1; /* Counter */ err = aes_cmac(tfm_cmac, t, m, sizeof(m), ltk); if (err) return err; SMP_DBG("ltk %16phN", ltk); return 0; } static int smp_f6(struct crypto_shash *tfm_cmac, const u8 w[16], const u8 n1[16], const u8 n2[16], const u8 r[16], const u8 io_cap[3], const u8 a1[7], const u8 a2[7], u8 res[16]) { u8 m[65]; int err; SMP_DBG("w %16phN", w); SMP_DBG("n1 %16phN n2 %16phN", n1, n2); SMP_DBG("r %16phN io_cap %3phN a1 %7phN a2 %7phN", r, io_cap, a1, a2); memcpy(m, a2, 7); memcpy(m + 7, a1, 7); memcpy(m + 14, io_cap, 3); memcpy(m + 17, r, 16); memcpy(m + 33, n2, 16); memcpy(m + 49, n1, 16); err = aes_cmac(tfm_cmac, w, m, sizeof(m), res); if (err) return err; SMP_DBG("res %16phN", res); return err; } static int smp_g2(struct crypto_shash *tfm_cmac, const u8 u[32], const u8 v[32], const u8 x[16], const u8 y[16], u32 *val) { u8 m[80], tmp[16]; int err; SMP_DBG("u %32phN", u); SMP_DBG("v %32phN", v); SMP_DBG("x %16phN y %16phN", x, y); memcpy(m, y, 16); memcpy(m + 16, v, 32); memcpy(m + 48, u, 32); err = aes_cmac(tfm_cmac, x, m, sizeof(m), tmp); if (err) return err; *val = get_unaligned_le32(tmp); *val %= 1000000; SMP_DBG("val %06u", *val); return 0; } static int smp_h6(struct crypto_shash *tfm_cmac, const u8 w[16], const u8 key_id[4], u8 res[16]) { int err; SMP_DBG("w %16phN key_id %4phN", w, key_id); 
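/* h6(W, keyID) as defined in the core spec is simply AES-CMAC keyed with W
 * over the 4-byte keyID; sc_generate_link_key() and sc_generate_ltk() below
 * use it to convert between an LE LTK and a BR/EDR link key.
 */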
err = aes_cmac(tfm_cmac, w, key_id, 4, res); if (err) return err; SMP_DBG("res %16phN", res); return err; } static int smp_h7(struct crypto_shash *tfm_cmac, const u8 w[16], const u8 salt[16], u8 res[16]) { int err; SMP_DBG("w %16phN salt %16phN", w, salt); err = aes_cmac(tfm_cmac, salt, w, 16, res); if (err) return err; SMP_DBG("res %16phN", res); return err; } /* The following functions map to the legacy SMP crypto functions e, c1, * s1 and ah. */ static int smp_e(const u8 *k, u8 *r) { struct crypto_aes_ctx ctx; uint8_t tmp[16], data[16]; int err; SMP_DBG("k %16phN r %16phN", k, r); /* The most significant octet of key corresponds to k[0] */ swap_buf(k, tmp, 16); err = aes_expandkey(&ctx, tmp, 16); if (err) { BT_ERR("cipher setkey failed: %d", err); return err; } /* Most significant octet of plaintextData corresponds to data[0] */ swap_buf(r, data, 16); aes_encrypt(&ctx, data, data); /* Most significant octet of encryptedData corresponds to data[0] */ swap_buf(data, r, 16); SMP_DBG("r %16phN", r); memzero_explicit(&ctx, sizeof(ctx)); return err; } static int smp_c1(const u8 k[16], const u8 r[16], const u8 preq[7], const u8 pres[7], u8 _iat, const bdaddr_t *ia, u8 _rat, const bdaddr_t *ra, u8 res[16]) { u8 p1[16], p2[16]; int err; SMP_DBG("k %16phN r %16phN", k, r); SMP_DBG("iat %u ia %6phN rat %u ra %6phN", _iat, ia, _rat, ra); SMP_DBG("preq %7phN pres %7phN", preq, pres); memset(p1, 0, 16); /* p1 = pres || preq || _rat || _iat */ p1[0] = _iat; p1[1] = _rat; memcpy(p1 + 2, preq, 7); memcpy(p1 + 9, pres, 7); SMP_DBG("p1 %16phN", p1); /* res = r XOR p1 */ crypto_xor_cpy(res, r, p1, sizeof(p1)); /* res = e(k, res) */ err = smp_e(k, res); if (err) { BT_ERR("Encrypt data error"); return err; } /* p2 = padding || ia || ra */ memcpy(p2, ra, 6); memcpy(p2 + 6, ia, 6); memset(p2 + 12, 0, 4); SMP_DBG("p2 %16phN", p2); /* res = res XOR p2 */ crypto_xor(res, p2, sizeof(p2)); /* res = e(k, res) */ err = smp_e(k, res); if (err) BT_ERR("Encrypt data error"); return err; } static int smp_s1(const u8 k[16], const u8 r1[16], const u8 r2[16], u8 _r[16]) { int err; /* Just least significant octets from r1 and r2 are considered */ memcpy(_r, r2, 8); memcpy(_r + 8, r1, 8); err = smp_e(k, _r); if (err) BT_ERR("Encrypt data error"); return err; } static int smp_ah(const u8 irk[16], const u8 r[3], u8 res[3]) { u8 _res[16]; int err; /* r' = padding || r */ memcpy(_res, r, 3); memset(_res + 3, 0, 13); err = smp_e(irk, _res); if (err) { BT_ERR("Encrypt error"); return err; } /* The output of the random address function ah is: * ah(k, r) = e(k, r') mod 2^24 * The output of the security function e is then truncated to 24 bits * by taking the least significant 24 bits of the output of e as the * result of ah. 
*/ memcpy(res, _res, 3); return 0; } bool smp_irk_matches(struct hci_dev *hdev, const u8 irk[16], const bdaddr_t *bdaddr) { struct l2cap_chan *chan = hdev->smp_data; u8 hash[3]; int err; if (!chan || !chan->data) return false; bt_dev_dbg(hdev, "RPA %pMR IRK %*phN", bdaddr, 16, irk); err = smp_ah(irk, &bdaddr->b[3], hash); if (err) return false; return !crypto_memneq(bdaddr->b, hash, 3); } int smp_generate_rpa(struct hci_dev *hdev, const u8 irk[16], bdaddr_t *rpa) { struct l2cap_chan *chan = hdev->smp_data; int err; if (!chan || !chan->data) return -EOPNOTSUPP; get_random_bytes(&rpa->b[3], 3); rpa->b[5] &= 0x3f; /* Clear two most significant bits */ rpa->b[5] |= 0x40; /* Set second most significant bit */ err = smp_ah(irk, &rpa->b[3], rpa->b); if (err < 0) return err; bt_dev_dbg(hdev, "RPA %pMR", rpa); return 0; } int smp_generate_oob(struct hci_dev *hdev, u8 hash[16], u8 rand[16]) { struct l2cap_chan *chan = hdev->smp_data; struct smp_dev *smp; int err; if (!chan || !chan->data) return -EOPNOTSUPP; smp = chan->data; if (hci_dev_test_flag(hdev, HCI_USE_DEBUG_KEYS)) { bt_dev_dbg(hdev, "Using debug keys"); err = set_ecdh_privkey(smp->tfm_ecdh, debug_sk); if (err) return err; memcpy(smp->local_pk, debug_pk, 64); smp->debug_key = true; } else { while (true) { /* Generate key pair for Secure Connections */ err = generate_ecdh_keys(smp->tfm_ecdh, smp->local_pk); if (err) return err; /* This is unlikely, but we need to check that * we didn't accidentally generate a debug key. */ if (crypto_memneq(smp->local_pk, debug_pk, 64)) break; } smp->debug_key = false; } SMP_DBG("OOB Public Key X: %32phN", smp->local_pk); SMP_DBG("OOB Public Key Y: %32phN", smp->local_pk + 32); get_random_bytes(smp->local_rand, 16); err = smp_f4(smp->tfm_cmac, smp->local_pk, smp->local_pk, smp->local_rand, 0, hash); if (err < 0) return err; memcpy(rand, smp->local_rand, 16); smp->local_oob = true; return 0; } static void smp_send_cmd(struct l2cap_conn *conn, u8 code, u16 len, void *data) { struct l2cap_chan *chan = conn->smp; struct smp_chan *smp; struct kvec iv[2]; struct msghdr msg; if (!chan) return; bt_dev_dbg(conn->hcon->hdev, "code 0x%2.2x", code); iv[0].iov_base = &code; iv[0].iov_len = 1; iv[1].iov_base = data; iv[1].iov_len = len; memset(&msg, 0, sizeof(msg)); iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, iv, 2, 1 + len); l2cap_chan_send(chan, &msg, 1 + len); if (!chan->data) return; smp = chan->data; cancel_delayed_work_sync(&smp->security_timer); schedule_delayed_work(&smp->security_timer, SMP_TIMEOUT); } static u8 authreq_to_seclevel(u8 authreq) { if (authreq & SMP_AUTH_MITM) { if (authreq & SMP_AUTH_SC) return BT_SECURITY_FIPS; else return BT_SECURITY_HIGH; } else { return BT_SECURITY_MEDIUM; } } static __u8 seclevel_to_authreq(__u8 sec_level) { switch (sec_level) { case BT_SECURITY_FIPS: case BT_SECURITY_HIGH: return SMP_AUTH_MITM | SMP_AUTH_BONDING; case BT_SECURITY_MEDIUM: return SMP_AUTH_BONDING; default: return SMP_AUTH_NONE; } } static void build_pairing_cmd(struct l2cap_conn *conn, struct smp_cmd_pairing *req, struct smp_cmd_pairing *rsp, __u8 authreq) { struct l2cap_chan *chan = conn->smp; struct smp_chan *smp = chan->data; struct hci_conn *hcon = conn->hcon; struct hci_dev *hdev = hcon->hdev; u8 local_dist = 0, remote_dist = 0, oob_flag = SMP_OOB_NOT_PRESENT; if (hci_dev_test_flag(hdev, HCI_BONDABLE)) { local_dist = SMP_DIST_ENC_KEY | SMP_DIST_SIGN; remote_dist = SMP_DIST_ENC_KEY | SMP_DIST_SIGN; authreq |= SMP_AUTH_BONDING; } else { authreq &= ~SMP_AUTH_BONDING; } if (hci_dev_test_flag(hdev, 
HCI_RPA_RESOLVING)) remote_dist |= SMP_DIST_ID_KEY; if (hci_dev_test_flag(hdev, HCI_PRIVACY)) local_dist |= SMP_DIST_ID_KEY; if (hci_dev_test_flag(hdev, HCI_SC_ENABLED) && (authreq & SMP_AUTH_SC)) { struct oob_data *oob_data; u8 bdaddr_type; if (hci_dev_test_flag(hdev, HCI_SSP_ENABLED)) { local_dist |= SMP_DIST_LINK_KEY; remote_dist |= SMP_DIST_LINK_KEY; } if (hcon->dst_type == ADDR_LE_DEV_PUBLIC) bdaddr_type = BDADDR_LE_PUBLIC; else bdaddr_type = BDADDR_LE_RANDOM; oob_data = hci_find_remote_oob_data(hdev, &hcon->dst, bdaddr_type); if (oob_data && oob_data->present) { set_bit(SMP_FLAG_REMOTE_OOB, &smp->flags); oob_flag = SMP_OOB_PRESENT; memcpy(smp->rr, oob_data->rand256, 16); memcpy(smp->pcnf, oob_data->hash256, 16); SMP_DBG("OOB Remote Confirmation: %16phN", smp->pcnf); SMP_DBG("OOB Remote Random: %16phN", smp->rr); } } else { authreq &= ~SMP_AUTH_SC; } if (rsp == NULL) { req->io_capability = conn->hcon->io_capability; req->oob_flag = oob_flag; req->max_key_size = hdev->le_max_key_size; req->init_key_dist = local_dist; req->resp_key_dist = remote_dist; req->auth_req = (authreq & AUTH_REQ_MASK(hdev)); smp->remote_key_dist = remote_dist; return; } rsp->io_capability = conn->hcon->io_capability; rsp->oob_flag = oob_flag; rsp->max_key_size = hdev->le_max_key_size; rsp->init_key_dist = req->init_key_dist & remote_dist; rsp->resp_key_dist = req->resp_key_dist & local_dist; rsp->auth_req = (authreq & AUTH_REQ_MASK(hdev)); smp->remote_key_dist = rsp->init_key_dist; } static u8 check_enc_key_size(struct l2cap_conn *conn, __u8 max_key_size) { struct l2cap_chan *chan = conn->smp; struct hci_dev *hdev = conn->hcon->hdev; struct smp_chan *smp = chan->data; if (conn->hcon->pending_sec_level == BT_SECURITY_FIPS && max_key_size != SMP_MAX_ENC_KEY_SIZE) return SMP_ENC_KEY_SIZE; if (max_key_size > hdev->le_max_key_size || max_key_size < SMP_MIN_ENC_KEY_SIZE) return SMP_ENC_KEY_SIZE; smp->enc_key_size = max_key_size; return 0; } static void smp_chan_destroy(struct l2cap_conn *conn) { struct l2cap_chan *chan = conn->smp; struct smp_chan *smp = chan->data; struct hci_conn *hcon = conn->hcon; bool complete; BUG_ON(!smp); cancel_delayed_work_sync(&smp->security_timer); complete = test_bit(SMP_FLAG_COMPLETE, &smp->flags); mgmt_smp_complete(hcon, complete); kfree_sensitive(smp->csrk); kfree_sensitive(smp->responder_csrk); kfree_sensitive(smp->link_key); crypto_free_shash(smp->tfm_cmac); crypto_free_kpp(smp->tfm_ecdh); /* Ensure that we don't leave any debug key around if debug key * support hasn't been explicitly enabled. 
*/ if (smp->ltk && smp->ltk->type == SMP_LTK_P256_DEBUG && !hci_dev_test_flag(hcon->hdev, HCI_KEEP_DEBUG_KEYS)) { list_del_rcu(&smp->ltk->list); kfree_rcu(smp->ltk, rcu); smp->ltk = NULL; } /* If pairing failed clean up any keys we might have */ if (!complete) { if (smp->ltk) { list_del_rcu(&smp->ltk->list); kfree_rcu(smp->ltk, rcu); } if (smp->responder_ltk) { list_del_rcu(&smp->responder_ltk->list); kfree_rcu(smp->responder_ltk, rcu); } if (smp->remote_irk) { list_del_rcu(&smp->remote_irk->list); kfree_rcu(smp->remote_irk, rcu); } } chan->data = NULL; kfree_sensitive(smp); hci_conn_drop(hcon); } static void smp_failure(struct l2cap_conn *conn, u8 reason) { struct hci_conn *hcon = conn->hcon; struct l2cap_chan *chan = conn->smp; if (reason) smp_send_cmd(conn, SMP_CMD_PAIRING_FAIL, sizeof(reason), &reason); mgmt_auth_failed(hcon, HCI_ERROR_AUTH_FAILURE); if (chan->data) smp_chan_destroy(conn); } #define JUST_WORKS 0x00 #define JUST_CFM 0x01 #define REQ_PASSKEY 0x02 #define CFM_PASSKEY 0x03 #define REQ_OOB 0x04 #define DSP_PASSKEY 0x05 #define OVERLAP 0xFF static const u8 gen_method[5][5] = { { JUST_WORKS, JUST_CFM, REQ_PASSKEY, JUST_WORKS, REQ_PASSKEY }, { JUST_WORKS, JUST_CFM, REQ_PASSKEY, JUST_WORKS, REQ_PASSKEY }, { CFM_PASSKEY, CFM_PASSKEY, REQ_PASSKEY, JUST_WORKS, CFM_PASSKEY }, { JUST_WORKS, JUST_CFM, JUST_WORKS, JUST_WORKS, JUST_CFM }, { CFM_PASSKEY, CFM_PASSKEY, REQ_PASSKEY, JUST_WORKS, OVERLAP }, }; static const u8 sc_method[5][5] = { { JUST_WORKS, JUST_CFM, REQ_PASSKEY, JUST_WORKS, REQ_PASSKEY }, { JUST_WORKS, CFM_PASSKEY, REQ_PASSKEY, JUST_WORKS, CFM_PASSKEY }, { DSP_PASSKEY, DSP_PASSKEY, REQ_PASSKEY, JUST_WORKS, DSP_PASSKEY }, { JUST_WORKS, JUST_CFM, JUST_WORKS, JUST_WORKS, JUST_CFM }, { DSP_PASSKEY, CFM_PASSKEY, REQ_PASSKEY, JUST_WORKS, CFM_PASSKEY }, }; static u8 get_auth_method(struct smp_chan *smp, u8 local_io, u8 remote_io) { /* If either side has unknown io_caps, use JUST_CFM (which gets * converted later to JUST_WORKS if we're initiators. */ if (local_io > SMP_IO_KEYBOARD_DISPLAY || remote_io > SMP_IO_KEYBOARD_DISPLAY) return JUST_CFM; if (test_bit(SMP_FLAG_SC, &smp->flags)) return sc_method[remote_io][local_io]; return gen_method[remote_io][local_io]; } static int tk_request(struct l2cap_conn *conn, u8 remote_oob, u8 auth, u8 local_io, u8 remote_io) { struct hci_conn *hcon = conn->hcon; struct l2cap_chan *chan = conn->smp; struct smp_chan *smp = chan->data; u32 passkey = 0; int ret; /* Initialize key for JUST WORKS */ memset(smp->tk, 0, sizeof(smp->tk)); clear_bit(SMP_FLAG_TK_VALID, &smp->flags); bt_dev_dbg(hcon->hdev, "auth:%u lcl:%u rem:%u", auth, local_io, remote_io); /* If neither side wants MITM, either "just" confirm an incoming * request or use just-works for outgoing ones. The JUST_CFM * will be converted to JUST_WORKS if necessary later in this * function. If either side has MITM look up the method from the * table. 
*/ if (!(auth & SMP_AUTH_MITM)) smp->method = JUST_CFM; else smp->method = get_auth_method(smp, local_io, remote_io); /* Don't confirm locally initiated pairing attempts */ if (smp->method == JUST_CFM && test_bit(SMP_FLAG_INITIATOR, &smp->flags)) smp->method = JUST_WORKS; /* Don't bother user space with no IO capabilities */ if (smp->method == JUST_CFM && hcon->io_capability == HCI_IO_NO_INPUT_OUTPUT) smp->method = JUST_WORKS; /* If Just Works, Continue with Zero TK and ask user-space for * confirmation */ if (smp->method == JUST_WORKS) { ret = mgmt_user_confirm_request(hcon->hdev, &hcon->dst, hcon->type, hcon->dst_type, passkey, 1); if (ret) return ret; set_bit(SMP_FLAG_WAIT_USER, &smp->flags); return 0; } /* If this function is used for SC -> legacy fallback we * can only recover the just-works case. */ if (test_bit(SMP_FLAG_SC, &smp->flags)) return -EINVAL; /* Not Just Works/Confirm results in MITM Authentication */ if (smp->method != JUST_CFM) { set_bit(SMP_FLAG_MITM_AUTH, &smp->flags); if (hcon->pending_sec_level < BT_SECURITY_HIGH) hcon->pending_sec_level = BT_SECURITY_HIGH; } /* If both devices have Keyboard-Display I/O, the initiator * Confirms and the responder Enters the passkey. */ if (smp->method == OVERLAP) { if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) smp->method = CFM_PASSKEY; else smp->method = REQ_PASSKEY; } /* Generate random passkey. */ if (smp->method == CFM_PASSKEY) { memset(smp->tk, 0, sizeof(smp->tk)); get_random_bytes(&passkey, sizeof(passkey)); passkey %= 1000000; put_unaligned_le32(passkey, smp->tk); bt_dev_dbg(hcon->hdev, "PassKey: %u", passkey); set_bit(SMP_FLAG_TK_VALID, &smp->flags); } if (smp->method == REQ_PASSKEY) ret = mgmt_user_passkey_request(hcon->hdev, &hcon->dst, hcon->type, hcon->dst_type); else if (smp->method == JUST_CFM) ret = mgmt_user_confirm_request(hcon->hdev, &hcon->dst, hcon->type, hcon->dst_type, passkey, 1); else ret = mgmt_user_passkey_notify(hcon->hdev, &hcon->dst, hcon->type, hcon->dst_type, passkey, 0); return ret; } static u8 smp_confirm(struct smp_chan *smp) { struct l2cap_conn *conn = smp->conn; struct smp_cmd_pairing_confirm cp; int ret; bt_dev_dbg(conn->hcon->hdev, "conn %p", conn); ret = smp_c1(smp->tk, smp->prnd, smp->preq, smp->prsp, conn->hcon->init_addr_type, &conn->hcon->init_addr, conn->hcon->resp_addr_type, &conn->hcon->resp_addr, cp.confirm_val); if (ret) return SMP_UNSPECIFIED; clear_bit(SMP_FLAG_CFM_PENDING, &smp->flags); smp_send_cmd(smp->conn, SMP_CMD_PAIRING_CONFIRM, sizeof(cp), &cp); if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_CONFIRM); else SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_RANDOM); return 0; } static u8 smp_random(struct smp_chan *smp) { struct l2cap_conn *conn = smp->conn; struct hci_conn *hcon = conn->hcon; u8 confirm[16]; int ret; bt_dev_dbg(conn->hcon->hdev, "conn %p %s", conn, test_bit(SMP_FLAG_INITIATOR, &smp->flags) ? 
"initiator" : "responder"); ret = smp_c1(smp->tk, smp->rrnd, smp->preq, smp->prsp, hcon->init_addr_type, &hcon->init_addr, hcon->resp_addr_type, &hcon->resp_addr, confirm); if (ret) return SMP_UNSPECIFIED; if (crypto_memneq(smp->pcnf, confirm, sizeof(smp->pcnf))) { bt_dev_err(hcon->hdev, "pairing failed " "(confirmation values mismatch)"); return SMP_CONFIRM_FAILED; } if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { u8 stk[16]; __le64 rand = 0; __le16 ediv = 0; smp_s1(smp->tk, smp->rrnd, smp->prnd, stk); if (test_and_set_bit(HCI_CONN_ENCRYPT_PEND, &hcon->flags)) return SMP_UNSPECIFIED; hci_le_start_enc(hcon, ediv, rand, stk, smp->enc_key_size); hcon->enc_key_size = smp->enc_key_size; set_bit(HCI_CONN_STK_ENCRYPT, &hcon->flags); } else { u8 stk[16], auth; __le64 rand = 0; __le16 ediv = 0; smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM, sizeof(smp->prnd), smp->prnd); smp_s1(smp->tk, smp->prnd, smp->rrnd, stk); if (hcon->pending_sec_level == BT_SECURITY_HIGH) auth = 1; else auth = 0; /* Even though there's no _RESPONDER suffix this is the * responder STK we're adding for later lookup (the initiator * STK never needs to be stored). */ hci_add_ltk(hcon->hdev, &hcon->dst, hcon->dst_type, SMP_STK, auth, stk, smp->enc_key_size, ediv, rand); } return 0; } static void smp_notify_keys(struct l2cap_conn *conn) { struct l2cap_chan *chan = conn->smp; struct smp_chan *smp = chan->data; struct hci_conn *hcon = conn->hcon; struct hci_dev *hdev = hcon->hdev; struct smp_cmd_pairing *req = (void *) &smp->preq[1]; struct smp_cmd_pairing *rsp = (void *) &smp->prsp[1]; bool persistent; if (hcon->type == ACL_LINK) { if (hcon->key_type == HCI_LK_DEBUG_COMBINATION) persistent = false; else persistent = !test_bit(HCI_CONN_FLUSH_KEY, &hcon->flags); } else { /* The LTKs, IRKs and CSRKs should be persistent only if * both sides had the bonding bit set in their * authentication requests. */ persistent = !!((req->auth_req & rsp->auth_req) & SMP_AUTH_BONDING); } if (smp->remote_irk) { mgmt_new_irk(hdev, smp->remote_irk, persistent); /* Now that user space can be considered to know the * identity address track the connection based on it * from now on (assuming this is an LE link). */ if (hcon->type == LE_LINK) { bacpy(&hcon->dst, &smp->remote_irk->bdaddr); hcon->dst_type = smp->remote_irk->addr_type; /* Use a short delay to make sure the new address is * propagated _before_ the channels. 
*/ queue_delayed_work(hdev->workqueue, &conn->id_addr_timer, ID_ADDR_TIMEOUT); } } if (smp->csrk) { smp->csrk->bdaddr_type = hcon->dst_type; bacpy(&smp->csrk->bdaddr, &hcon->dst); mgmt_new_csrk(hdev, smp->csrk, persistent); } if (smp->responder_csrk) { smp->responder_csrk->bdaddr_type = hcon->dst_type; bacpy(&smp->responder_csrk->bdaddr, &hcon->dst); mgmt_new_csrk(hdev, smp->responder_csrk, persistent); } if (smp->ltk) { smp->ltk->bdaddr_type = hcon->dst_type; bacpy(&smp->ltk->bdaddr, &hcon->dst); mgmt_new_ltk(hdev, smp->ltk, persistent); } if (smp->responder_ltk) { smp->responder_ltk->bdaddr_type = hcon->dst_type; bacpy(&smp->responder_ltk->bdaddr, &hcon->dst); mgmt_new_ltk(hdev, smp->responder_ltk, persistent); } if (smp->link_key) { struct link_key *key; u8 type; if (test_bit(SMP_FLAG_DEBUG_KEY, &smp->flags)) type = HCI_LK_DEBUG_COMBINATION; else if (hcon->sec_level == BT_SECURITY_FIPS) type = HCI_LK_AUTH_COMBINATION_P256; else type = HCI_LK_UNAUTH_COMBINATION_P256; key = hci_add_link_key(hdev, smp->conn->hcon, &hcon->dst, smp->link_key, type, 0, &persistent); if (key) { mgmt_new_link_key(hdev, key, persistent); /* Don't keep debug keys around if the relevant * flag is not set. */ if (!hci_dev_test_flag(hdev, HCI_KEEP_DEBUG_KEYS) && key->type == HCI_LK_DEBUG_COMBINATION) { list_del_rcu(&key->list); kfree_rcu(key, rcu); } } } } static void sc_add_ltk(struct smp_chan *smp) { struct hci_conn *hcon = smp->conn->hcon; u8 key_type, auth; if (test_bit(SMP_FLAG_DEBUG_KEY, &smp->flags)) key_type = SMP_LTK_P256_DEBUG; else key_type = SMP_LTK_P256; if (hcon->pending_sec_level == BT_SECURITY_FIPS) auth = 1; else auth = 0; smp->ltk = hci_add_ltk(hcon->hdev, &hcon->dst, hcon->dst_type, key_type, auth, smp->tk, smp->enc_key_size, 0, 0); } static void sc_generate_link_key(struct smp_chan *smp) { /* From core spec. Spells out in ASCII as 'lebr'. */ const u8 lebr[4] = { 0x72, 0x62, 0x65, 0x6c }; smp->link_key = kzalloc(16, GFP_KERNEL); if (!smp->link_key) return; if (test_bit(SMP_FLAG_CT2, &smp->flags)) { /* SALT = 0x000000000000000000000000746D7031 */ const u8 salt[16] = { 0x31, 0x70, 0x6d, 0x74 }; if (smp_h7(smp->tfm_cmac, smp->tk, salt, smp->link_key)) { kfree_sensitive(smp->link_key); smp->link_key = NULL; return; } } else { /* From core spec. Spells out in ASCII as 'tmp1'. */ const u8 tmp1[4] = { 0x31, 0x70, 0x6d, 0x74 }; if (smp_h6(smp->tfm_cmac, smp->tk, tmp1, smp->link_key)) { kfree_sensitive(smp->link_key); smp->link_key = NULL; return; } } if (smp_h6(smp->tfm_cmac, smp->link_key, lebr, smp->link_key)) { kfree_sensitive(smp->link_key); smp->link_key = NULL; return; } } static void smp_allow_key_dist(struct smp_chan *smp) { /* Allow the first expected phase 3 PDU. The rest of the PDUs * will be allowed in each PDU handler to ensure we receive * them in the correct order. */ if (smp->remote_key_dist & SMP_DIST_ENC_KEY) SMP_ALLOW_CMD(smp, SMP_CMD_ENCRYPT_INFO); else if (smp->remote_key_dist & SMP_DIST_ID_KEY) SMP_ALLOW_CMD(smp, SMP_CMD_IDENT_INFO); else if (smp->remote_key_dist & SMP_DIST_SIGN) SMP_ALLOW_CMD(smp, SMP_CMD_SIGN_INFO); } static void sc_generate_ltk(struct smp_chan *smp) { /* From core spec. Spells out in ASCII as 'brle'. 
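A small standalone illustration of the byte ordering of the h6/h7 constants used in sc_generate_link_key() above (and sc_generate_ltk() below): the spec's ASCII key IDs are stored least-significant byte first, so reading the arrays backwards, or treating them as little-endian 32-bit integers, recovers the names quoted in the comments.

#include <stdio.h>

int main(void)
{
	const unsigned char lebr[4] = { 0x72, 0x62, 0x65, 0x6c };	/* 0x6c656272, "lebr" */
	const unsigned char tmp1[4] = { 0x31, 0x70, 0x6d, 0x74 };	/* 0x746d7031, "tmp1" */

	printf("%c%c%c%c\n", lebr[3], lebr[2], lebr[1], lebr[0]);	/* prints "lebr" */
	printf("%c%c%c%c\n", tmp1[3], tmp1[2], tmp1[1], tmp1[0]);	/* prints "tmp1" */
	return 0;
}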
*/ const u8 brle[4] = { 0x65, 0x6c, 0x72, 0x62 }; struct hci_conn *hcon = smp->conn->hcon; struct hci_dev *hdev = hcon->hdev; struct link_key *key; key = hci_find_link_key(hdev, &hcon->dst); if (!key) { bt_dev_err(hdev, "no Link Key found to generate LTK"); return; } if (key->type == HCI_LK_DEBUG_COMBINATION) set_bit(SMP_FLAG_DEBUG_KEY, &smp->flags); if (test_bit(SMP_FLAG_CT2, &smp->flags)) { /* SALT = 0x000000000000000000000000746D7032 */ const u8 salt[16] = { 0x32, 0x70, 0x6d, 0x74 }; if (smp_h7(smp->tfm_cmac, key->val, salt, smp->tk)) return; } else { /* From core spec. Spells out in ASCII as 'tmp2'. */ const u8 tmp2[4] = { 0x32, 0x70, 0x6d, 0x74 }; if (smp_h6(smp->tfm_cmac, key->val, tmp2, smp->tk)) return; } if (smp_h6(smp->tfm_cmac, smp->tk, brle, smp->tk)) return; sc_add_ltk(smp); } static void smp_distribute_keys(struct smp_chan *smp) { struct smp_cmd_pairing *req, *rsp; struct l2cap_conn *conn = smp->conn; struct hci_conn *hcon = conn->hcon; struct hci_dev *hdev = hcon->hdev; __u8 *keydist; bt_dev_dbg(hdev, "conn %p", conn); rsp = (void *) &smp->prsp[1]; /* The responder sends its keys first */ if (test_bit(SMP_FLAG_INITIATOR, &smp->flags) && (smp->remote_key_dist & KEY_DIST_MASK)) { smp_allow_key_dist(smp); return; } req = (void *) &smp->preq[1]; if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { keydist = &rsp->init_key_dist; *keydist &= req->init_key_dist; } else { keydist = &rsp->resp_key_dist; *keydist &= req->resp_key_dist; } if (test_bit(SMP_FLAG_SC, &smp->flags)) { if (hcon->type == LE_LINK && (*keydist & SMP_DIST_LINK_KEY)) sc_generate_link_key(smp); if (hcon->type == ACL_LINK && (*keydist & SMP_DIST_ENC_KEY)) sc_generate_ltk(smp); /* Clear the keys which are generated but not distributed */ *keydist &= ~SMP_SC_NO_DIST; } bt_dev_dbg(hdev, "keydist 0x%x", *keydist); if (*keydist & SMP_DIST_ENC_KEY) { struct smp_cmd_encrypt_info enc; struct smp_cmd_initiator_ident ident; struct smp_ltk *ltk; u8 authenticated; __le16 ediv; __le64 rand; /* Make sure we generate only the significant amount of * bytes based on the encryption key size, and set the rest * of the value to zeroes. */ get_random_bytes(enc.ltk, smp->enc_key_size); memset(enc.ltk + smp->enc_key_size, 0, sizeof(enc.ltk) - smp->enc_key_size); get_random_bytes(&ediv, sizeof(ediv)); get_random_bytes(&rand, sizeof(rand)); smp_send_cmd(conn, SMP_CMD_ENCRYPT_INFO, sizeof(enc), &enc); authenticated = hcon->sec_level == BT_SECURITY_HIGH; ltk = hci_add_ltk(hdev, &hcon->dst, hcon->dst_type, SMP_LTK_RESPONDER, authenticated, enc.ltk, smp->enc_key_size, ediv, rand); smp->responder_ltk = ltk; ident.ediv = ediv; ident.rand = rand; smp_send_cmd(conn, SMP_CMD_INITIATOR_IDENT, sizeof(ident), &ident); *keydist &= ~SMP_DIST_ENC_KEY; } if (*keydist & SMP_DIST_ID_KEY) { struct smp_cmd_ident_addr_info addrinfo; struct smp_cmd_ident_info idinfo; memcpy(idinfo.irk, hdev->irk, sizeof(idinfo.irk)); smp_send_cmd(conn, SMP_CMD_IDENT_INFO, sizeof(idinfo), &idinfo); /* The hci_conn contains the local identity address * after the connection has been established. * * This is true even when the connection has been * established using a resolvable random address. 
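A sketch of the negotiation applied by smp_distribute_keys() above: the keys actually distributed are the bitwise AND of what the Pairing Request asked for and what the Pairing Response granted. The DIST_* values below follow the SMP Key Distribution bit assignments; the kernel's own SMP_DIST_* constants are defined in smp.h and are not part of this listing.

#include <stdint.h>

#define DIST_ENC_KEY	0x01	/* LTK + Initiator Ident (EDIV/Rand) */
#define DIST_ID_KEY	0x02	/* IRK + Identity Address */
#define DIST_SIGN	0x04	/* CSRK */
#define DIST_LINK_KEY	0x08	/* derived BR/EDR link key (SC only) */

static uint8_t keys_to_distribute(uint8_t requested, uint8_t granted)
{
	return requested & granted;	/* only key types both sides enabled */
}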
*/ bacpy(&addrinfo.bdaddr, &hcon->src); addrinfo.addr_type = hcon->src_type; smp_send_cmd(conn, SMP_CMD_IDENT_ADDR_INFO, sizeof(addrinfo), &addrinfo); *keydist &= ~SMP_DIST_ID_KEY; } if (*keydist & SMP_DIST_SIGN) { struct smp_cmd_sign_info sign; struct smp_csrk *csrk; /* Generate a new random key */ get_random_bytes(sign.csrk, sizeof(sign.csrk)); csrk = kzalloc(sizeof(*csrk), GFP_KERNEL); if (csrk) { if (hcon->sec_level > BT_SECURITY_MEDIUM) csrk->type = MGMT_CSRK_LOCAL_AUTHENTICATED; else csrk->type = MGMT_CSRK_LOCAL_UNAUTHENTICATED; memcpy(csrk->val, sign.csrk, sizeof(csrk->val)); } smp->responder_csrk = csrk; smp_send_cmd(conn, SMP_CMD_SIGN_INFO, sizeof(sign), &sign); *keydist &= ~SMP_DIST_SIGN; } /* If there are still keys to be received wait for them */ if (smp->remote_key_dist & KEY_DIST_MASK) { smp_allow_key_dist(smp); return; } set_bit(SMP_FLAG_COMPLETE, &smp->flags); smp_notify_keys(conn); smp_chan_destroy(conn); } static void smp_timeout(struct work_struct *work) { struct smp_chan *smp = container_of(work, struct smp_chan, security_timer.work); struct l2cap_conn *conn = smp->conn; bt_dev_dbg(conn->hcon->hdev, "conn %p", conn); hci_disconnect(conn->hcon, HCI_ERROR_REMOTE_USER_TERM); } static struct smp_chan *smp_chan_create(struct l2cap_conn *conn) { struct hci_conn *hcon = conn->hcon; struct l2cap_chan *chan = conn->smp; struct smp_chan *smp; smp = kzalloc(sizeof(*smp), GFP_ATOMIC); if (!smp) return NULL; smp->tfm_cmac = crypto_alloc_shash("cmac(aes)", 0, 0); if (IS_ERR(smp->tfm_cmac)) { bt_dev_err(hcon->hdev, "Unable to create CMAC crypto context"); goto zfree_smp; } smp->tfm_ecdh = crypto_alloc_kpp("ecdh-nist-p256", 0, 0); if (IS_ERR(smp->tfm_ecdh)) { bt_dev_err(hcon->hdev, "Unable to create ECDH crypto context"); goto free_shash; } smp->conn = conn; chan->data = smp; SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_FAIL); INIT_DELAYED_WORK(&smp->security_timer, smp_timeout); hci_conn_hold(hcon); return smp; free_shash: crypto_free_shash(smp->tfm_cmac); zfree_smp: kfree_sensitive(smp); return NULL; } static int sc_mackey_and_ltk(struct smp_chan *smp, u8 mackey[16], u8 ltk[16]) { struct hci_conn *hcon = smp->conn->hcon; u8 *na, *nb, a[7], b[7]; if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { na = smp->prnd; nb = smp->rrnd; } else { na = smp->rrnd; nb = smp->prnd; } memcpy(a, &hcon->init_addr, 6); memcpy(b, &hcon->resp_addr, 6); a[6] = hcon->init_addr_type; b[6] = hcon->resp_addr_type; return smp_f5(smp->tfm_cmac, smp->dhkey, na, nb, a, b, mackey, ltk); } static void sc_dhkey_check(struct smp_chan *smp) { struct hci_conn *hcon = smp->conn->hcon; struct smp_cmd_dhkey_check check; u8 a[7], b[7], *local_addr, *remote_addr; u8 io_cap[3], r[16]; memcpy(a, &hcon->init_addr, 6); memcpy(b, &hcon->resp_addr, 6); a[6] = hcon->init_addr_type; b[6] = hcon->resp_addr_type; if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { local_addr = a; remote_addr = b; memcpy(io_cap, &smp->preq[1], 3); } else { local_addr = b; remote_addr = a; memcpy(io_cap, &smp->prsp[1], 3); } memset(r, 0, sizeof(r)); if (smp->method == REQ_PASSKEY || smp->method == DSP_PASSKEY) put_unaligned_le32(hcon->passkey_notify, r); if (smp->method == REQ_OOB) memcpy(r, smp->rr, 16); smp_f6(smp->tfm_cmac, smp->mackey, smp->prnd, smp->rrnd, r, io_cap, local_addr, remote_addr, check.e); smp_send_cmd(smp->conn, SMP_CMD_DHKEY_CHECK, sizeof(check), &check); } static u8 sc_passkey_send_confirm(struct smp_chan *smp) { struct l2cap_conn *conn = smp->conn; struct hci_conn *hcon = conn->hcon; struct smp_cmd_pairing_confirm cfm; u8 r; r = 
((hcon->passkey_notify >> smp->passkey_round) & 0x01); r |= 0x80; get_random_bytes(smp->prnd, sizeof(smp->prnd)); if (smp_f4(smp->tfm_cmac, smp->local_pk, smp->remote_pk, smp->prnd, r, cfm.confirm_val)) return SMP_UNSPECIFIED; smp_send_cmd(conn, SMP_CMD_PAIRING_CONFIRM, sizeof(cfm), &cfm); return 0; } static u8 sc_passkey_round(struct smp_chan *smp, u8 smp_op) { struct l2cap_conn *conn = smp->conn; struct hci_conn *hcon = conn->hcon; struct hci_dev *hdev = hcon->hdev; u8 cfm[16], r; /* Ignore the PDU if we've already done 20 rounds (0 - 19) */ if (smp->passkey_round >= 20) return 0; switch (smp_op) { case SMP_CMD_PAIRING_RANDOM: r = ((hcon->passkey_notify >> smp->passkey_round) & 0x01); r |= 0x80; if (smp_f4(smp->tfm_cmac, smp->remote_pk, smp->local_pk, smp->rrnd, r, cfm)) return SMP_UNSPECIFIED; if (crypto_memneq(smp->pcnf, cfm, 16)) return SMP_CONFIRM_FAILED; smp->passkey_round++; if (smp->passkey_round == 20) { /* Generate MacKey and LTK */ if (sc_mackey_and_ltk(smp, smp->mackey, smp->tk)) return SMP_UNSPECIFIED; } /* The round is only complete when the initiator * receives pairing random. */ if (!test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM, sizeof(smp->prnd), smp->prnd); if (smp->passkey_round == 20) SMP_ALLOW_CMD(smp, SMP_CMD_DHKEY_CHECK); else SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_CONFIRM); return 0; } /* Start the next round */ if (smp->passkey_round != 20) return sc_passkey_round(smp, 0); /* Passkey rounds are complete - start DHKey Check */ sc_dhkey_check(smp); SMP_ALLOW_CMD(smp, SMP_CMD_DHKEY_CHECK); break; case SMP_CMD_PAIRING_CONFIRM: if (test_bit(SMP_FLAG_WAIT_USER, &smp->flags)) { set_bit(SMP_FLAG_CFM_PENDING, &smp->flags); return 0; } SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_RANDOM); if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM, sizeof(smp->prnd), smp->prnd); return 0; } return sc_passkey_send_confirm(smp); case SMP_CMD_PUBLIC_KEY: default: /* Initiating device starts the round */ if (!test_bit(SMP_FLAG_INITIATOR, &smp->flags)) return 0; bt_dev_dbg(hdev, "Starting passkey round %u", smp->passkey_round + 1); SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_CONFIRM); return sc_passkey_send_confirm(smp); } return 0; } static int sc_user_reply(struct smp_chan *smp, u16 mgmt_op, __le32 passkey) { struct l2cap_conn *conn = smp->conn; struct hci_conn *hcon = conn->hcon; u8 smp_op; clear_bit(SMP_FLAG_WAIT_USER, &smp->flags); switch (mgmt_op) { case MGMT_OP_USER_PASSKEY_NEG_REPLY: smp_failure(smp->conn, SMP_PASSKEY_ENTRY_FAILED); return 0; case MGMT_OP_USER_CONFIRM_NEG_REPLY: smp_failure(smp->conn, SMP_NUMERIC_COMP_FAILED); return 0; case MGMT_OP_USER_PASSKEY_REPLY: hcon->passkey_notify = le32_to_cpu(passkey); smp->passkey_round = 0; if (test_and_clear_bit(SMP_FLAG_CFM_PENDING, &smp->flags)) smp_op = SMP_CMD_PAIRING_CONFIRM; else smp_op = 0; if (sc_passkey_round(smp, smp_op)) return -EIO; return 0; } /* Initiator sends DHKey check first */ if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { sc_dhkey_check(smp); SMP_ALLOW_CMD(smp, SMP_CMD_DHKEY_CHECK); } else if (test_and_clear_bit(SMP_FLAG_DHKEY_PENDING, &smp->flags)) { sc_dhkey_check(smp); sc_add_ltk(smp); } return 0; } int smp_user_confirm_reply(struct hci_conn *hcon, u16 mgmt_op, __le32 passkey) { struct l2cap_conn *conn = hcon->l2cap_data; struct l2cap_chan *chan; struct smp_chan *smp; u32 value; int err; if (!conn) return -ENOTCONN; bt_dev_dbg(conn->hcon->hdev, ""); chan = conn->smp; if (!chan) return -ENOTCONN; l2cap_chan_lock(chan); if (!chan->data) { err = 
-ENOTCONN; goto unlock; } smp = chan->data; if (test_bit(SMP_FLAG_SC, &smp->flags)) { err = sc_user_reply(smp, mgmt_op, passkey); goto unlock; } switch (mgmt_op) { case MGMT_OP_USER_PASSKEY_REPLY: value = le32_to_cpu(passkey); memset(smp->tk, 0, sizeof(smp->tk)); bt_dev_dbg(conn->hcon->hdev, "PassKey: %u", value); put_unaligned_le32(value, smp->tk); fallthrough; case MGMT_OP_USER_CONFIRM_REPLY: set_bit(SMP_FLAG_TK_VALID, &smp->flags); break; case MGMT_OP_USER_PASSKEY_NEG_REPLY: case MGMT_OP_USER_CONFIRM_NEG_REPLY: smp_failure(conn, SMP_PASSKEY_ENTRY_FAILED); err = 0; goto unlock; default: smp_failure(conn, SMP_PASSKEY_ENTRY_FAILED); err = -EOPNOTSUPP; goto unlock; } err = 0; /* If it is our turn to send Pairing Confirm, do so now */ if (test_bit(SMP_FLAG_CFM_PENDING, &smp->flags)) { u8 rsp = smp_confirm(smp); if (rsp) smp_failure(conn, rsp); } unlock: l2cap_chan_unlock(chan); return err; } static void build_bredr_pairing_cmd(struct smp_chan *smp, struct smp_cmd_pairing *req, struct smp_cmd_pairing *rsp) { struct l2cap_conn *conn = smp->conn; struct hci_dev *hdev = conn->hcon->hdev; u8 local_dist = 0, remote_dist = 0; if (hci_dev_test_flag(hdev, HCI_BONDABLE)) { local_dist = SMP_DIST_ENC_KEY | SMP_DIST_SIGN; remote_dist = SMP_DIST_ENC_KEY | SMP_DIST_SIGN; } if (hci_dev_test_flag(hdev, HCI_RPA_RESOLVING)) remote_dist |= SMP_DIST_ID_KEY; if (hci_dev_test_flag(hdev, HCI_PRIVACY)) local_dist |= SMP_DIST_ID_KEY; if (!rsp) { memset(req, 0, sizeof(*req)); req->auth_req = SMP_AUTH_CT2; req->init_key_dist = local_dist; req->resp_key_dist = remote_dist; req->max_key_size = conn->hcon->enc_key_size; smp->remote_key_dist = remote_dist; return; } memset(rsp, 0, sizeof(*rsp)); rsp->auth_req = SMP_AUTH_CT2; rsp->max_key_size = conn->hcon->enc_key_size; rsp->init_key_dist = req->init_key_dist & remote_dist; rsp->resp_key_dist = req->resp_key_dist & local_dist; smp->remote_key_dist = rsp->init_key_dist; } static u8 smp_cmd_pairing_req(struct l2cap_conn *conn, struct sk_buff *skb) { struct smp_cmd_pairing rsp, *req = (void *) skb->data; struct l2cap_chan *chan = conn->smp; struct hci_dev *hdev = conn->hcon->hdev; struct smp_chan *smp = chan->data; u8 key_size, auth, sec_level; int ret; bt_dev_dbg(hdev, "conn %p", conn); if (skb->len < sizeof(*req)) return SMP_INVALID_PARAMS; if (smp && test_bit(SMP_FLAG_INITIATOR, &smp->flags)) return SMP_CMD_NOTSUPP; if (!smp) { smp = smp_chan_create(conn); if (!smp) return SMP_UNSPECIFIED; } /* We didn't start the pairing, so match remote */ auth = req->auth_req & AUTH_REQ_MASK(hdev); if (!hci_dev_test_flag(hdev, HCI_BONDABLE) && (auth & SMP_AUTH_BONDING)) return SMP_PAIRING_NOTSUPP; if (hci_dev_test_flag(hdev, HCI_SC_ONLY) && !(auth & SMP_AUTH_SC)) return SMP_AUTH_REQUIREMENTS; smp->preq[0] = SMP_CMD_PAIRING_REQ; memcpy(&smp->preq[1], req, sizeof(*req)); skb_pull(skb, sizeof(*req)); /* If the remote side's OOB flag is set it means it has * successfully received our local OOB data - therefore set the * flag to indicate that local OOB is in use. 
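A sketch of the per-round commitment value used by sc_passkey_send_confirm() and sc_passkey_round() above: a 6-digit passkey fits in 20 bits, each of the 20 rounds commits to a single bit, and the 0x80 marker turns bit i into the ri value (0x80 or 0x81) that is fed to f4().

#include <stdint.h>

static uint8_t passkey_round_r(uint32_t passkey, unsigned int round)
{
	/* round runs 0..19; 999999 < 2^20, so all higher bits are zero */
	return 0x80 | ((passkey >> round) & 0x01);
}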
*/ if (req->oob_flag == SMP_OOB_PRESENT && SMP_DEV(hdev)->local_oob) set_bit(SMP_FLAG_LOCAL_OOB, &smp->flags); /* SMP over BR/EDR requires special treatment */ if (conn->hcon->type == ACL_LINK) { /* We must have a BR/EDR SC link */ if (!test_bit(HCI_CONN_AES_CCM, &conn->hcon->flags) && !hci_dev_test_flag(hdev, HCI_FORCE_BREDR_SMP)) return SMP_CROSS_TRANSP_NOT_ALLOWED; set_bit(SMP_FLAG_SC, &smp->flags); build_bredr_pairing_cmd(smp, req, &rsp); if (req->auth_req & SMP_AUTH_CT2) set_bit(SMP_FLAG_CT2, &smp->flags); key_size = min(req->max_key_size, rsp.max_key_size); if (check_enc_key_size(conn, key_size)) return SMP_ENC_KEY_SIZE; /* Clear bits which are generated but not distributed */ smp->remote_key_dist &= ~SMP_SC_NO_DIST; smp->prsp[0] = SMP_CMD_PAIRING_RSP; memcpy(&smp->prsp[1], &rsp, sizeof(rsp)); smp_send_cmd(conn, SMP_CMD_PAIRING_RSP, sizeof(rsp), &rsp); smp_distribute_keys(smp); return 0; } build_pairing_cmd(conn, req, &rsp, auth); if (rsp.auth_req & SMP_AUTH_SC) { set_bit(SMP_FLAG_SC, &smp->flags); if (rsp.auth_req & SMP_AUTH_CT2) set_bit(SMP_FLAG_CT2, &smp->flags); } if (conn->hcon->io_capability == HCI_IO_NO_INPUT_OUTPUT) sec_level = BT_SECURITY_MEDIUM; else sec_level = authreq_to_seclevel(auth); if (sec_level > conn->hcon->pending_sec_level) conn->hcon->pending_sec_level = sec_level; /* If we need MITM check that it can be achieved */ if (conn->hcon->pending_sec_level >= BT_SECURITY_HIGH) { u8 method; method = get_auth_method(smp, conn->hcon->io_capability, req->io_capability); if (method == JUST_WORKS || method == JUST_CFM) return SMP_AUTH_REQUIREMENTS; } key_size = min(req->max_key_size, rsp.max_key_size); if (check_enc_key_size(conn, key_size)) return SMP_ENC_KEY_SIZE; get_random_bytes(smp->prnd, sizeof(smp->prnd)); smp->prsp[0] = SMP_CMD_PAIRING_RSP; memcpy(&smp->prsp[1], &rsp, sizeof(rsp)); smp_send_cmd(conn, SMP_CMD_PAIRING_RSP, sizeof(rsp), &rsp); clear_bit(SMP_FLAG_INITIATOR, &smp->flags); /* Strictly speaking we shouldn't allow Pairing Confirm for the * SC case, however some implementations incorrectly copy RFU auth * req bits from our security request, which may create a false * positive SC enablement. */ SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_CONFIRM); if (test_bit(SMP_FLAG_SC, &smp->flags)) { SMP_ALLOW_CMD(smp, SMP_CMD_PUBLIC_KEY); /* Clear bits which are generated but not distributed */ smp->remote_key_dist &= ~SMP_SC_NO_DIST; /* Wait for Public Key from Initiating Device */ return 0; } /* Request setup of TK */ ret = tk_request(conn, 0, auth, rsp.io_capability, req->io_capability); if (ret) return SMP_UNSPECIFIED; return 0; } static u8 sc_send_public_key(struct smp_chan *smp) { struct hci_dev *hdev = smp->conn->hcon->hdev; bt_dev_dbg(hdev, ""); if (test_bit(SMP_FLAG_LOCAL_OOB, &smp->flags)) { struct l2cap_chan *chan = hdev->smp_data; struct smp_dev *smp_dev; if (!chan || !chan->data) return SMP_UNSPECIFIED; smp_dev = chan->data; memcpy(smp->local_pk, smp_dev->local_pk, 64); memcpy(smp->lr, smp_dev->local_rand, 16); if (smp_dev->debug_key) set_bit(SMP_FLAG_DEBUG_KEY, &smp->flags); goto done; } if (hci_dev_test_flag(hdev, HCI_USE_DEBUG_KEYS)) { bt_dev_dbg(hdev, "Using debug keys"); if (set_ecdh_privkey(smp->tfm_ecdh, debug_sk)) return SMP_UNSPECIFIED; memcpy(smp->local_pk, debug_pk, 64); set_bit(SMP_FLAG_DEBUG_KEY, &smp->flags); } else { while (true) { /* Generate key pair for Secure Connections */ if (generate_ecdh_keys(smp->tfm_ecdh, smp->local_pk)) return SMP_UNSPECIFIED; /* This is unlikely, but we need to check that * we didn't accidentally generate a debug key. 
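An illustrative user-space version of why the confirm and public-key comparisons in this file use crypto_memneq() rather than memcmp(): a constant-time check inspects every byte regardless of where the first mismatch occurs, so the comparison time reveals nothing about how many leading bytes of a secret-derived value matched. This is a sketch of the idea, not the kernel implementation.

#include <stddef.h>

static int ct_memneq(const void *a, const void *b, size_t n)
{
	const unsigned char *pa = a, *pb = b;
	unsigned char diff = 0;
	size_t i;

	for (i = 0; i < n; i++)
		diff |= pa[i] ^ pb[i];	/* accumulate differences, never exit early */

	return diff != 0;		/* non-zero: the buffers differ */
}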
*/ if (crypto_memneq(smp->local_pk, debug_pk, 64)) break; } } done: SMP_DBG("Local Public Key X: %32phN", smp->local_pk); SMP_DBG("Local Public Key Y: %32phN", smp->local_pk + 32); smp_send_cmd(smp->conn, SMP_CMD_PUBLIC_KEY, 64, smp->local_pk); return 0; } static u8 smp_cmd_pairing_rsp(struct l2cap_conn *conn, struct sk_buff *skb) { struct smp_cmd_pairing *req, *rsp = (void *) skb->data; struct l2cap_chan *chan = conn->smp; struct smp_chan *smp = chan->data; struct hci_dev *hdev = conn->hcon->hdev; u8 key_size, auth; int ret; bt_dev_dbg(hdev, "conn %p", conn); if (skb->len < sizeof(*rsp)) return SMP_INVALID_PARAMS; if (!test_bit(SMP_FLAG_INITIATOR, &smp->flags)) return SMP_CMD_NOTSUPP; skb_pull(skb, sizeof(*rsp)); req = (void *) &smp->preq[1]; key_size = min(req->max_key_size, rsp->max_key_size); if (check_enc_key_size(conn, key_size)) return SMP_ENC_KEY_SIZE; auth = rsp->auth_req & AUTH_REQ_MASK(hdev); if (hci_dev_test_flag(hdev, HCI_SC_ONLY) && !(auth & SMP_AUTH_SC)) return SMP_AUTH_REQUIREMENTS; /* If the remote side's OOB flag is set it means it has * successfully received our local OOB data - therefore set the * flag to indicate that local OOB is in use. */ if (rsp->oob_flag == SMP_OOB_PRESENT && SMP_DEV(hdev)->local_oob) set_bit(SMP_FLAG_LOCAL_OOB, &smp->flags); smp->prsp[0] = SMP_CMD_PAIRING_RSP; memcpy(&smp->prsp[1], rsp, sizeof(*rsp)); /* Update remote key distribution in case the remote cleared * some bits that we had enabled in our request. */ smp->remote_key_dist &= rsp->resp_key_dist; if ((req->auth_req & SMP_AUTH_CT2) && (auth & SMP_AUTH_CT2)) set_bit(SMP_FLAG_CT2, &smp->flags); /* For BR/EDR this means we're done and can start phase 3 */ if (conn->hcon->type == ACL_LINK) { /* Clear bits which are generated but not distributed */ smp->remote_key_dist &= ~SMP_SC_NO_DIST; smp_distribute_keys(smp); return 0; } if ((req->auth_req & SMP_AUTH_SC) && (auth & SMP_AUTH_SC)) set_bit(SMP_FLAG_SC, &smp->flags); else if (conn->hcon->pending_sec_level > BT_SECURITY_HIGH) conn->hcon->pending_sec_level = BT_SECURITY_HIGH; /* If we need MITM check that it can be achieved */ if (conn->hcon->pending_sec_level >= BT_SECURITY_HIGH) { u8 method; method = get_auth_method(smp, req->io_capability, rsp->io_capability); if (method == JUST_WORKS || method == JUST_CFM) return SMP_AUTH_REQUIREMENTS; } get_random_bytes(smp->prnd, sizeof(smp->prnd)); /* Update remote key distribution in case the remote cleared * some bits that we had enabled in our request. 
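A sketch of the encryption key size negotiation done in smp_cmd_pairing_req() and smp_cmd_pairing_rsp() above: each side advertises a maximum, the smaller value wins, and an out-of-range result is rejected with SMP_ENC_KEY_SIZE. The 7..16 octet bounds below are the specification's limits; the exact checks inside check_enc_key_size() are not part of this listing.

#include <stdint.h>

static int negotiate_key_size(uint8_t local_max, uint8_t remote_max, uint8_t *out)
{
	uint8_t key_size = local_max < remote_max ? local_max : remote_max;

	if (key_size < 7 || key_size > 16)
		return -1;		/* caller maps this to SMP_ENC_KEY_SIZE */

	*out = key_size;
	return 0;
}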
*/ smp->remote_key_dist &= rsp->resp_key_dist; if (test_bit(SMP_FLAG_SC, &smp->flags)) { /* Clear bits which are generated but not distributed */ smp->remote_key_dist &= ~SMP_SC_NO_DIST; SMP_ALLOW_CMD(smp, SMP_CMD_PUBLIC_KEY); return sc_send_public_key(smp); } auth |= req->auth_req; ret = tk_request(conn, 0, auth, req->io_capability, rsp->io_capability); if (ret) return SMP_UNSPECIFIED; set_bit(SMP_FLAG_CFM_PENDING, &smp->flags); /* Can't compose response until we have been confirmed */ if (test_bit(SMP_FLAG_TK_VALID, &smp->flags)) return smp_confirm(smp); return 0; } static u8 sc_check_confirm(struct smp_chan *smp) { struct l2cap_conn *conn = smp->conn; bt_dev_dbg(conn->hcon->hdev, ""); if (smp->method == REQ_PASSKEY || smp->method == DSP_PASSKEY) return sc_passkey_round(smp, SMP_CMD_PAIRING_CONFIRM); if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM, sizeof(smp->prnd), smp->prnd); SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_RANDOM); } return 0; } /* Work-around for some implementations that incorrectly copy RFU bits * from our security request and thereby create the impression that * we're doing SC when in fact the remote doesn't support it. */ static int fixup_sc_false_positive(struct smp_chan *smp) { struct l2cap_conn *conn = smp->conn; struct hci_conn *hcon = conn->hcon; struct hci_dev *hdev = hcon->hdev; struct smp_cmd_pairing *req, *rsp; u8 auth; /* The issue is only observed when we're in responder role */ if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) return SMP_UNSPECIFIED; if (hci_dev_test_flag(hdev, HCI_SC_ONLY)) { bt_dev_err(hdev, "refusing legacy fallback in SC-only mode"); return SMP_UNSPECIFIED; } bt_dev_err(hdev, "trying to fall back to legacy SMP"); req = (void *) &smp->preq[1]; rsp = (void *) &smp->prsp[1]; /* Rebuild key dist flags which may have been cleared for SC */ smp->remote_key_dist = (req->init_key_dist & rsp->resp_key_dist); auth = req->auth_req & AUTH_REQ_MASK(hdev); if (tk_request(conn, 0, auth, rsp->io_capability, req->io_capability)) { bt_dev_err(hdev, "failed to fall back to legacy SMP"); return SMP_UNSPECIFIED; } clear_bit(SMP_FLAG_SC, &smp->flags); return 0; } static u8 smp_cmd_pairing_confirm(struct l2cap_conn *conn, struct sk_buff *skb) { struct l2cap_chan *chan = conn->smp; struct smp_chan *smp = chan->data; struct hci_conn *hcon = conn->hcon; struct hci_dev *hdev = hcon->hdev; bt_dev_dbg(hdev, "conn %p %s", conn, test_bit(SMP_FLAG_INITIATOR, &smp->flags) ? 
"initiator" : "responder"); if (skb->len < sizeof(smp->pcnf)) return SMP_INVALID_PARAMS; memcpy(smp->pcnf, skb->data, sizeof(smp->pcnf)); skb_pull(skb, sizeof(smp->pcnf)); if (test_bit(SMP_FLAG_SC, &smp->flags)) { int ret; /* Public Key exchange must happen before any other steps */ if (test_bit(SMP_FLAG_REMOTE_PK, &smp->flags)) return sc_check_confirm(smp); bt_dev_err(hdev, "Unexpected SMP Pairing Confirm"); ret = fixup_sc_false_positive(smp); if (ret) return ret; } if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM, sizeof(smp->prnd), smp->prnd); SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_RANDOM); return 0; } if (test_bit(SMP_FLAG_TK_VALID, &smp->flags)) return smp_confirm(smp); set_bit(SMP_FLAG_CFM_PENDING, &smp->flags); return 0; } static u8 smp_cmd_pairing_random(struct l2cap_conn *conn, struct sk_buff *skb) { struct l2cap_chan *chan = conn->smp; struct smp_chan *smp = chan->data; struct hci_conn *hcon = conn->hcon; u8 *pkax, *pkbx, *na, *nb, confirm_hint; u32 passkey; int err; bt_dev_dbg(hcon->hdev, "conn %p", conn); if (skb->len < sizeof(smp->rrnd)) return SMP_INVALID_PARAMS; memcpy(smp->rrnd, skb->data, sizeof(smp->rrnd)); skb_pull(skb, sizeof(smp->rrnd)); if (!test_bit(SMP_FLAG_SC, &smp->flags)) return smp_random(smp); if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { pkax = smp->local_pk; pkbx = smp->remote_pk; na = smp->prnd; nb = smp->rrnd; } else { pkax = smp->remote_pk; pkbx = smp->local_pk; na = smp->rrnd; nb = smp->prnd; } if (smp->method == REQ_OOB) { if (!test_bit(SMP_FLAG_INITIATOR, &smp->flags)) smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM, sizeof(smp->prnd), smp->prnd); SMP_ALLOW_CMD(smp, SMP_CMD_DHKEY_CHECK); goto mackey_and_ltk; } /* Passkey entry has special treatment */ if (smp->method == REQ_PASSKEY || smp->method == DSP_PASSKEY) return sc_passkey_round(smp, SMP_CMD_PAIRING_RANDOM); if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { u8 cfm[16]; err = smp_f4(smp->tfm_cmac, smp->remote_pk, smp->local_pk, smp->rrnd, 0, cfm); if (err) return SMP_UNSPECIFIED; if (crypto_memneq(smp->pcnf, cfm, 16)) return SMP_CONFIRM_FAILED; } else { smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM, sizeof(smp->prnd), smp->prnd); SMP_ALLOW_CMD(smp, SMP_CMD_DHKEY_CHECK); /* Only Just-Works pairing requires extra checks */ if (smp->method != JUST_WORKS) goto mackey_and_ltk; /* If there already exists long term key in local host, leave * the decision to user space since the remote device could * be legitimate or malicious. */ if (hci_find_ltk(hcon->hdev, &hcon->dst, hcon->dst_type, hcon->role)) { /* Set passkey to 0. The value can be any number since * it'll be ignored anyway. 
*/ passkey = 0; confirm_hint = 1; goto confirm; } } mackey_and_ltk: /* Generate MacKey and LTK */ err = sc_mackey_and_ltk(smp, smp->mackey, smp->tk); if (err) return SMP_UNSPECIFIED; if (smp->method == REQ_OOB) { if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { sc_dhkey_check(smp); SMP_ALLOW_CMD(smp, SMP_CMD_DHKEY_CHECK); } return 0; } err = smp_g2(smp->tfm_cmac, pkax, pkbx, na, nb, &passkey); if (err) return SMP_UNSPECIFIED; confirm_hint = 0; confirm: if (smp->method == JUST_WORKS) confirm_hint = 1; err = mgmt_user_confirm_request(hcon->hdev, &hcon->dst, hcon->type, hcon->dst_type, passkey, confirm_hint); if (err) return SMP_UNSPECIFIED; set_bit(SMP_FLAG_WAIT_USER, &smp->flags); return 0; } static bool smp_ltk_encrypt(struct l2cap_conn *conn, u8 sec_level) { struct smp_ltk *key; struct hci_conn *hcon = conn->hcon; key = hci_find_ltk(hcon->hdev, &hcon->dst, hcon->dst_type, hcon->role); if (!key) return false; if (smp_ltk_sec_level(key) < sec_level) return false; if (test_and_set_bit(HCI_CONN_ENCRYPT_PEND, &hcon->flags)) return true; hci_le_start_enc(hcon, key->ediv, key->rand, key->val, key->enc_size); hcon->enc_key_size = key->enc_size; /* We never store STKs for initiator role, so clear this flag */ clear_bit(HCI_CONN_STK_ENCRYPT, &hcon->flags); return true; } bool smp_sufficient_security(struct hci_conn *hcon, u8 sec_level, enum smp_key_pref key_pref) { if (sec_level == BT_SECURITY_LOW) return true; /* If we're encrypted with an STK but the caller prefers using * LTK claim insufficient security. This way we allow the * connection to be re-encrypted with an LTK, even if the LTK * provides the same level of security. Only exception is if we * don't have an LTK (e.g. because of key distribution bits). */ if (key_pref == SMP_USE_LTK && test_bit(HCI_CONN_STK_ENCRYPT, &hcon->flags) && hci_find_ltk(hcon->hdev, &hcon->dst, hcon->dst_type, hcon->role)) return false; if (hcon->sec_level >= sec_level) return true; return false; } static void smp_send_pairing_req(struct smp_chan *smp, __u8 auth) { struct smp_cmd_pairing cp; if (smp->conn->hcon->type == ACL_LINK) build_bredr_pairing_cmd(smp, &cp, NULL); else build_pairing_cmd(smp->conn, &cp, NULL, auth); smp->preq[0] = SMP_CMD_PAIRING_REQ; memcpy(&smp->preq[1], &cp, sizeof(cp)); smp_send_cmd(smp->conn, SMP_CMD_PAIRING_REQ, sizeof(cp), &cp); SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_RSP); set_bit(SMP_FLAG_INITIATOR, &smp->flags); } static u8 smp_cmd_security_req(struct l2cap_conn *conn, struct sk_buff *skb) { struct smp_cmd_security_req *rp = (void *) skb->data; struct hci_conn *hcon = conn->hcon; struct hci_dev *hdev = hcon->hdev; struct smp_chan *smp; u8 sec_level, auth; bt_dev_dbg(hdev, "conn %p", conn); if (skb->len < sizeof(*rp)) return SMP_INVALID_PARAMS; if (hcon->role != HCI_ROLE_MASTER) return SMP_CMD_NOTSUPP; auth = rp->auth_req & AUTH_REQ_MASK(hdev); if (hci_dev_test_flag(hdev, HCI_SC_ONLY) && !(auth & SMP_AUTH_SC)) return SMP_AUTH_REQUIREMENTS; if (hcon->io_capability == HCI_IO_NO_INPUT_OUTPUT) sec_level = BT_SECURITY_MEDIUM; else sec_level = authreq_to_seclevel(auth); if (smp_sufficient_security(hcon, sec_level, SMP_USE_LTK)) { /* If link is already encrypted with sufficient security we * still need refresh encryption as per Core Spec 5.0 Vol 3, * Part H 2.4.6 */ smp_ltk_encrypt(conn, hcon->sec_level); return 0; } if (sec_level > hcon->pending_sec_level) hcon->pending_sec_level = sec_level; if (smp_ltk_encrypt(conn, hcon->pending_sec_level)) return 0; smp = smp_chan_create(conn); if (!smp) return SMP_UNSPECIFIED; if (!hci_dev_test_flag(hdev, 
HCI_BONDABLE) && (auth & SMP_AUTH_BONDING)) return SMP_PAIRING_NOTSUPP; skb_pull(skb, sizeof(*rp)); smp_send_pairing_req(smp, auth); return 0; } static void smp_send_security_req(struct smp_chan *smp, __u8 auth) { struct smp_cmd_security_req cp; cp.auth_req = auth; smp_send_cmd(smp->conn, SMP_CMD_SECURITY_REQ, sizeof(cp), &cp); SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_REQ); clear_bit(SMP_FLAG_INITIATOR, &smp->flags); } int smp_conn_security(struct hci_conn *hcon, __u8 sec_level) { struct l2cap_conn *conn = hcon->l2cap_data; struct l2cap_chan *chan; struct smp_chan *smp; __u8 authreq; int ret; bt_dev_dbg(hcon->hdev, "conn %p hcon %p level 0x%2.2x", conn, hcon, sec_level); /* This may be NULL if there's an unexpected disconnection */ if (!conn) return 1; if (!hci_dev_test_flag(hcon->hdev, HCI_LE_ENABLED)) return 1; if (smp_sufficient_security(hcon, sec_level, SMP_USE_LTK)) return 1; if (sec_level > hcon->pending_sec_level) hcon->pending_sec_level = sec_level; if (hcon->role == HCI_ROLE_MASTER) if (smp_ltk_encrypt(conn, hcon->pending_sec_level)) return 0; chan = conn->smp; if (!chan) { bt_dev_err(hcon->hdev, "security requested but not available"); return 1; } l2cap_chan_lock(chan); /* If SMP is already in progress ignore this request */ if (chan->data) { ret = 0; goto unlock; } smp = smp_chan_create(conn); if (!smp) { ret = 1; goto unlock; } authreq = seclevel_to_authreq(sec_level); if (hci_dev_test_flag(hcon->hdev, HCI_SC_ENABLED)) { authreq |= SMP_AUTH_SC; if (hci_dev_test_flag(hcon->hdev, HCI_SSP_ENABLED)) authreq |= SMP_AUTH_CT2; } /* Don't attempt to set MITM if setting is overridden by debugfs * Needed to pass certification test SM/MAS/PKE/BV-01-C */ if (!hci_dev_test_flag(hcon->hdev, HCI_FORCE_NO_MITM)) { /* Require MITM if IO Capability allows or the security level * requires it. */ if (hcon->io_capability != HCI_IO_NO_INPUT_OUTPUT || hcon->pending_sec_level > BT_SECURITY_MEDIUM) authreq |= SMP_AUTH_MITM; } if (hcon->role == HCI_ROLE_MASTER) smp_send_pairing_req(smp, authreq); else smp_send_security_req(smp, authreq); ret = 0; unlock: l2cap_chan_unlock(chan); return ret; } int smp_cancel_and_remove_pairing(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 addr_type) { struct hci_conn *hcon; struct l2cap_conn *conn; struct l2cap_chan *chan; struct smp_chan *smp; int err; err = hci_remove_ltk(hdev, bdaddr, addr_type); hci_remove_irk(hdev, bdaddr, addr_type); hcon = hci_conn_hash_lookup_le(hdev, bdaddr, addr_type); if (!hcon) goto done; conn = hcon->l2cap_data; if (!conn) goto done; chan = conn->smp; if (!chan) goto done; l2cap_chan_lock(chan); smp = chan->data; if (smp) { /* Set keys to NULL to make sure smp_failure() does not try to * remove and free already invalidated rcu list entries. 
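A sketch of how the AuthReq octet assembled in smp_conn_security() above is laid out. The bit positions follow the SMP AuthReq field of the core specification; the kernel's SMP_AUTH_* constants live in smp.h and are assumed here rather than shown.

#include <stdbool.h>
#include <stdint.h>

#define AUTH_BONDING	0x01	/* bits 0-1: bonding flags */
#define AUTH_MITM	0x04	/* bit 2: MITM protection requested */
#define AUTH_SC		0x08	/* bit 3: LE Secure Connections */
#define AUTH_KEYPRESS	0x10	/* bit 4: keypress notifications */
#define AUTH_CT2	0x20	/* bit 5: h7-based cross-transport key derivation */

static uint8_t build_authreq(bool bondable, bool want_mitm, bool sc, bool ct2)
{
	uint8_t auth = 0;

	if (bondable)
		auth |= AUTH_BONDING;
	if (want_mitm)
		auth |= AUTH_MITM;
	if (sc)
		auth |= AUTH_SC;
	if (ct2)
		auth |= AUTH_CT2;	/* only meaningful together with SC */
	return auth;
}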
*/ smp->ltk = NULL; smp->responder_ltk = NULL; smp->remote_irk = NULL; if (test_bit(SMP_FLAG_COMPLETE, &smp->flags)) smp_failure(conn, 0); else smp_failure(conn, SMP_UNSPECIFIED); err = 0; } l2cap_chan_unlock(chan); done: return err; } static int smp_cmd_encrypt_info(struct l2cap_conn *conn, struct sk_buff *skb) { struct smp_cmd_encrypt_info *rp = (void *) skb->data; struct l2cap_chan *chan = conn->smp; struct smp_chan *smp = chan->data; bt_dev_dbg(conn->hcon->hdev, "conn %p", conn); if (skb->len < sizeof(*rp)) return SMP_INVALID_PARAMS; /* Pairing is aborted if any blocked keys are distributed */ if (hci_is_blocked_key(conn->hcon->hdev, HCI_BLOCKED_KEY_TYPE_LTK, rp->ltk)) { bt_dev_warn_ratelimited(conn->hcon->hdev, "LTK blocked for %pMR", &conn->hcon->dst); return SMP_INVALID_PARAMS; } SMP_ALLOW_CMD(smp, SMP_CMD_INITIATOR_IDENT); skb_pull(skb, sizeof(*rp)); memcpy(smp->tk, rp->ltk, sizeof(smp->tk)); return 0; } static int smp_cmd_initiator_ident(struct l2cap_conn *conn, struct sk_buff *skb) { struct smp_cmd_initiator_ident *rp = (void *)skb->data; struct l2cap_chan *chan = conn->smp; struct smp_chan *smp = chan->data; struct hci_dev *hdev = conn->hcon->hdev; struct hci_conn *hcon = conn->hcon; struct smp_ltk *ltk; u8 authenticated; bt_dev_dbg(hdev, "conn %p", conn); if (skb->len < sizeof(*rp)) return SMP_INVALID_PARAMS; /* Mark the information as received */ smp->remote_key_dist &= ~SMP_DIST_ENC_KEY; if (smp->remote_key_dist & SMP_DIST_ID_KEY) SMP_ALLOW_CMD(smp, SMP_CMD_IDENT_INFO); else if (smp->remote_key_dist & SMP_DIST_SIGN) SMP_ALLOW_CMD(smp, SMP_CMD_SIGN_INFO); skb_pull(skb, sizeof(*rp)); authenticated = (hcon->sec_level == BT_SECURITY_HIGH); ltk = hci_add_ltk(hdev, &hcon->dst, hcon->dst_type, SMP_LTK, authenticated, smp->tk, smp->enc_key_size, rp->ediv, rp->rand); smp->ltk = ltk; if (!(smp->remote_key_dist & KEY_DIST_MASK)) smp_distribute_keys(smp); return 0; } static int smp_cmd_ident_info(struct l2cap_conn *conn, struct sk_buff *skb) { struct smp_cmd_ident_info *info = (void *) skb->data; struct l2cap_chan *chan = conn->smp; struct smp_chan *smp = chan->data; bt_dev_dbg(conn->hcon->hdev, ""); if (skb->len < sizeof(*info)) return SMP_INVALID_PARAMS; /* Pairing is aborted if any blocked keys are distributed */ if (hci_is_blocked_key(conn->hcon->hdev, HCI_BLOCKED_KEY_TYPE_IRK, info->irk)) { bt_dev_warn_ratelimited(conn->hcon->hdev, "Identity key blocked for %pMR", &conn->hcon->dst); return SMP_INVALID_PARAMS; } SMP_ALLOW_CMD(smp, SMP_CMD_IDENT_ADDR_INFO); skb_pull(skb, sizeof(*info)); memcpy(smp->irk, info->irk, 16); return 0; } static int smp_cmd_ident_addr_info(struct l2cap_conn *conn, struct sk_buff *skb) { struct smp_cmd_ident_addr_info *info = (void *) skb->data; struct l2cap_chan *chan = conn->smp; struct smp_chan *smp = chan->data; struct hci_conn *hcon = conn->hcon; bdaddr_t rpa; bt_dev_dbg(hcon->hdev, ""); if (skb->len < sizeof(*info)) return SMP_INVALID_PARAMS; /* Mark the information as received */ smp->remote_key_dist &= ~SMP_DIST_ID_KEY; if (smp->remote_key_dist & SMP_DIST_SIGN) SMP_ALLOW_CMD(smp, SMP_CMD_SIGN_INFO); skb_pull(skb, sizeof(*info)); /* Strictly speaking the Core Specification (4.1) allows sending * an empty address which would force us to rely on just the IRK * as "identity information". However, since such * implementations are not known of and in order to not over * complicate our implementation, simply pretend that we never * received an IRK for such a device. 
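A sketch of the receive ordering that these phase-3 handlers enforce together with smp_allow_key_dist() further up: pending key types are cleared one at a time and only the first PDU of the next pending type is armed, so keys must arrive as encryption key material, then identity material, then the CSRK. The PEND_* names are illustrative stand-ins for the SMP_DIST_* bits.

#include <stdint.h>

#define PEND_ENC_KEY	0x01
#define PEND_ID_KEY	0x02
#define PEND_SIGN	0x04

/* Returns the key type whose first PDU should be allowed next, or 0 when done. */
static uint8_t next_expected_key(uint8_t remote_key_dist)
{
	if (remote_key_dist & PEND_ENC_KEY)
		return PEND_ENC_KEY;	/* expect SMP_CMD_ENCRYPT_INFO */
	if (remote_key_dist & PEND_ID_KEY)
		return PEND_ID_KEY;	/* expect SMP_CMD_IDENT_INFO */
	if (remote_key_dist & PEND_SIGN)
		return PEND_SIGN;	/* expect SMP_CMD_SIGN_INFO */
	return 0;			/* all expected keys received */
}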
* * The Identity Address must also be a Static Random or Public * Address, which hci_is_identity_address() checks for. */ if (!bacmp(&info->bdaddr, BDADDR_ANY) || !hci_is_identity_address(&info->bdaddr, info->addr_type)) { bt_dev_err(hcon->hdev, "ignoring IRK with no identity address"); goto distribute; } /* Drop IRK if peer is using identity address during pairing but is * providing different address as identity information. * * Microsoft Surface Precision Mouse is known to have this bug. */ if (hci_is_identity_address(&hcon->dst, hcon->dst_type) && (bacmp(&info->bdaddr, &hcon->dst) || info->addr_type != hcon->dst_type)) { bt_dev_err(hcon->hdev, "ignoring IRK with invalid identity address"); goto distribute; } bacpy(&smp->id_addr, &info->bdaddr); smp->id_addr_type = info->addr_type; if (hci_bdaddr_is_rpa(&hcon->dst, hcon->dst_type)) bacpy(&rpa, &hcon->dst); else bacpy(&rpa, BDADDR_ANY); smp->remote_irk = hci_add_irk(conn->hcon->hdev, &smp->id_addr, smp->id_addr_type, smp->irk, &rpa); distribute: if (!(smp->remote_key_dist & KEY_DIST_MASK)) smp_distribute_keys(smp); return 0; } static int smp_cmd_sign_info(struct l2cap_conn *conn, struct sk_buff *skb) { struct smp_cmd_sign_info *rp = (void *) skb->data; struct l2cap_chan *chan = conn->smp; struct smp_chan *smp = chan->data; struct smp_csrk *csrk; bt_dev_dbg(conn->hcon->hdev, "conn %p", conn); if (skb->len < sizeof(*rp)) return SMP_INVALID_PARAMS; /* Mark the information as received */ smp->remote_key_dist &= ~SMP_DIST_SIGN; skb_pull(skb, sizeof(*rp)); csrk = kzalloc(sizeof(*csrk), GFP_KERNEL); if (csrk) { if (conn->hcon->sec_level > BT_SECURITY_MEDIUM) csrk->type = MGMT_CSRK_REMOTE_AUTHENTICATED; else csrk->type = MGMT_CSRK_REMOTE_UNAUTHENTICATED; memcpy(csrk->val, rp->csrk, sizeof(csrk->val)); } smp->csrk = csrk; smp_distribute_keys(smp); return 0; } static u8 sc_select_method(struct smp_chan *smp) { struct smp_cmd_pairing *local, *remote; u8 local_mitm, remote_mitm, local_io, remote_io, method; if (test_bit(SMP_FLAG_REMOTE_OOB, &smp->flags) || test_bit(SMP_FLAG_LOCAL_OOB, &smp->flags)) return REQ_OOB; /* The preq/prsp contain the raw Pairing Request/Response PDUs * which are needed as inputs to some crypto functions. To get * the "struct smp_cmd_pairing" from them we need to skip the * first byte which contains the opcode. */ if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { local = (void *) &smp->preq[1]; remote = (void *) &smp->prsp[1]; } else { local = (void *) &smp->prsp[1]; remote = (void *) &smp->preq[1]; } local_io = local->io_capability; remote_io = remote->io_capability; local_mitm = (local->auth_req & SMP_AUTH_MITM); remote_mitm = (remote->auth_req & SMP_AUTH_MITM); /* If either side wants MITM, look up the method from the table, * otherwise use JUST WORKS. 
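A sketch of the selection logic that follows in sc_select_method(): OOB data on either side wins outright, the IO-capability table is only consulted when at least one side set the MITM bit, and otherwise the result is Just Works. The enum names are illustrative, and table_method stands in for the result of the get_auth_method() lookup defined earlier in this file.

enum pair_method { M_JUST_WORKS, M_JUST_CFM, M_REQ_PASSKEY,
		   M_CFM_PASSKEY, M_DSP_PASSKEY, M_REQ_OOB };

static enum pair_method select_sc_method(int local_oob, int remote_oob,
					 int local_mitm, int remote_mitm,
					 enum pair_method table_method)
{
	if (local_oob || remote_oob)
		return M_REQ_OOB;	/* OOB overrides the IO capability table */
	if (!local_mitm && !remote_mitm)
		return M_JUST_WORKS;	/* neither side asked for MITM protection */
	return table_method;		/* IO capability table lookup result */
}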
*/ if (local_mitm || remote_mitm) method = get_auth_method(smp, local_io, remote_io); else method = JUST_WORKS; /* Don't confirm locally initiated pairing attempts */ if (method == JUST_CFM && test_bit(SMP_FLAG_INITIATOR, &smp->flags)) method = JUST_WORKS; return method; } static int smp_cmd_public_key(struct l2cap_conn *conn, struct sk_buff *skb) { struct smp_cmd_public_key *key = (void *) skb->data; struct hci_conn *hcon = conn->hcon; struct l2cap_chan *chan = conn->smp; struct smp_chan *smp = chan->data; struct hci_dev *hdev = hcon->hdev; struct crypto_kpp *tfm_ecdh; struct smp_cmd_pairing_confirm cfm; int err; bt_dev_dbg(hdev, "conn %p", conn); if (skb->len < sizeof(*key)) return SMP_INVALID_PARAMS; /* Check if remote and local public keys are the same and debug key is * not in use. */ if (!test_bit(SMP_FLAG_DEBUG_KEY, &smp->flags) && !crypto_memneq(key, smp->local_pk, 64)) { bt_dev_err(hdev, "Remote and local public keys are identical"); return SMP_UNSPECIFIED; } memcpy(smp->remote_pk, key, 64); if (test_bit(SMP_FLAG_REMOTE_OOB, &smp->flags)) { err = smp_f4(smp->tfm_cmac, smp->remote_pk, smp->remote_pk, smp->rr, 0, cfm.confirm_val); if (err) return SMP_UNSPECIFIED; if (crypto_memneq(cfm.confirm_val, smp->pcnf, 16)) return SMP_CONFIRM_FAILED; } /* Non-initiating device sends its public key after receiving * the key from the initiating device. */ if (!test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { err = sc_send_public_key(smp); if (err) return err; } SMP_DBG("Remote Public Key X: %32phN", smp->remote_pk); SMP_DBG("Remote Public Key Y: %32phN", smp->remote_pk + 32); /* Compute the shared secret on the same crypto tfm on which the private * key was set/generated. */ if (test_bit(SMP_FLAG_LOCAL_OOB, &smp->flags)) { struct l2cap_chan *hchan = hdev->smp_data; struct smp_dev *smp_dev; if (!hchan || !hchan->data) return SMP_UNSPECIFIED; smp_dev = hchan->data; tfm_ecdh = smp_dev->tfm_ecdh; } else { tfm_ecdh = smp->tfm_ecdh; } if (compute_ecdh_secret(tfm_ecdh, smp->remote_pk, smp->dhkey)) return SMP_UNSPECIFIED; SMP_DBG("DHKey %32phN", smp->dhkey); set_bit(SMP_FLAG_REMOTE_PK, &smp->flags); smp->method = sc_select_method(smp); bt_dev_dbg(hdev, "selected method 0x%02x", smp->method); /* JUST_WORKS and JUST_CFM result in an unauthenticated key */ if (smp->method == JUST_WORKS || smp->method == JUST_CFM) hcon->pending_sec_level = BT_SECURITY_MEDIUM; else hcon->pending_sec_level = BT_SECURITY_FIPS; if (!crypto_memneq(debug_pk, smp->remote_pk, 64)) set_bit(SMP_FLAG_DEBUG_KEY, &smp->flags); if (smp->method == DSP_PASSKEY) { get_random_bytes(&hcon->passkey_notify, sizeof(hcon->passkey_notify)); hcon->passkey_notify %= 1000000; hcon->passkey_entered = 0; smp->passkey_round = 0; if (mgmt_user_passkey_notify(hdev, &hcon->dst, hcon->type, hcon->dst_type, hcon->passkey_notify, hcon->passkey_entered)) return SMP_UNSPECIFIED; SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_CONFIRM); return sc_passkey_round(smp, SMP_CMD_PUBLIC_KEY); } if (smp->method == REQ_OOB) { if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM, sizeof(smp->prnd), smp->prnd); SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_RANDOM); return 0; } if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_CONFIRM); if (smp->method == REQ_PASSKEY) { if (mgmt_user_passkey_request(hdev, &hcon->dst, hcon->type, hcon->dst_type)) return SMP_UNSPECIFIED; SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_CONFIRM); set_bit(SMP_FLAG_WAIT_USER, &smp->flags); return 0; } /* The Initiating device waits for the non-initiating device to * 
send the confirm value. */ if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) return 0; err = smp_f4(smp->tfm_cmac, smp->local_pk, smp->remote_pk, smp->prnd, 0, cfm.confirm_val); if (err) return SMP_UNSPECIFIED; smp_send_cmd(conn, SMP_CMD_PAIRING_CONFIRM, sizeof(cfm), &cfm); SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_RANDOM); return 0; } static int smp_cmd_dhkey_check(struct l2cap_conn *conn, struct sk_buff *skb) { struct smp_cmd_dhkey_check *check = (void *) skb->data; struct l2cap_chan *chan = conn->smp; struct hci_conn *hcon = conn->hcon; struct smp_chan *smp = chan->data; u8 a[7], b[7], *local_addr, *remote_addr; u8 io_cap[3], r[16], e[16]; int err; bt_dev_dbg(hcon->hdev, "conn %p", conn); if (skb->len < sizeof(*check)) return SMP_INVALID_PARAMS; memcpy(a, &hcon->init_addr, 6); memcpy(b, &hcon->resp_addr, 6); a[6] = hcon->init_addr_type; b[6] = hcon->resp_addr_type; if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { local_addr = a; remote_addr = b; memcpy(io_cap, &smp->prsp[1], 3); } else { local_addr = b; remote_addr = a; memcpy(io_cap, &smp->preq[1], 3); } memset(r, 0, sizeof(r)); if (smp->method == REQ_PASSKEY || smp->method == DSP_PASSKEY) put_unaligned_le32(hcon->passkey_notify, r); else if (smp->method == REQ_OOB) memcpy(r, smp->lr, 16); err = smp_f6(smp->tfm_cmac, smp->mackey, smp->rrnd, smp->prnd, r, io_cap, remote_addr, local_addr, e); if (err) return SMP_UNSPECIFIED; if (crypto_memneq(check->e, e, 16)) return SMP_DHKEY_CHECK_FAILED; if (!test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { if (test_bit(SMP_FLAG_WAIT_USER, &smp->flags)) { set_bit(SMP_FLAG_DHKEY_PENDING, &smp->flags); return 0; } /* Responder sends DHKey check as response to initiator */ sc_dhkey_check(smp); } sc_add_ltk(smp); if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { hci_le_start_enc(hcon, 0, 0, smp->tk, smp->enc_key_size); hcon->enc_key_size = smp->enc_key_size; } return 0; } static int smp_cmd_keypress_notify(struct l2cap_conn *conn, struct sk_buff *skb) { struct smp_cmd_keypress_notify *kp = (void *) skb->data; bt_dev_dbg(conn->hcon->hdev, "value 0x%02x", kp->value); return 0; } static int smp_sig_channel(struct l2cap_chan *chan, struct sk_buff *skb) { struct l2cap_conn *conn = chan->conn; struct hci_conn *hcon = conn->hcon; struct smp_chan *smp; __u8 code, reason; int err = 0; if (skb->len < 1) return -EILSEQ; if (!hci_dev_test_flag(hcon->hdev, HCI_LE_ENABLED)) { reason = SMP_PAIRING_NOTSUPP; goto done; } code = skb->data[0]; skb_pull(skb, sizeof(code)); smp = chan->data; if (code > SMP_CMD_MAX) goto drop; if (smp && !test_and_clear_bit(code, &smp->allow_cmd)) goto drop; /* If we don't have a context the only allowed commands are * pairing request and security request. 
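A sketch of the 56-bit address values built in sc_mackey_and_ltk(), sc_dhkey_check() and smp_cmd_dhkey_check() above: each A/B input to f5()/f6() is the 6-byte device address followed by one byte of address type. The struct below is a stand-in for bdaddr_t.

#include <stdint.h>
#include <string.h>

struct bdaddr { uint8_t b[6]; };	/* stand-in for bdaddr_t (little-endian) */

static void make_sc_addr(uint8_t out[7], const struct bdaddr *addr, uint8_t type)
{
	memcpy(out, addr->b, 6);	/* 48-bit device address */
	out[6] = type;			/* 0x00 = public, 0x01 = random */
}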
*/ if (!smp && code != SMP_CMD_PAIRING_REQ && code != SMP_CMD_SECURITY_REQ) goto drop; switch (code) { case SMP_CMD_PAIRING_REQ: reason = smp_cmd_pairing_req(conn, skb); break; case SMP_CMD_PAIRING_FAIL: smp_failure(conn, 0); err = -EPERM; break; case SMP_CMD_PAIRING_RSP: reason = smp_cmd_pairing_rsp(conn, skb); break; case SMP_CMD_SECURITY_REQ: reason = smp_cmd_security_req(conn, skb); break; case SMP_CMD_PAIRING_CONFIRM: reason = smp_cmd_pairing_confirm(conn, skb); break; case SMP_CMD_PAIRING_RANDOM: reason = smp_cmd_pairing_random(conn, skb); break; case SMP_CMD_ENCRYPT_INFO: reason = smp_cmd_encrypt_info(conn, skb); break; case SMP_CMD_INITIATOR_IDENT: reason = smp_cmd_initiator_ident(conn, skb); break; case SMP_CMD_IDENT_INFO: reason = smp_cmd_ident_info(conn, skb); break; case SMP_CMD_IDENT_ADDR_INFO: reason = smp_cmd_ident_addr_info(conn, skb); break; case SMP_CMD_SIGN_INFO: reason = smp_cmd_sign_info(conn, skb); break; case SMP_CMD_PUBLIC_KEY: reason = smp_cmd_public_key(conn, skb); break; case SMP_CMD_DHKEY_CHECK: reason = smp_cmd_dhkey_check(conn, skb); break; case SMP_CMD_KEYPRESS_NOTIFY: reason = smp_cmd_keypress_notify(conn, skb); break; default: bt_dev_dbg(hcon->hdev, "Unknown command code 0x%2.2x", code); reason = SMP_CMD_NOTSUPP; goto done; } done: if (!err) { if (reason) smp_failure(conn, reason); kfree_skb(skb); } return err; drop: bt_dev_err(hcon->hdev, "unexpected SMP command 0x%02x from %pMR", code, &hcon->dst); kfree_skb(skb); return 0; } static void smp_teardown_cb(struct l2cap_chan *chan, int err) { struct l2cap_conn *conn = chan->conn; bt_dev_dbg(conn->hcon->hdev, "chan %p", chan); if (chan->data) smp_chan_destroy(conn); conn->smp = NULL; l2cap_chan_put(chan); } static void bredr_pairing(struct l2cap_chan *chan) { struct l2cap_conn *conn = chan->conn; struct hci_conn *hcon = conn->hcon; struct hci_dev *hdev = hcon->hdev; struct smp_chan *smp; bt_dev_dbg(hdev, "chan %p", chan); /* Only new pairings are interesting */ if (!test_bit(HCI_CONN_NEW_LINK_KEY, &hcon->flags)) return; /* Don't bother if we're not encrypted */ if (!test_bit(HCI_CONN_ENCRYPT, &hcon->flags)) return; /* Only initiator may initiate SMP over BR/EDR */ if (hcon->role != HCI_ROLE_MASTER) return; /* Secure Connections support must be enabled */ if (!hci_dev_test_flag(hdev, HCI_SC_ENABLED)) return; /* BR/EDR must use Secure Connections for SMP */ if (!test_bit(HCI_CONN_AES_CCM, &hcon->flags) && !hci_dev_test_flag(hdev, HCI_FORCE_BREDR_SMP)) return; /* If our LE support is not enabled don't do anything */ if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED)) return; /* Don't bother if remote LE support is not enabled */ if (!lmp_host_le_capable(hcon)) return; /* Remote must support SMP fixed chan for BR/EDR */ if (!(conn->remote_fixed_chan & L2CAP_FC_SMP_BREDR)) return; /* Don't bother if SMP is already ongoing */ if (chan->data) return; smp = smp_chan_create(conn); if (!smp) { bt_dev_err(hdev, "unable to create SMP context for BR/EDR"); return; } set_bit(SMP_FLAG_SC, &smp->flags); bt_dev_dbg(hdev, "starting SMP over BR/EDR"); smp_send_pairing_req(smp, 0x00); } static void smp_resume_cb(struct l2cap_chan *chan) { struct smp_chan *smp = chan->data; struct l2cap_conn *conn = chan->conn; struct hci_conn *hcon = conn->hcon; bt_dev_dbg(hcon->hdev, "chan %p", chan); if (hcon->type == ACL_LINK) { bredr_pairing(chan); return; } if (!smp) return; if (!test_bit(HCI_CONN_ENCRYPT, &hcon->flags)) return; cancel_delayed_work(&smp->security_timer); smp_distribute_keys(smp); } static void smp_ready_cb(struct l2cap_chan *chan) 
{ struct l2cap_conn *conn = chan->conn; struct hci_conn *hcon = conn->hcon; bt_dev_dbg(hcon->hdev, "chan %p", chan); /* No need to call l2cap_chan_hold() here since we already own * the reference taken in smp_new_conn_cb(). This is just the * first time that we tie it to a specific pointer. The code in * l2cap_core.c ensures that there's no risk this function wont * get called if smp_new_conn_cb was previously called. */ conn->smp = chan; if (hcon->type == ACL_LINK && test_bit(HCI_CONN_ENCRYPT, &hcon->flags)) bredr_pairing(chan); } static int smp_recv_cb(struct l2cap_chan *chan, struct sk_buff *skb) { int err; bt_dev_dbg(chan->conn->hcon->hdev, "chan %p", chan); err = smp_sig_channel(chan, skb); if (err) { struct smp_chan *smp = chan->data; if (smp) cancel_delayed_work_sync(&smp->security_timer); hci_disconnect(chan->conn->hcon, HCI_ERROR_AUTH_FAILURE); } return err; } static struct sk_buff *smp_alloc_skb_cb(struct l2cap_chan *chan, unsigned long hdr_len, unsigned long len, int nb) { struct sk_buff *skb; skb = bt_skb_alloc(hdr_len + len, GFP_KERNEL); if (!skb) return ERR_PTR(-ENOMEM); skb->priority = HCI_PRIO_MAX; bt_cb(skb)->l2cap.chan = chan; return skb; } static const struct l2cap_ops smp_chan_ops = { .name = "Security Manager", .ready = smp_ready_cb, .recv = smp_recv_cb, .alloc_skb = smp_alloc_skb_cb, .teardown = smp_teardown_cb, .resume = smp_resume_cb, .new_connection = l2cap_chan_no_new_connection, .state_change = l2cap_chan_no_state_change, .close = l2cap_chan_no_close, .defer = l2cap_chan_no_defer, .suspend = l2cap_chan_no_suspend, .set_shutdown = l2cap_chan_no_set_shutdown, .get_sndtimeo = l2cap_chan_no_get_sndtimeo, }; static inline struct l2cap_chan *smp_new_conn_cb(struct l2cap_chan *pchan) { struct l2cap_chan *chan; BT_DBG("pchan %p", pchan); chan = l2cap_chan_create(); if (!chan) return NULL; chan->chan_type = pchan->chan_type; chan->ops = &smp_chan_ops; chan->scid = pchan->scid; chan->dcid = chan->scid; chan->imtu = pchan->imtu; chan->omtu = pchan->omtu; chan->mode = pchan->mode; /* Other L2CAP channels may request SMP routines in order to * change the security level. This means that the SMP channel * lock must be considered in its own category to avoid lockdep * warnings. 
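A sketch of the opcode gating that smp_sig_channel() above relies on: SMP_ALLOW_CMD() arms exactly the opcodes that may legally arrive next as bits in a mask, and each received PDU consumes its bit with a test-and-clear, so repeated or out-of-order PDUs are dropped instead of processed. The helper names below are illustrative.

#include <stdbool.h>

static void allow_cmd(unsigned long *mask, unsigned int opcode)
{
	*mask |= 1UL << opcode;			/* arm exactly this opcode */
}

static bool take_cmd(unsigned long *mask, unsigned int opcode)
{
	unsigned long bit = 1UL << opcode;

	if (!(*mask & bit))
		return false;			/* unexpected PDU: caller drops it */
	*mask &= ~bit;				/* one-shot: must be re-armed explicitly */
	return true;
}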
*/ atomic_set(&chan->nesting, L2CAP_NESTING_SMP); BT_DBG("created chan %p", chan); return chan; } static const struct l2cap_ops smp_root_chan_ops = { .name = "Security Manager Root", .new_connection = smp_new_conn_cb, /* None of these are implemented for the root channel */ .close = l2cap_chan_no_close, .alloc_skb = l2cap_chan_no_alloc_skb, .recv = l2cap_chan_no_recv, .state_change = l2cap_chan_no_state_change, .teardown = l2cap_chan_no_teardown, .ready = l2cap_chan_no_ready, .defer = l2cap_chan_no_defer, .suspend = l2cap_chan_no_suspend, .resume = l2cap_chan_no_resume, .set_shutdown = l2cap_chan_no_set_shutdown, .get_sndtimeo = l2cap_chan_no_get_sndtimeo, }; static struct l2cap_chan *smp_add_cid(struct hci_dev *hdev, u16 cid) { struct l2cap_chan *chan; struct smp_dev *smp; struct crypto_shash *tfm_cmac; struct crypto_kpp *tfm_ecdh; if (cid == L2CAP_CID_SMP_BREDR) { smp = NULL; goto create_chan; } smp = kzalloc(sizeof(*smp), GFP_KERNEL); if (!smp) return ERR_PTR(-ENOMEM); tfm_cmac = crypto_alloc_shash("cmac(aes)", 0, 0); if (IS_ERR(tfm_cmac)) { bt_dev_err(hdev, "Unable to create CMAC crypto context"); kfree_sensitive(smp); return ERR_CAST(tfm_cmac); } tfm_ecdh = crypto_alloc_kpp("ecdh-nist-p256", 0, 0); if (IS_ERR(tfm_ecdh)) { bt_dev_err(hdev, "Unable to create ECDH crypto context"); crypto_free_shash(tfm_cmac); kfree_sensitive(smp); return ERR_CAST(tfm_ecdh); } smp->local_oob = false; smp->tfm_cmac = tfm_cmac; smp->tfm_ecdh = tfm_ecdh; create_chan: chan = l2cap_chan_create(); if (!chan) { if (smp) { crypto_free_shash(smp->tfm_cmac); crypto_free_kpp(smp->tfm_ecdh); kfree_sensitive(smp); } return ERR_PTR(-ENOMEM); } chan->data = smp; l2cap_add_scid(chan, cid); l2cap_chan_set_defaults(chan); if (cid == L2CAP_CID_SMP) { u8 bdaddr_type; hci_copy_identity_address(hdev, &chan->src, &bdaddr_type); if (bdaddr_type == ADDR_LE_DEV_PUBLIC) chan->src_type = BDADDR_LE_PUBLIC; else chan->src_type = BDADDR_LE_RANDOM; } else { bacpy(&chan->src, &hdev->bdaddr); chan->src_type = BDADDR_BREDR; } chan->state = BT_LISTEN; chan->mode = L2CAP_MODE_BASIC; chan->imtu = L2CAP_DEFAULT_MTU; chan->ops = &smp_root_chan_ops; /* Set correct nesting level for a parent/listening channel */ atomic_set(&chan->nesting, L2CAP_NESTING_PARENT); return chan; } static void smp_del_chan(struct l2cap_chan *chan) { struct smp_dev *smp; BT_DBG("chan %p", chan); smp = chan->data; if (smp) { chan->data = NULL; crypto_free_shash(smp->tfm_cmac); crypto_free_kpp(smp->tfm_ecdh); kfree_sensitive(smp); } l2cap_chan_put(chan); } int smp_force_bredr(struct hci_dev *hdev, bool enable) { if (enable == hci_dev_test_flag(hdev, HCI_FORCE_BREDR_SMP)) return -EALREADY; if (enable) { struct l2cap_chan *chan; chan = smp_add_cid(hdev, L2CAP_CID_SMP_BREDR); if (IS_ERR(chan)) return PTR_ERR(chan); hdev->smp_bredr_data = chan; } else { struct l2cap_chan *chan; chan = hdev->smp_bredr_data; hdev->smp_bredr_data = NULL; smp_del_chan(chan); } hci_dev_change_flag(hdev, HCI_FORCE_BREDR_SMP); return 0; } int smp_register(struct hci_dev *hdev) { struct l2cap_chan *chan; bt_dev_dbg(hdev, ""); /* If the controller does not support Low Energy operation, then * there is also no need to register any SMP channel. 
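A sketch of what the ah() self-test further below exercises in practice: resolving a Resolvable Private Address. Assuming bdaddr_t byte order, the lower three bytes carry the hash and the upper three the prand, so recomputing ah(IRK, prand) and comparing it with the stored hash identifies the peer. ah_fn stands in for the kernel's smp_ah(), and a real implementation would use a constant-time comparison rather than memcmp().

#include <stdbool.h>
#include <string.h>

typedef int (*ah_fn)(const unsigned char irk[16], const unsigned char prand[3],
		     unsigned char hash_out[3]);

static bool rpa_matches_irk(const unsigned char rpa[6],
			    const unsigned char irk[16], ah_fn ah)
{
	unsigned char hash[3];

	if (ah(irk, &rpa[3], hash))		/* prand assumed in bytes 3..5 */
		return false;
	return memcmp(hash, rpa, 3) == 0;	/* hash assumed in bytes 0..2 */
}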
*/ if (!lmp_le_capable(hdev)) return 0; if (WARN_ON(hdev->smp_data)) { chan = hdev->smp_data; hdev->smp_data = NULL; smp_del_chan(chan); } chan = smp_add_cid(hdev, L2CAP_CID_SMP); if (IS_ERR(chan)) return PTR_ERR(chan); hdev->smp_data = chan; if (!lmp_sc_capable(hdev)) { /* Flag can be already set here (due to power toggle) */ if (!hci_dev_test_flag(hdev, HCI_FORCE_BREDR_SMP)) return 0; } if (WARN_ON(hdev->smp_bredr_data)) { chan = hdev->smp_bredr_data; hdev->smp_bredr_data = NULL; smp_del_chan(chan); } chan = smp_add_cid(hdev, L2CAP_CID_SMP_BREDR); if (IS_ERR(chan)) { int err = PTR_ERR(chan); chan = hdev->smp_data; hdev->smp_data = NULL; smp_del_chan(chan); return err; } hdev->smp_bredr_data = chan; return 0; } void smp_unregister(struct hci_dev *hdev) { struct l2cap_chan *chan; if (hdev->smp_bredr_data) { chan = hdev->smp_bredr_data; hdev->smp_bredr_data = NULL; smp_del_chan(chan); } if (hdev->smp_data) { chan = hdev->smp_data; hdev->smp_data = NULL; smp_del_chan(chan); } } #if IS_ENABLED(CONFIG_BT_SELFTEST_SMP) static int __init test_debug_key(struct crypto_kpp *tfm_ecdh) { u8 pk[64]; int err; err = set_ecdh_privkey(tfm_ecdh, debug_sk); if (err) return err; err = generate_ecdh_public_key(tfm_ecdh, pk); if (err) return err; if (crypto_memneq(pk, debug_pk, 64)) return -EINVAL; return 0; } static int __init test_ah(void) { const u8 irk[16] = { 0x9b, 0x7d, 0x39, 0x0a, 0xa6, 0x10, 0x10, 0x34, 0x05, 0xad, 0xc8, 0x57, 0xa3, 0x34, 0x02, 0xec }; const u8 r[3] = { 0x94, 0x81, 0x70 }; const u8 exp[3] = { 0xaa, 0xfb, 0x0d }; u8 res[3]; int err; err = smp_ah(irk, r, res); if (err) return err; if (crypto_memneq(res, exp, 3)) return -EINVAL; return 0; } static int __init test_c1(void) { const u8 k[16] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; const u8 r[16] = { 0xe0, 0x2e, 0x70, 0xc6, 0x4e, 0x27, 0x88, 0x63, 0x0e, 0x6f, 0xad, 0x56, 0x21, 0xd5, 0x83, 0x57 }; const u8 preq[7] = { 0x01, 0x01, 0x00, 0x00, 0x10, 0x07, 0x07 }; const u8 pres[7] = { 0x02, 0x03, 0x00, 0x00, 0x08, 0x00, 0x05 }; const u8 _iat = 0x01; const u8 _rat = 0x00; const bdaddr_t ra = { { 0xb6, 0xb5, 0xb4, 0xb3, 0xb2, 0xb1 } }; const bdaddr_t ia = { { 0xa6, 0xa5, 0xa4, 0xa3, 0xa2, 0xa1 } }; const u8 exp[16] = { 0x86, 0x3b, 0xf1, 0xbe, 0xc5, 0x4d, 0xa7, 0xd2, 0xea, 0x88, 0x89, 0x87, 0xef, 0x3f, 0x1e, 0x1e }; u8 res[16]; int err; err = smp_c1(k, r, preq, pres, _iat, &ia, _rat, &ra, res); if (err) return err; if (crypto_memneq(res, exp, 16)) return -EINVAL; return 0; } static int __init test_s1(void) { const u8 k[16] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; const u8 r1[16] = { 0x88, 0x77, 0x66, 0x55, 0x44, 0x33, 0x22, 0x11 }; const u8 r2[16] = { 0x00, 0xff, 0xee, 0xdd, 0xcc, 0xbb, 0xaa, 0x99 }; const u8 exp[16] = { 0x62, 0xa0, 0x6d, 0x79, 0xae, 0x16, 0x42, 0x5b, 0x9b, 0xf4, 0xb0, 0xe8, 0xf0, 0xe1, 0x1f, 0x9a }; u8 res[16]; int err; err = smp_s1(k, r1, r2, res); if (err) return err; if (crypto_memneq(res, exp, 16)) return -EINVAL; return 0; } static int __init test_f4(struct crypto_shash *tfm_cmac) { const u8 u[32] = { 0xe6, 0x9d, 0x35, 0x0e, 0x48, 0x01, 0x03, 0xcc, 0xdb, 0xfd, 0xf4, 0xac, 0x11, 0x91, 0xf4, 0xef, 0xb9, 0xa5, 0xf9, 0xe9, 0xa7, 0x83, 0x2c, 0x5e, 0x2c, 0xbe, 0x97, 0xf2, 0xd2, 0x03, 0xb0, 0x20 }; const u8 v[32] = { 0xfd, 0xc5, 0x7f, 0xf4, 0x49, 0xdd, 0x4f, 0x6b, 0xfb, 0x7c, 0x9d, 0xf1, 0xc2, 0x9a, 0xcb, 0x59, 0x2a, 0xe7, 0xd4, 0xee, 0xfb, 0xfc, 0x0a, 0x90, 0x9a, 0xbb, 0xf6, 0x32, 0x3d, 0x8b, 0x18, 0x55 }; const 
u8 x[16] = { 0xab, 0xae, 0x2b, 0x71, 0xec, 0xb2, 0xff, 0xff, 0x3e, 0x73, 0x77, 0xd1, 0x54, 0x84, 0xcb, 0xd5 }; const u8 z = 0x00; const u8 exp[16] = { 0x2d, 0x87, 0x74, 0xa9, 0xbe, 0xa1, 0xed, 0xf1, 0x1c, 0xbd, 0xa9, 0x07, 0xf1, 0x16, 0xc9, 0xf2 }; u8 res[16]; int err; err = smp_f4(tfm_cmac, u, v, x, z, res); if (err) return err; if (crypto_memneq(res, exp, 16)) return -EINVAL; return 0; } static int __init test_f5(struct crypto_shash *tfm_cmac) { const u8 w[32] = { 0x98, 0xa6, 0xbf, 0x73, 0xf3, 0x34, 0x8d, 0x86, 0xf1, 0x66, 0xf8, 0xb4, 0x13, 0x6b, 0x79, 0x99, 0x9b, 0x7d, 0x39, 0x0a, 0xa6, 0x10, 0x10, 0x34, 0x05, 0xad, 0xc8, 0x57, 0xa3, 0x34, 0x02, 0xec }; const u8 n1[16] = { 0xab, 0xae, 0x2b, 0x71, 0xec, 0xb2, 0xff, 0xff, 0x3e, 0x73, 0x77, 0xd1, 0x54, 0x84, 0xcb, 0xd5 }; const u8 n2[16] = { 0xcf, 0xc4, 0x3d, 0xff, 0xf7, 0x83, 0x65, 0x21, 0x6e, 0x5f, 0xa7, 0x25, 0xcc, 0xe7, 0xe8, 0xa6 }; const u8 a1[7] = { 0xce, 0xbf, 0x37, 0x37, 0x12, 0x56, 0x00 }; const u8 a2[7] = { 0xc1, 0xcf, 0x2d, 0x70, 0x13, 0xa7, 0x00 }; const u8 exp_ltk[16] = { 0x38, 0x0a, 0x75, 0x94, 0xb5, 0x22, 0x05, 0x98, 0x23, 0xcd, 0xd7, 0x69, 0x11, 0x79, 0x86, 0x69 }; const u8 exp_mackey[16] = { 0x20, 0x6e, 0x63, 0xce, 0x20, 0x6a, 0x3f, 0xfd, 0x02, 0x4a, 0x08, 0xa1, 0x76, 0xf1, 0x65, 0x29 }; u8 mackey[16], ltk[16]; int err; err = smp_f5(tfm_cmac, w, n1, n2, a1, a2, mackey, ltk); if (err) return err; if (crypto_memneq(mackey, exp_mackey, 16)) return -EINVAL; if (crypto_memneq(ltk, exp_ltk, 16)) return -EINVAL; return 0; } static int __init test_f6(struct crypto_shash *tfm_cmac) { const u8 w[16] = { 0x20, 0x6e, 0x63, 0xce, 0x20, 0x6a, 0x3f, 0xfd, 0x02, 0x4a, 0x08, 0xa1, 0x76, 0xf1, 0x65, 0x29 }; const u8 n1[16] = { 0xab, 0xae, 0x2b, 0x71, 0xec, 0xb2, 0xff, 0xff, 0x3e, 0x73, 0x77, 0xd1, 0x54, 0x84, 0xcb, 0xd5 }; const u8 n2[16] = { 0xcf, 0xc4, 0x3d, 0xff, 0xf7, 0x83, 0x65, 0x21, 0x6e, 0x5f, 0xa7, 0x25, 0xcc, 0xe7, 0xe8, 0xa6 }; const u8 r[16] = { 0xc8, 0x0f, 0x2d, 0x0c, 0xd2, 0x42, 0xda, 0x08, 0x54, 0xbb, 0x53, 0xb4, 0x3b, 0x34, 0xa3, 0x12 }; const u8 io_cap[3] = { 0x02, 0x01, 0x01 }; const u8 a1[7] = { 0xce, 0xbf, 0x37, 0x37, 0x12, 0x56, 0x00 }; const u8 a2[7] = { 0xc1, 0xcf, 0x2d, 0x70, 0x13, 0xa7, 0x00 }; const u8 exp[16] = { 0x61, 0x8f, 0x95, 0xda, 0x09, 0x0b, 0x6c, 0xd2, 0xc5, 0xe8, 0xd0, 0x9c, 0x98, 0x73, 0xc4, 0xe3 }; u8 res[16]; int err; err = smp_f6(tfm_cmac, w, n1, n2, r, io_cap, a1, a2, res); if (err) return err; if (crypto_memneq(res, exp, 16)) return -EINVAL; return 0; } static int __init test_g2(struct crypto_shash *tfm_cmac) { const u8 u[32] = { 0xe6, 0x9d, 0x35, 0x0e, 0x48, 0x01, 0x03, 0xcc, 0xdb, 0xfd, 0xf4, 0xac, 0x11, 0x91, 0xf4, 0xef, 0xb9, 0xa5, 0xf9, 0xe9, 0xa7, 0x83, 0x2c, 0x5e, 0x2c, 0xbe, 0x97, 0xf2, 0xd2, 0x03, 0xb0, 0x20 }; const u8 v[32] = { 0xfd, 0xc5, 0x7f, 0xf4, 0x49, 0xdd, 0x4f, 0x6b, 0xfb, 0x7c, 0x9d, 0xf1, 0xc2, 0x9a, 0xcb, 0x59, 0x2a, 0xe7, 0xd4, 0xee, 0xfb, 0xfc, 0x0a, 0x90, 0x9a, 0xbb, 0xf6, 0x32, 0x3d, 0x8b, 0x18, 0x55 }; const u8 x[16] = { 0xab, 0xae, 0x2b, 0x71, 0xec, 0xb2, 0xff, 0xff, 0x3e, 0x73, 0x77, 0xd1, 0x54, 0x84, 0xcb, 0xd5 }; const u8 y[16] = { 0xcf, 0xc4, 0x3d, 0xff, 0xf7, 0x83, 0x65, 0x21, 0x6e, 0x5f, 0xa7, 0x25, 0xcc, 0xe7, 0xe8, 0xa6 }; const u32 exp_val = 0x2f9ed5ba % 1000000; u32 val; int err; err = smp_g2(tfm_cmac, u, v, x, y, &val); if (err) return err; if (val != exp_val) return -EINVAL; return 0; } static int __init test_h6(struct crypto_shash *tfm_cmac) { const u8 w[16] = { 0x9b, 0x7d, 0x39, 0x0a, 0xa6, 0x10, 0x10, 0x34, 0x05, 0xad, 0xc8, 0x57, 0xa3, 0x34, 0x02, 0xec }; 
const u8 key_id[4] = { 0x72, 0x62, 0x65, 0x6c }; const u8 exp[16] = { 0x99, 0x63, 0xb1, 0x80, 0xe2, 0xa9, 0xd3, 0xe8, 0x1c, 0xc9, 0x6d, 0xe7, 0x02, 0xe1, 0x9a, 0x2d }; u8 res[16]; int err; err = smp_h6(tfm_cmac, w, key_id, res); if (err) return err; if (crypto_memneq(res, exp, 16)) return -EINVAL; return 0; } static char test_smp_buffer[32]; static ssize_t test_smp_read(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { return simple_read_from_buffer(user_buf, count, ppos, test_smp_buffer, strlen(test_smp_buffer)); } static const struct file_operations test_smp_fops = { .open = simple_open, .read = test_smp_read, .llseek = default_llseek, }; static int __init run_selftests(struct crypto_shash *tfm_cmac, struct crypto_kpp *tfm_ecdh) { ktime_t calltime, delta, rettime; unsigned long long duration; int err; calltime = ktime_get(); err = test_debug_key(tfm_ecdh); if (err) { BT_ERR("debug_key test failed"); goto done; } err = test_ah(); if (err) { BT_ERR("smp_ah test failed"); goto done; } err = test_c1(); if (err) { BT_ERR("smp_c1 test failed"); goto done; } err = test_s1(); if (err) { BT_ERR("smp_s1 test failed"); goto done; } err = test_f4(tfm_cmac); if (err) { BT_ERR("smp_f4 test failed"); goto done; } err = test_f5(tfm_cmac); if (err) { BT_ERR("smp_f5 test failed"); goto done; } err = test_f6(tfm_cmac); if (err) { BT_ERR("smp_f6 test failed"); goto done; } err = test_g2(tfm_cmac); if (err) { BT_ERR("smp_g2 test failed"); goto done; } err = test_h6(tfm_cmac); if (err) { BT_ERR("smp_h6 test failed"); goto done; } rettime = ktime_get(); delta = ktime_sub(rettime, calltime); duration = (unsigned long long) ktime_to_ns(delta) >> 10; BT_INFO("SMP test passed in %llu usecs", duration); done: if (!err) snprintf(test_smp_buffer, sizeof(test_smp_buffer), "PASS (%llu usecs)\n", duration); else snprintf(test_smp_buffer, sizeof(test_smp_buffer), "FAIL\n"); debugfs_create_file("selftest_smp", 0444, bt_debugfs, NULL, &test_smp_fops); return err; } int __init bt_selftest_smp(void) { struct crypto_shash *tfm_cmac; struct crypto_kpp *tfm_ecdh; int err; tfm_cmac = crypto_alloc_shash("cmac(aes)", 0, 0); if (IS_ERR(tfm_cmac)) { BT_ERR("Unable to create CMAC crypto context"); return PTR_ERR(tfm_cmac); } tfm_ecdh = crypto_alloc_kpp("ecdh-nist-p256", 0, 0); if (IS_ERR(tfm_ecdh)) { BT_ERR("Unable to create ECDH crypto context"); crypto_free_shash(tfm_cmac); return PTR_ERR(tfm_ecdh); } err = run_selftests(tfm_cmac, tfm_ecdh); crypto_free_shash(tfm_cmac); crypto_free_kpp(tfm_ecdh); return err; } #endif
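As a standalone illustration (not part of smp.c): the 32-bit value produced by smp_g2() is reduced modulo 10^6 to obtain the 6-digit numeric comparison value shown to the user, which is why test_g2() above checks against 0x2f9ed5ba % 1000000 (i.e. 938554). The helper name demo_numeric_comparison_value() is made up for this sketch.

#include <linux/types.h>

/* Reduce a g2() output to the 6 least-significant decimal digits. */
static u32 demo_numeric_comparison_value(u32 g2_out)
{
	return g2_out % 1000000;
}

/* demo_numeric_comparison_value(0x2f9ed5ba) == 938554 */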
// SPDX-License-Identifier: GPL-2.0-or-later /* * lib/plist.c * * Descending-priority-sorted double-linked list * * (C) 2002-2003 Intel Corp * Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com>. * * 2001-2005 (c) MontaVista Software, Inc. * Daniel Walker <dwalker@mvista.com> * * (C) 2005 Thomas Gleixner <tglx@linutronix.de> * * Simplifications of the original code by * Oleg Nesterov <oleg@tv-sign.ru> * * Based on simple lists (include/linux/list.h). * * This file contains the add / del functions which are considered to * be too large to inline. See include/linux/plist.h for further * information. */ #include <linux/bug.h> #include <linux/plist.h> #ifdef CONFIG_DEBUG_PLIST static struct plist_head test_head; static void plist_check_prev_next(struct list_head *t, struct list_head *p, struct list_head *n) { WARN(n->prev != p || p->next != n, "top: %p, n: %p, p: %p\n" "prev: %p, n: %p, p: %p\n" "next: %p, n: %p, p: %p\n", t, t->next, t->prev, p, p->next, p->prev, n, n->next, n->prev); } static void plist_check_list(struct list_head *top) { struct list_head *prev = top, *next = top->next; plist_check_prev_next(top, prev, next); while (next != top) { WRITE_ONCE(prev, next); WRITE_ONCE(next, prev->next); plist_check_prev_next(top, prev, next); } } static void plist_check_head(struct plist_head *head) { if (!plist_head_empty(head)) plist_check_list(&plist_first(head)->prio_list); plist_check_list(&head->node_list); } #else # define plist_check_head(h) do { } while (0) #endif /** * plist_add - add @node to @head * * @node: &struct plist_node pointer * @head: &struct plist_head pointer */ void plist_add(struct plist_node *node, struct plist_head *head) { struct plist_node *first, *iter, *prev = NULL, *last, *reverse_iter; struct list_head *node_next = &head->node_list; plist_check_head(head); WARN_ON(!plist_node_empty(node)); WARN_ON(!list_empty(&node->prio_list)); if (plist_head_empty(head)) goto ins_node; first = iter = plist_first(head); last = reverse_iter = list_entry(first->prio_list.prev, struct plist_node, prio_list); do { if (node->prio < iter->prio) { node_next = &iter->node_list; break; } else if (node->prio >= reverse_iter->prio) { prev = reverse_iter; iter = list_entry(reverse_iter->prio_list.next, struct plist_node, prio_list); if (likely(reverse_iter != last)) node_next = &iter->node_list; break; } prev = iter; iter = list_entry(iter->prio_list.next, struct plist_node,
prio_list); reverse_iter = list_entry(reverse_iter->prio_list.prev, struct plist_node, prio_list); } while (iter != first); if (!prev || prev->prio != node->prio) list_add_tail(&node->prio_list, &iter->prio_list); ins_node: list_add_tail(&node->node_list, node_next); plist_check_head(head); } /** * plist_del - Remove a @node from plist. * * @node: &struct plist_node pointer - entry to be removed * @head: &struct plist_head pointer - list head */ void plist_del(struct plist_node *node, struct plist_head *head) { plist_check_head(head); if (!list_empty(&node->prio_list)) { if (node->node_list.next != &head->node_list) { struct plist_node *next; next = list_entry(node->node_list.next, struct plist_node, node_list); /* add the next plist_node into prio_list */ if (list_empty(&next->prio_list)) list_add(&next->prio_list, &node->prio_list); } list_del_init(&node->prio_list); } list_del_init(&node->node_list); plist_check_head(head); } /** * plist_requeue - Requeue @node at end of same-prio entries. * * This is essentially an optimized plist_del() followed by * plist_add(). It moves an entry already in the plist to * after any other same-priority entries. * * @node: &struct plist_node pointer - entry to be moved * @head: &struct plist_head pointer - list head */ void plist_requeue(struct plist_node *node, struct plist_head *head) { struct plist_node *iter; struct list_head *node_next = &head->node_list; plist_check_head(head); BUG_ON(plist_head_empty(head)); BUG_ON(plist_node_empty(node)); if (node == plist_last(head)) return; iter = plist_next(node); if (node->prio != iter->prio) return; plist_del(node, head); plist_for_each_continue(iter, head) { if (node->prio != iter->prio) { node_next = &iter->node_list; break; } } list_add_tail(&node->node_list, node_next); plist_check_head(head); } #ifdef CONFIG_DEBUG_PLIST #include <linux/sched.h> #include <linux/sched/clock.h> #include <linux/module.h> #include <linux/init.h> static struct plist_node __initdata test_node[241]; static void __init plist_test_check(int nr_expect) { struct plist_node *first, *prio_pos, *node_pos; if (plist_head_empty(&test_head)) { BUG_ON(nr_expect != 0); return; } prio_pos = first = plist_first(&test_head); plist_for_each(node_pos, &test_head) { if (nr_expect-- < 0) break; if (node_pos == first) continue; if (node_pos->prio == prio_pos->prio) { BUG_ON(!list_empty(&node_pos->prio_list)); continue; } BUG_ON(prio_pos->prio > node_pos->prio); BUG_ON(prio_pos->prio_list.next != &node_pos->prio_list); prio_pos = node_pos; } BUG_ON(nr_expect != 0); BUG_ON(prio_pos->prio_list.next != &first->prio_list); } static void __init plist_test_requeue(struct plist_node *node) { plist_requeue(node, &test_head); if (node != plist_last(&test_head)) BUG_ON(node->prio == plist_next(node)->prio); } static int __init plist_test(void) { int nr_expect = 0, i, loop; unsigned int r = local_clock(); printk(KERN_DEBUG "start plist test\n"); plist_head_init(&test_head); for (i = 0; i < ARRAY_SIZE(test_node); i++) plist_node_init(test_node + i, 0); for (loop = 0; loop < 1000; loop++) { r = r * 193939 % 47629; i = r % ARRAY_SIZE(test_node); if (plist_node_empty(test_node + i)) { r = r * 193939 % 47629; test_node[i].prio = r % 99; plist_add(test_node + i, &test_head); nr_expect++; } else { plist_del(test_node + i, &test_head); nr_expect--; } plist_test_check(nr_expect); if (!plist_node_empty(test_node + i)) { plist_test_requeue(test_node + i); plist_test_check(nr_expect); } } for (i = 0; i < ARRAY_SIZE(test_node); i++) { if (plist_node_empty(test_node + i)) 
continue; plist_del(test_node + i, &test_head); nr_expect--; plist_test_check(nr_expect); } printk(KERN_DEBUG "end plist test\n"); /* Worst case test for plist_add() */ unsigned int test_data[241]; for (i = 0; i < ARRAY_SIZE(test_data); i++) test_data[i] = i; ktime_t start, end, time_elapsed = 0; plist_head_init(&test_head); for (i = 0; i < ARRAY_SIZE(test_node); i++) { plist_node_init(test_node + i, 0); test_node[i].prio = test_data[i]; } for (i = 0; i < ARRAY_SIZE(test_node); i++) { if (plist_node_empty(test_node + i)) { start = ktime_get(); plist_add(test_node + i, &test_head); end = ktime_get(); time_elapsed += (end - start); } } pr_debug("plist_add worst case test time elapsed %lld\n", time_elapsed); return 0; } module_init(plist_test); #endif
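The file above contains only the out-of-line half of the plist API; the inline helpers (plist_head_init(), plist_node_init(), plist_for_each(), plist_first() and friends) live in include/linux/plist.h. A minimal usage sketch follows; the demo_* names and the chosen priorities are made up for illustration and are not taken from the kernel.

#include <linux/plist.h>
#include <linux/printk.h>

static struct plist_head demo_head;
static struct plist_node demo_nodes[3];

static void demo_plist(void)
{
	struct plist_node *pos;
	int prios[3] = { 30, 10, 20 };
	int i;

	plist_head_init(&demo_head);

	/* Nodes are kept sorted by ascending ->prio (lowest value first). */
	for (i = 0; i < 3; i++) {
		plist_node_init(&demo_nodes[i], prios[i]);
		plist_add(&demo_nodes[i], &demo_head);
	}

	/* Walks the nodes in priority order: 10, 20, 30. */
	plist_for_each(pos, &demo_head)
		pr_info("prio %d\n", pos->prio);

	/* plist_first() is the node with the lowest ->prio, i.e. 10 here. */
	plist_del(plist_first(&demo_head), &demo_head);
}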
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BPF_CGROUP_H #define _BPF_CGROUP_H #include <linux/bpf.h> #include <linux/bpf-cgroup-defs.h> #include <linux/errno.h> #include <linux/jump_label.h> #include <linux/percpu.h> #include <linux/rbtree.h> #include <net/sock.h> #include <uapi/linux/bpf.h> struct sock; struct sockaddr; struct cgroup; struct sk_buff; struct bpf_map; struct bpf_prog; struct bpf_sock_ops_kern; struct bpf_cgroup_storage; struct ctl_table; struct ctl_table_header; struct task_struct; unsigned int __cgroup_bpf_run_lsm_sock(const void *ctx, const struct bpf_insn *insn); unsigned int __cgroup_bpf_run_lsm_socket(const void *ctx, const struct bpf_insn *insn); unsigned int __cgroup_bpf_run_lsm_current(const void *ctx, const struct bpf_insn *insn); #ifdef CONFIG_CGROUP_BPF #define CGROUP_ATYPE(type) \ case BPF_##type: return type static inline enum cgroup_bpf_attach_type to_cgroup_bpf_attach_type(enum bpf_attach_type attach_type) { switch (attach_type) { CGROUP_ATYPE(CGROUP_INET_INGRESS); CGROUP_ATYPE(CGROUP_INET_EGRESS); CGROUP_ATYPE(CGROUP_INET_SOCK_CREATE); CGROUP_ATYPE(CGROUP_SOCK_OPS); CGROUP_ATYPE(CGROUP_DEVICE); CGROUP_ATYPE(CGROUP_INET4_BIND); CGROUP_ATYPE(CGROUP_INET6_BIND); CGROUP_ATYPE(CGROUP_INET4_CONNECT); CGROUP_ATYPE(CGROUP_INET6_CONNECT); CGROUP_ATYPE(CGROUP_UNIX_CONNECT); CGROUP_ATYPE(CGROUP_INET4_POST_BIND); CGROUP_ATYPE(CGROUP_INET6_POST_BIND); CGROUP_ATYPE(CGROUP_UDP4_SENDMSG); CGROUP_ATYPE(CGROUP_UDP6_SENDMSG); CGROUP_ATYPE(CGROUP_UNIX_SENDMSG);
CGROUP_ATYPE(CGROUP_SYSCTL); CGROUP_ATYPE(CGROUP_UDP4_RECVMSG); CGROUP_ATYPE(CGROUP_UDP6_RECVMSG); CGROUP_ATYPE(CGROUP_UNIX_RECVMSG); CGROUP_ATYPE(CGROUP_GETSOCKOPT); CGROUP_ATYPE(CGROUP_SETSOCKOPT); CGROUP_ATYPE(CGROUP_INET4_GETPEERNAME); CGROUP_ATYPE(CGROUP_INET6_GETPEERNAME); CGROUP_ATYPE(CGROUP_UNIX_GETPEERNAME); CGROUP_ATYPE(CGROUP_INET4_GETSOCKNAME); CGROUP_ATYPE(CGROUP_INET6_GETSOCKNAME); CGROUP_ATYPE(CGROUP_UNIX_GETSOCKNAME); CGROUP_ATYPE(CGROUP_INET_SOCK_RELEASE); default: return CGROUP_BPF_ATTACH_TYPE_INVALID; } } #undef CGROUP_ATYPE extern struct static_key_false cgroup_bpf_enabled_key[MAX_CGROUP_BPF_ATTACH_TYPE]; #define cgroup_bpf_enabled(atype) static_branch_unlikely(&cgroup_bpf_enabled_key[atype]) #define for_each_cgroup_storage_type(stype) \ for (stype = 0; stype < MAX_BPF_CGROUP_STORAGE_TYPE; stype++) struct bpf_cgroup_storage_map; struct bpf_storage_buffer { struct rcu_head rcu; char data[]; }; struct bpf_cgroup_storage { union { struct bpf_storage_buffer *buf; void __percpu *percpu_buf; }; struct bpf_cgroup_storage_map *map; struct bpf_cgroup_storage_key key; struct list_head list_map; struct list_head list_cg; struct rb_node node; struct rcu_head rcu; }; struct bpf_cgroup_link { struct bpf_link link; struct cgroup *cgroup; enum bpf_attach_type type; }; struct bpf_prog_list { struct hlist_node node; struct bpf_prog *prog; struct bpf_cgroup_link *link; struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE]; }; int cgroup_bpf_inherit(struct cgroup *cgrp); void cgroup_bpf_offline(struct cgroup *cgrp); int __cgroup_bpf_run_filter_skb(struct sock *sk, struct sk_buff *skb, enum cgroup_bpf_attach_type atype); int __cgroup_bpf_run_filter_sk(struct sock *sk, enum cgroup_bpf_attach_type atype); int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, struct sockaddr *uaddr, int *uaddrlen, enum cgroup_bpf_attach_type atype, void *t_ctx, u32 *flags); int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, struct bpf_sock_ops_kern *sock_ops, enum cgroup_bpf_attach_type atype); int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, short access, enum cgroup_bpf_attach_type atype); int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, const struct ctl_table *table, int write, char **buf, size_t *pcount, loff_t *ppos, enum cgroup_bpf_attach_type atype); int __cgroup_bpf_run_filter_setsockopt(struct sock *sock, int *level, int *optname, sockptr_t optval, int *optlen, char **kernel_optval); int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, int optname, sockptr_t optval, sockptr_t optlen, int max_optlen, int retval); int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level, int optname, void *optval, int *optlen, int retval); static inline enum bpf_cgroup_storage_type cgroup_storage_type( struct bpf_map *map) { if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) return BPF_CGROUP_STORAGE_PERCPU; return BPF_CGROUP_STORAGE_SHARED; } struct bpf_cgroup_storage * cgroup_storage_lookup(struct bpf_cgroup_storage_map *map, void *key, bool locked); struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog, enum bpf_cgroup_storage_type stype); void bpf_cgroup_storage_free(struct bpf_cgroup_storage *storage); void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage, struct cgroup *cgroup, enum bpf_attach_type type); void bpf_cgroup_storage_unlink(struct bpf_cgroup_storage *storage); int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, struct bpf_map *map); int bpf_percpu_cgroup_storage_copy(struct 
bpf_map *map, void *key, void *value); int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, void *value, u64 flags); /* Opportunistic check to see whether we have any BPF program attached*/ static inline bool cgroup_bpf_sock_enabled(struct sock *sk, enum cgroup_bpf_attach_type type) { struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); struct bpf_prog_array *array; array = rcu_access_pointer(cgrp->bpf.effective[type]); return array != &bpf_empty_prog_array.hdr; } /* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */ #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb) \ ({ \ int __ret = 0; \ if (cgroup_bpf_enabled(CGROUP_INET_INGRESS) && \ cgroup_bpf_sock_enabled(sk, CGROUP_INET_INGRESS) && sk && \ sk_fullsock(sk)) \ __ret = __cgroup_bpf_run_filter_skb(sk, skb, \ CGROUP_INET_INGRESS); \ \ __ret; \ }) #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb) \ ({ \ int __ret = 0; \ if (cgroup_bpf_enabled(CGROUP_INET_EGRESS) && sk) { \ typeof(sk) __sk = sk_to_full_sk(sk); \ if (__sk && __sk == skb_to_full_sk(skb) && \ cgroup_bpf_sock_enabled(__sk, CGROUP_INET_EGRESS)) \ __ret = __cgroup_bpf_run_filter_skb(__sk, skb, \ CGROUP_INET_EGRESS); \ } \ __ret; \ }) #define BPF_CGROUP_RUN_SK_PROG(sk, atype) \ ({ \ int __ret = 0; \ if (cgroup_bpf_enabled(atype)) { \ __ret = __cgroup_bpf_run_filter_sk(sk, atype); \ } \ __ret; \ }) #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) \ BPF_CGROUP_RUN_SK_PROG(sk, CGROUP_INET_SOCK_CREATE) #define BPF_CGROUP_RUN_PROG_INET_SOCK_RELEASE(sk) \ BPF_CGROUP_RUN_SK_PROG(sk, CGROUP_INET_SOCK_RELEASE) #define BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk) \ BPF_CGROUP_RUN_SK_PROG(sk, CGROUP_INET4_POST_BIND) #define BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk) \ BPF_CGROUP_RUN_SK_PROG(sk, CGROUP_INET6_POST_BIND) #define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, uaddrlen, atype) \ ({ \ int __ret = 0; \ if (cgroup_bpf_enabled(atype)) \ __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, uaddrlen, \ atype, NULL, NULL); \ __ret; \ }) #define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, atype, t_ctx) \ ({ \ int __ret = 0; \ if (cgroup_bpf_enabled(atype)) { \ lock_sock(sk); \ __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, uaddrlen, \ atype, t_ctx, NULL); \ release_sock(sk); \ } \ __ret; \ }) /* BPF_CGROUP_INET4_BIND and BPF_CGROUP_INET6_BIND can return extra flags * via upper bits of return code. The only flag that is supported * (at bit position 0) is to indicate CAP_NET_BIND_SERVICE capability check * should be bypassed (BPF_RET_BIND_NO_CAP_NET_BIND_SERVICE). 
*/ #define BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, uaddrlen, atype, bind_flags) \ ({ \ u32 __flags = 0; \ int __ret = 0; \ if (cgroup_bpf_enabled(atype)) { \ lock_sock(sk); \ __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, uaddrlen, \ atype, NULL, &__flags); \ release_sock(sk); \ if (__flags & BPF_RET_BIND_NO_CAP_NET_BIND_SERVICE) \ *bind_flags |= BIND_NO_CAP_NET_BIND_SERVICE; \ } \ __ret; \ }) #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) \ ((cgroup_bpf_enabled(CGROUP_INET4_CONNECT) || \ cgroup_bpf_enabled(CGROUP_INET6_CONNECT)) && \ (sk)->sk_prot->pre_connect) #define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, uaddrlen) \ BPF_CGROUP_RUN_SA_PROG(sk, uaddr, uaddrlen, CGROUP_INET4_CONNECT) #define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr, uaddrlen) \ BPF_CGROUP_RUN_SA_PROG(sk, uaddr, uaddrlen, CGROUP_INET6_CONNECT) #define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr, uaddrlen) \ BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_INET4_CONNECT, NULL) #define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr, uaddrlen) \ BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_INET6_CONNECT, NULL) #define BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, uaddrlen) \ BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_UNIX_CONNECT, NULL) #define BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr, uaddrlen, t_ctx) \ BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_UDP4_SENDMSG, t_ctx) #define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, uaddrlen, t_ctx) \ BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_UDP6_SENDMSG, t_ctx) #define BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk, uaddr, uaddrlen, t_ctx) \ BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_UNIX_SENDMSG, t_ctx) #define BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, uaddr, uaddrlen) \ BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_UDP4_RECVMSG, NULL) #define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr, uaddrlen) \ BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_UDP6_RECVMSG, NULL) #define BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk, uaddr, uaddrlen) \ BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_UNIX_RECVMSG, NULL) /* The SOCK_OPS"_SK" macro should be used when sock_ops->sk is not a * fullsock and its parent fullsock cannot be traced by * sk_to_full_sk(). * * e.g. sock_ops->sk is a request_sock and it is under syncookie mode. * Its listener-sk is not attached to the rsk_listener. * In this case, the caller holds the listener-sk (unlocked), * set its sock_ops->sk to req_sk, and call this SOCK_OPS"_SK" with * the listener-sk such that the cgroup-bpf-progs of the * listener-sk will be run. * * Regardless of syncookie mode or not, * calling bpf_setsockopt on listener-sk will not make sense anyway, * so passing 'sock_ops->sk == req_sk' to the bpf prog is appropriate here. 
*/ #define BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(sock_ops, sk) \ ({ \ int __ret = 0; \ if (cgroup_bpf_enabled(CGROUP_SOCK_OPS)) \ __ret = __cgroup_bpf_run_filter_sock_ops(sk, \ sock_ops, \ CGROUP_SOCK_OPS); \ __ret; \ }) #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) \ ({ \ int __ret = 0; \ if (cgroup_bpf_enabled(CGROUP_SOCK_OPS) && (sock_ops)->sk) { \ typeof(sk) __sk = sk_to_full_sk((sock_ops)->sk); \ if (__sk && sk_fullsock(__sk)) \ __ret = __cgroup_bpf_run_filter_sock_ops(__sk, \ sock_ops, \ CGROUP_SOCK_OPS); \ } \ __ret; \ }) #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(atype, major, minor, access) \ ({ \ int __ret = 0; \ if (cgroup_bpf_enabled(CGROUP_DEVICE)) \ __ret = __cgroup_bpf_check_dev_permission(atype, major, minor, \ access, \ CGROUP_DEVICE); \ \ __ret; \ }) #define BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, buf, count, pos) \ ({ \ int __ret = 0; \ if (cgroup_bpf_enabled(CGROUP_SYSCTL)) \ __ret = __cgroup_bpf_run_filter_sysctl(head, table, write, \ buf, count, pos, \ CGROUP_SYSCTL); \ __ret; \ }) #define BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock, level, optname, optval, optlen, \ kernel_optval) \ ({ \ int __ret = 0; \ if (cgroup_bpf_enabled(CGROUP_SETSOCKOPT) && \ cgroup_bpf_sock_enabled(sock, CGROUP_SETSOCKOPT)) \ __ret = __cgroup_bpf_run_filter_setsockopt(sock, level, \ optname, optval, \ optlen, \ kernel_optval); \ __ret; \ }) #define BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock, level, optname, optval, optlen, \ max_optlen, retval) \ ({ \ int __ret = retval; \ if (cgroup_bpf_enabled(CGROUP_GETSOCKOPT) && \ cgroup_bpf_sock_enabled(sock, CGROUP_GETSOCKOPT)) \ if (!(sock)->sk_prot->bpf_bypass_getsockopt || \ !INDIRECT_CALL_INET_1((sock)->sk_prot->bpf_bypass_getsockopt, \ tcp_bpf_bypass_getsockopt, \ level, optname)) \ __ret = __cgroup_bpf_run_filter_getsockopt( \ sock, level, optname, optval, optlen, \ max_optlen, retval); \ __ret; \ }) #define BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sock, level, optname, optval, \ optlen, retval) \ ({ \ int __ret = retval; \ if (cgroup_bpf_enabled(CGROUP_GETSOCKOPT)) \ __ret = __cgroup_bpf_run_filter_getsockopt_kern( \ sock, level, optname, optval, optlen, retval); \ __ret; \ }) int cgroup_bpf_prog_attach(const union bpf_attr *attr, enum bpf_prog_type ptype, struct bpf_prog *prog); int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype); int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); int cgroup_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr); const struct bpf_func_proto * cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog); const struct bpf_func_proto * cgroup_current_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog); #else static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; } static inline void cgroup_bpf_offline(struct cgroup *cgrp) {} static inline int cgroup_bpf_prog_attach(const union bpf_attr *attr, enum bpf_prog_type ptype, struct bpf_prog *prog) { return -EINVAL; } static inline int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype) { return -EINVAL; } static inline int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { return -EINVAL; } static inline int cgroup_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) { return -EINVAL; } static inline const struct bpf_func_proto * cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { return NULL; } static inline const struct bpf_func_proto * cgroup_current_func_proto(enum 
bpf_func_id func_id, const struct bpf_prog *prog) { return NULL; } static inline int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, struct bpf_map *map) { return 0; } static inline struct bpf_cgroup_storage *bpf_cgroup_storage_alloc( struct bpf_prog *prog, enum bpf_cgroup_storage_type stype) { return NULL; } static inline void bpf_cgroup_storage_free( struct bpf_cgroup_storage *storage) {} static inline int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key, void *value) { return 0; } static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, void *value, u64 flags) { return 0; } #define cgroup_bpf_enabled(atype) (0) #define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, atype, t_ctx) ({ 0; }) #define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, uaddrlen, atype) ({ 0; }) #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0) #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_SOCK_RELEASE(sk) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, uaddrlen, atype, flags) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, uaddrlen) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr, uaddrlen) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr, uaddrlen) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr, uaddrlen) ({ 0; }) #define BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk, uaddr, uaddrlen) ({ 0; }) #define BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr, uaddrlen, t_ctx) ({ 0; }) #define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, uaddrlen, t_ctx) ({ 0; }) #define BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk, uaddr, uaddrlen, t_ctx) ({ 0; }) #define BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, uaddr, uaddrlen) ({ 0; }) #define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr, uaddrlen) ({ 0; }) #define BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk, uaddr, uaddrlen) ({ 0; }) #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; }) #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(atype, major, minor, access) ({ 0; }) #define BPF_CGROUP_RUN_PROG_SYSCTL(head,table,write,buf,count,pos) ({ 0; }) #define BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock, level, optname, optval, \ optlen, max_optlen, retval) ({ retval; }) #define BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sock, level, optname, optval, \ optlen, retval) ({ retval; }) #define BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock, level, optname, optval, optlen, \ kernel_optval) ({ 0; }) #define for_each_cgroup_storage_type(stype) for (; false; ) #endif /* CONFIG_CGROUP_BPF */ #endif /* _BPF_CGROUP_H */
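A brief, illustrative sketch of how the BPF_CGROUP_RUN_PROG_* wrappers above are meant to be used at a call site. The function demo_sock_create_hook() is a made-up name for this sketch; the pattern itself (no #ifdef CONFIG_CGROUP_BPF at the caller, because the stubs above compile down to ({ 0; })) is how real call sites such as inet_create() use these macros.

#include <linux/bpf-cgroup.h>
#include <net/sock.h>

static int demo_sock_create_hook(struct sock *sk)
{
	int err;

	/*
	 * Runs any attached CGROUP_INET_SOCK_CREATE programs; evaluates to 0
	 * when CONFIG_CGROUP_BPF is disabled or nothing is attached.
	 */
	err = BPF_CGROUP_RUN_PROG_INET_SOCK(sk);
	if (err)
		return err;	/* a cgroup BPF program rejected the socket */

	return 0;
}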