Total coverage: 237959 (12%)of 2040364
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __X86_KERNEL_FPU_LEGACY_H #define __X86_KERNEL_FPU_LEGACY_H #include <asm/fpu/types.h> extern unsigned int mxcsr_feature_mask; static inline void ldmxcsr(u32 mxcsr) { asm volatile("ldmxcsr %0" :: "m" (mxcsr)); } /* * Returns 0 on success or the trap number when the operation raises an * exception. */ #define user_insn(insn, output, input...) \ ({ \ int err; \ \ might_fault(); \ \ asm volatile(ASM_STAC "\n" \ "1: " #insn "\n" \ "2: " ASM_CLAC "\n" \ _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FAULT_MCE_SAFE) \ : [err] "=a" (err), output \ : "0"(0), input); \ err; \ }) #define kernel_insn_err(insn, output, input...) \ ({ \ int err; \ asm volatile("1:" #insn "\n\t" \ "2:\n" \ _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, %[err]) \ : [err] "=r" (err), output \ : "0"(0), input); \ err; \ }) #define kernel_insn(insn, output, input...) \ asm volatile("1:" #insn "\n\t" \ "2:\n" \ _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FPU_RESTORE) \ : output : input) static inline int fnsave_to_user_sigframe(struct fregs_state __user *fx) { return user_insn(fnsave %[fx]; fwait, [fx] "=m" (*fx), "m" (*fx)); } static inline int fxsave_to_user_sigframe(struct fxregs_state __user *fx) { if (IS_ENABLED(CONFIG_X86_32)) return user_insn(fxsave %[fx], [fx] "=m" (*fx), "m" (*fx)); else return user_insn(fxsaveq %[fx], [fx] "=m" (*fx), "m" (*fx)); } static inline void fxrstor(struct fxregs_state *fx) { if (IS_ENABLED(CONFIG_X86_32)) kernel_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); else kernel_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); } static inline int fxrstor_safe(struct fxregs_state *fx) { if (IS_ENABLED(CONFIG_X86_32)) return kernel_insn_err(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); else return kernel_insn_err(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); } static inline int fxrstor_from_user_sigframe(struct fxregs_state __user *fx) { if (IS_ENABLED(CONFIG_X86_32)) return user_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); else return user_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); } static inline void frstor(struct fregs_state *fx) { kernel_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); } static inline int frstor_safe(struct fregs_state *fx) { return kernel_insn_err(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); } static inline int frstor_from_user_sigframe(struct fregs_state __user *fx) { return user_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); } static inline void fxsave(struct fxregs_state *fx) { if (IS_ENABLED(CONFIG_X86_32)) asm volatile( "fxsave %[fx]" : [fx] "=m" (*fx)); else asm volatile("fxsaveq %[fx]" : [fx] "=m" (*fx)); } #endif
11 11 8 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 // SPDX-License-Identifier: GPL-2.0-only /* loopback transport for vsock using virtio_transport_common APIs * * Copyright (C) 2013-2019 Red Hat, Inc. * Authors: Asias He <asias@redhat.com> * Stefan Hajnoczi <stefanha@redhat.com> * Stefano Garzarella <sgarzare@redhat.com> * */ #include <linux/spinlock.h> #include <linux/module.h> #include <linux/list.h> #include <linux/virtio_vsock.h> struct vsock_loopback { struct workqueue_struct *workqueue; struct sk_buff_head pkt_queue; struct work_struct pkt_work; }; static struct vsock_loopback the_vsock_loopback; static u32 vsock_loopback_get_local_cid(void) { return VMADDR_CID_LOCAL; } static int vsock_loopback_send_pkt(struct sk_buff *skb) { struct vsock_loopback *vsock = &the_vsock_loopback; int len = skb->len; virtio_vsock_skb_queue_tail(&vsock->pkt_queue, skb); queue_work(vsock->workqueue, &vsock->pkt_work); return len; } static int vsock_loopback_cancel_pkt(struct vsock_sock *vsk) { struct vsock_loopback *vsock = &the_vsock_loopback; virtio_transport_purge_skbs(vsk, &vsock->pkt_queue); return 0; } static bool vsock_loopback_seqpacket_allow(u32 remote_cid); static bool vsock_loopback_msgzerocopy_allow(void) { return true; } static struct virtio_transport loopback_transport = { .transport = { .module = THIS_MODULE, .get_local_cid = vsock_loopback_get_local_cid, .init = virtio_transport_do_socket_init, .destruct = virtio_transport_destruct, .release = virtio_transport_release, .connect = virtio_transport_connect, .shutdown = virtio_transport_shutdown, .cancel_pkt = vsock_loopback_cancel_pkt, .dgram_bind = virtio_transport_dgram_bind, .dgram_dequeue = virtio_transport_dgram_dequeue, .dgram_enqueue = virtio_transport_dgram_enqueue, .dgram_allow = virtio_transport_dgram_allow, .stream_dequeue = virtio_transport_stream_dequeue, .stream_enqueue = virtio_transport_stream_enqueue, .stream_has_data = virtio_transport_stream_has_data, .stream_has_space = virtio_transport_stream_has_space, .stream_rcvhiwat = virtio_transport_stream_rcvhiwat, .stream_is_active = virtio_transport_stream_is_active, .stream_allow = virtio_transport_stream_allow, .seqpacket_dequeue = virtio_transport_seqpacket_dequeue, .seqpacket_enqueue = virtio_transport_seqpacket_enqueue, .seqpacket_allow = vsock_loopback_seqpacket_allow, .seqpacket_has_data = virtio_transport_seqpacket_has_data, .msgzerocopy_allow = vsock_loopback_msgzerocopy_allow, .notify_poll_in = virtio_transport_notify_poll_in, .notify_poll_out = virtio_transport_notify_poll_out, .notify_recv_init = virtio_transport_notify_recv_init, .notify_recv_pre_block = virtio_transport_notify_recv_pre_block, .notify_recv_pre_dequeue = virtio_transport_notify_recv_pre_dequeue, .notify_recv_post_dequeue = virtio_transport_notify_recv_post_dequeue, .notify_send_init = virtio_transport_notify_send_init, .notify_send_pre_block = virtio_transport_notify_send_pre_block, .notify_send_pre_enqueue = virtio_transport_notify_send_pre_enqueue, .notify_send_post_enqueue = virtio_transport_notify_send_post_enqueue, .notify_buffer_size = virtio_transport_notify_buffer_size, .notify_set_rcvlowat = virtio_transport_notify_set_rcvlowat, .unsent_bytes = virtio_transport_unsent_bytes, .read_skb = virtio_transport_read_skb, }, .send_pkt = vsock_loopback_send_pkt, }; static bool vsock_loopback_seqpacket_allow(u32 remote_cid) { return true; } static void vsock_loopback_work(struct work_struct *work) { struct vsock_loopback *vsock = container_of(work, struct vsock_loopback, pkt_work); struct sk_buff_head pkts; struct sk_buff *skb; skb_queue_head_init(&pkts); spin_lock_bh(&vsock->pkt_queue.lock); skb_queue_splice_init(&vsock->pkt_queue, &pkts); spin_unlock_bh(&vsock->pkt_queue.lock); while ((skb = __skb_dequeue(&pkts))) { /* Decrement the bytes_unsent counter without deallocating skb * It is freed by the receiver. */ virtio_transport_consume_skb_sent(skb, false); virtio_transport_deliver_tap_pkt(skb); virtio_transport_recv_pkt(&loopback_transport, skb); } } static int __init vsock_loopback_init(void) { struct vsock_loopback *vsock = &the_vsock_loopback; int ret; vsock->workqueue = alloc_workqueue("vsock-loopback", WQ_PERCPU, 0); if (!vsock->workqueue) return -ENOMEM; skb_queue_head_init(&vsock->pkt_queue); INIT_WORK(&vsock->pkt_work, vsock_loopback_work); ret = vsock_core_register(&loopback_transport.transport, VSOCK_TRANSPORT_F_LOCAL); if (ret) goto out_wq; return 0; out_wq: destroy_workqueue(vsock->workqueue); return ret; } static void __exit vsock_loopback_exit(void) { struct vsock_loopback *vsock = &the_vsock_loopback; vsock_core_unregister(&loopback_transport.transport); flush_work(&vsock->pkt_work); virtio_vsock_skb_queue_purge(&vsock->pkt_queue); destroy_workqueue(vsock->workqueue); } module_init(vsock_loopback_init); module_exit(vsock_loopback_exit); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Stefano Garzarella <sgarzare@redhat.com>"); MODULE_DESCRIPTION("loopback transport for vsock"); MODULE_ALIAS_NETPROTO(PF_VSOCK);
1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 // SPDX-License-Identifier: GPL-2.0-only /* * crash.c - kernel crash support code. * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com> */ #include <linux/buildid.h> #include <linux/init.h> #include <linux/utsname.h> #include <linux/vmalloc.h> #include <linux/sizes.h> #include <linux/kexec.h> #include <linux/memory.h> #include <linux/cpuhotplug.h> #include <linux/memblock.h> #include <linux/kmemleak.h> #include <asm/page.h> #include <asm/sections.h> #include <crypto/sha1.h> #include "kallsyms_internal.h" #include "kexec_internal.h" /* vmcoreinfo stuff */ unsigned char *vmcoreinfo_data; size_t vmcoreinfo_size; u32 *vmcoreinfo_note; /* trusted vmcoreinfo, e.g. we can make a copy in the crash memory */ static unsigned char *vmcoreinfo_data_safecopy; struct hwerr_info { atomic_t count; time64_t timestamp; }; static struct hwerr_info hwerr_data[HWERR_RECOV_MAX]; Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, void *data, size_t data_len) { struct elf_note *note = (struct elf_note *)buf; note->n_namesz = strlen(name) + 1; note->n_descsz = data_len; note->n_type = type; buf += DIV_ROUND_UP(sizeof(*note), sizeof(Elf_Word)); memcpy(buf, name, note->n_namesz); buf += DIV_ROUND_UP(note->n_namesz, sizeof(Elf_Word)); memcpy(buf, data, data_len); buf += DIV_ROUND_UP(data_len, sizeof(Elf_Word)); return buf; } void final_note(Elf_Word *buf) { memset(buf, 0, sizeof(struct elf_note)); } static void update_vmcoreinfo_note(void) { u32 *buf = vmcoreinfo_note; if (!vmcoreinfo_size) return; buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data, vmcoreinfo_size); final_note(buf); } void crash_update_vmcoreinfo_safecopy(void *ptr) { if (ptr) memcpy(ptr, vmcoreinfo_data, vmcoreinfo_size); vmcoreinfo_data_safecopy = ptr; } void crash_save_vmcoreinfo(void) { if (!vmcoreinfo_note) return; /* Use the safe copy to generate vmcoreinfo note if have */ if (vmcoreinfo_data_safecopy) vmcoreinfo_data = vmcoreinfo_data_safecopy; vmcoreinfo_append_str("CRASHTIME=%lld\n", ktime_get_real_seconds()); update_vmcoreinfo_note(); } void vmcoreinfo_append_str(const char *fmt, ...) { va_list args; char buf[0x50]; size_t r; va_start(args, fmt); r = vscnprintf(buf, sizeof(buf), fmt, args); va_end(args); r = min(r, (size_t)VMCOREINFO_BYTES - vmcoreinfo_size); memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r); vmcoreinfo_size += r; WARN_ONCE(vmcoreinfo_size == VMCOREINFO_BYTES, "vmcoreinfo data exceeds allocated size, truncating"); } /* * provide an empty default implementation here -- architecture * code may override this */ void __weak arch_crash_save_vmcoreinfo(void) {} phys_addr_t __weak paddr_vmcoreinfo_note(void) { return __pa(vmcoreinfo_note); } EXPORT_SYMBOL(paddr_vmcoreinfo_note); void hwerr_log_error_type(enum hwerr_error_type src) { if (src < 0 || src >= HWERR_RECOV_MAX) return; atomic_inc(&hwerr_data[src].count); WRITE_ONCE(hwerr_data[src].timestamp, ktime_get_real_seconds()); } EXPORT_SYMBOL_GPL(hwerr_log_error_type); static int __init crash_save_vmcoreinfo_init(void) { vmcoreinfo_data = (unsigned char *)get_zeroed_page(GFP_KERNEL); if (!vmcoreinfo_data) { pr_warn("Memory allocation for vmcoreinfo_data failed\n"); return -ENOMEM; } vmcoreinfo_note = alloc_pages_exact(VMCOREINFO_NOTE_SIZE, GFP_KERNEL | __GFP_ZERO); if (!vmcoreinfo_note) { free_page((unsigned long)vmcoreinfo_data); vmcoreinfo_data = NULL; pr_warn("Memory allocation for vmcoreinfo_note failed\n"); return -ENOMEM; } VMCOREINFO_OSRELEASE(init_uts_ns.name.release); VMCOREINFO_BUILD_ID(); VMCOREINFO_PAGESIZE(PAGE_SIZE); VMCOREINFO_SYMBOL(init_uts_ns); VMCOREINFO_OFFSET(uts_namespace, name); VMCOREINFO_SYMBOL(node_online_map); #ifdef CONFIG_MMU VMCOREINFO_SYMBOL_ARRAY(swapper_pg_dir); #endif VMCOREINFO_SYMBOL(_stext); vmcoreinfo_append_str("NUMBER(VMALLOC_START)=0x%lx\n", (unsigned long) VMALLOC_START); #ifndef CONFIG_NUMA VMCOREINFO_SYMBOL(mem_map); VMCOREINFO_SYMBOL(contig_page_data); #endif #ifdef CONFIG_SPARSEMEM_VMEMMAP VMCOREINFO_SYMBOL_ARRAY(vmemmap); #endif #ifdef CONFIG_SPARSEMEM VMCOREINFO_SYMBOL_ARRAY(mem_section); VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS); VMCOREINFO_STRUCT_SIZE(mem_section); VMCOREINFO_OFFSET(mem_section, section_mem_map); VMCOREINFO_NUMBER(SECTION_SIZE_BITS); VMCOREINFO_NUMBER(MAX_PHYSMEM_BITS); #endif VMCOREINFO_STRUCT_SIZE(page); VMCOREINFO_STRUCT_SIZE(pglist_data); VMCOREINFO_STRUCT_SIZE(zone); VMCOREINFO_STRUCT_SIZE(free_area); VMCOREINFO_STRUCT_SIZE(list_head); VMCOREINFO_SIZE(nodemask_t); VMCOREINFO_OFFSET(page, flags); VMCOREINFO_OFFSET(page, _refcount); VMCOREINFO_OFFSET(page, mapping); VMCOREINFO_OFFSET(page, lru); VMCOREINFO_OFFSET(page, _mapcount); VMCOREINFO_OFFSET(page, private); VMCOREINFO_OFFSET(page, compound_head); VMCOREINFO_OFFSET(pglist_data, node_zones); VMCOREINFO_OFFSET(pglist_data, nr_zones); #ifdef CONFIG_FLATMEM VMCOREINFO_OFFSET(pglist_data, node_mem_map); #endif VMCOREINFO_OFFSET(pglist_data, node_start_pfn); VMCOREINFO_OFFSET(pglist_data, node_spanned_pages); VMCOREINFO_OFFSET(pglist_data, node_id); VMCOREINFO_OFFSET(zone, free_area); VMCOREINFO_OFFSET(zone, vm_stat); VMCOREINFO_OFFSET(zone, spanned_pages); VMCOREINFO_OFFSET(free_area, free_list); VMCOREINFO_OFFSET(list_head, next); VMCOREINFO_OFFSET(list_head, prev); VMCOREINFO_LENGTH(zone.free_area, NR_PAGE_ORDERS); log_buf_vmcoreinfo_setup(); VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); VMCOREINFO_NUMBER(NR_FREE_PAGES); VMCOREINFO_NUMBER(PG_lru); VMCOREINFO_NUMBER(PG_private); VMCOREINFO_NUMBER(PG_swapcache); VMCOREINFO_NUMBER(PG_swapbacked); #define PAGE_SLAB_MAPCOUNT_VALUE (PGTY_slab << 24) VMCOREINFO_NUMBER(PAGE_SLAB_MAPCOUNT_VALUE); #ifdef CONFIG_MEMORY_FAILURE VMCOREINFO_NUMBER(PG_hwpoison); #endif VMCOREINFO_NUMBER(PG_head_mask); #define PAGE_BUDDY_MAPCOUNT_VALUE (PGTY_buddy << 24) VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); #define PAGE_HUGETLB_MAPCOUNT_VALUE (PGTY_hugetlb << 24) VMCOREINFO_NUMBER(PAGE_HUGETLB_MAPCOUNT_VALUE); #define PAGE_OFFLINE_MAPCOUNT_VALUE (PGTY_offline << 24) VMCOREINFO_NUMBER(PAGE_OFFLINE_MAPCOUNT_VALUE); #ifdef CONFIG_UNACCEPTED_MEMORY #define PAGE_UNACCEPTED_MAPCOUNT_VALUE (PGTY_unaccepted << 24) VMCOREINFO_NUMBER(PAGE_UNACCEPTED_MAPCOUNT_VALUE); #endif #ifdef CONFIG_KALLSYMS VMCOREINFO_SYMBOL(kallsyms_names); VMCOREINFO_SYMBOL(kallsyms_num_syms); VMCOREINFO_SYMBOL(kallsyms_token_table); VMCOREINFO_SYMBOL(kallsyms_token_index); VMCOREINFO_SYMBOL(kallsyms_offsets); VMCOREINFO_SYMBOL(kallsyms_relative_base); #endif /* CONFIG_KALLSYMS */ arch_crash_save_vmcoreinfo(); update_vmcoreinfo_note(); return 0; } subsys_initcall(crash_save_vmcoreinfo_init);
121 121 121 121 121 121 121 121 121 120 120 1 121 1 120 121 121 121 10 10 10 10 10 121 10 121 121 121 121 121 10 121 120 121 121 10 121 163 22 22 22 22 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 // SPDX-License-Identifier: GPL-2.0 /* * Functions to sequence PREFLUSH and FUA writes. * * Copyright (C) 2011 Max Planck Institute for Gravitational Physics * Copyright (C) 2011 Tejun Heo <tj@kernel.org> * * REQ_{PREFLUSH|FUA} requests are decomposed to sequences consisted of three * optional steps - PREFLUSH, DATA and POSTFLUSH - according to the request * properties and hardware capability. * * If a request doesn't have data, only REQ_PREFLUSH makes sense, which * indicates a simple flush request. If there is data, REQ_PREFLUSH indicates * that the device cache should be flushed before the data is executed, and * REQ_FUA means that the data must be on non-volatile media on request * completion. * * If the device doesn't have writeback cache, PREFLUSH and FUA don't make any * difference. The requests are either completed immediately if there's no data * or executed as normal requests otherwise. * * If the device has writeback cache and supports FUA, REQ_PREFLUSH is * translated to PREFLUSH but REQ_FUA is passed down directly with DATA. * * If the device has writeback cache and doesn't support FUA, REQ_PREFLUSH * is translated to PREFLUSH and REQ_FUA to POSTFLUSH. * * The actual execution of flush is double buffered. Whenever a request * needs to execute PRE or POSTFLUSH, it queues at * fq->flush_queue[fq->flush_pending_idx]. Once certain criteria are met, a * REQ_OP_FLUSH is issued and the pending_idx is toggled. When the flush * completes, all the requests which were pending are proceeded to the next * step. This allows arbitrary merging of different types of PREFLUSH/FUA * requests. * * Currently, the following conditions are used to determine when to issue * flush. * * C1. At any given time, only one flush shall be in progress. This makes * double buffering sufficient. * * C2. Flush is deferred if any request is executing DATA of its sequence. * This avoids issuing separate POSTFLUSHes for requests which shared * PREFLUSH. * * C3. The second condition is ignored if there is a request which has * waited longer than FLUSH_PENDING_TIMEOUT. This is to avoid * starvation in the unlikely case where there are continuous stream of * FUA (without PREFLUSH) requests. * * For devices which support FUA, it isn't clear whether C2 (and thus C3) * is beneficial. * * Note that a sequenced PREFLUSH/FUA request with DATA is completed twice. * Once while executing DATA and again after the whole sequence is * complete. The first completion updates the contained bio but doesn't * finish it so that the bio submitter is notified only after the whole * sequence is complete. This is implemented by testing RQF_FLUSH_SEQ in * req_bio_endio(). * * The above peculiarity requires that each PREFLUSH/FUA request has only one * bio attached to it, which is guaranteed as they aren't allowed to be * merged in the usual way. */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/gfp.h> #include <linux/part_stat.h> #include "blk.h" #include "blk-mq.h" #include "blk-mq-sched.h" /* PREFLUSH/FUA sequences */ enum { REQ_FSEQ_PREFLUSH = (1 << 0), /* pre-flushing in progress */ REQ_FSEQ_DATA = (1 << 1), /* data write in progress */ REQ_FSEQ_POSTFLUSH = (1 << 2), /* post-flushing in progress */ REQ_FSEQ_DONE = (1 << 3), REQ_FSEQ_ACTIONS = REQ_FSEQ_PREFLUSH | REQ_FSEQ_DATA | REQ_FSEQ_POSTFLUSH, /* * If flush has been pending longer than the following timeout, * it's issued even if flush_data requests are still in flight. */ FLUSH_PENDING_TIMEOUT = 5 * HZ, }; static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, blk_opf_t flags); static inline struct blk_flush_queue * blk_get_flush_queue(struct blk_mq_ctx *ctx) { return blk_mq_map_queue(REQ_OP_FLUSH, ctx)->fq; } static unsigned int blk_flush_cur_seq(struct request *rq) { return 1 << ffz(rq->flush.seq); } static void blk_flush_restore_request(struct request *rq) { /* * After flush data completion, @rq->bio is %NULL but we need to * complete the bio again. @rq->biotail is guaranteed to equal the * original @rq->bio. Restore it. */ rq->bio = rq->biotail; if (rq->bio) rq->__sector = rq->bio->bi_iter.bi_sector; /* make @rq a normal request */ rq->rq_flags &= ~RQF_FLUSH_SEQ; rq->end_io = rq->flush.saved_end_io; } static void blk_account_io_flush(struct request *rq) { struct block_device *part = rq->q->disk->part0; part_stat_lock(); part_stat_inc(part, ios[STAT_FLUSH]); part_stat_add(part, nsecs[STAT_FLUSH], blk_time_get_ns() - rq->start_time_ns); part_stat_unlock(); } /** * blk_flush_complete_seq - complete flush sequence * @rq: PREFLUSH/FUA request being sequenced * @fq: flush queue * @seq: sequences to complete (mask of %REQ_FSEQ_*, can be zero) * @error: whether an error occurred * * @rq just completed @seq part of its flush sequence, record the * completion and trigger the next step. * * CONTEXT: * spin_lock_irq(fq->mq_flush_lock) */ static void blk_flush_complete_seq(struct request *rq, struct blk_flush_queue *fq, unsigned int seq, blk_status_t error) { struct request_queue *q = rq->q; struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx]; blk_opf_t cmd_flags; BUG_ON(rq->flush.seq & seq); rq->flush.seq |= seq; cmd_flags = rq->cmd_flags; if (likely(!error)) seq = blk_flush_cur_seq(rq); else seq = REQ_FSEQ_DONE; switch (seq) { case REQ_FSEQ_PREFLUSH: case REQ_FSEQ_POSTFLUSH: /* queue for flush */ if (list_empty(pending)) fq->flush_pending_since = jiffies; list_add_tail(&rq->queuelist, pending); break; case REQ_FSEQ_DATA: fq->flush_data_in_flight++; spin_lock(&q->requeue_lock); list_move(&rq->queuelist, &q->requeue_list); spin_unlock(&q->requeue_lock); blk_mq_kick_requeue_list(q); break; case REQ_FSEQ_DONE: /* * @rq was previously adjusted by blk_insert_flush() for * flush sequencing and may already have gone through the * flush data request completion path. Restore @rq for * normal completion and end it. */ list_del_init(&rq->queuelist); blk_flush_restore_request(rq); blk_mq_end_request(rq, error); break; default: BUG(); } blk_kick_flush(q, fq, cmd_flags); } static enum rq_end_io_ret flush_end_io(struct request *flush_rq, blk_status_t error) { struct request_queue *q = flush_rq->q; struct list_head *running; struct request *rq, *n; unsigned long flags = 0; struct blk_flush_queue *fq = blk_get_flush_queue(flush_rq->mq_ctx); /* release the tag's ownership to the req cloned from */ spin_lock_irqsave(&fq->mq_flush_lock, flags); if (!req_ref_put_and_test(flush_rq)) { fq->rq_status = error; spin_unlock_irqrestore(&fq->mq_flush_lock, flags); return RQ_END_IO_NONE; } blk_account_io_flush(flush_rq); /* * Flush request has to be marked as IDLE when it is really ended * because its .end_io() is called from timeout code path too for * avoiding use-after-free. */ WRITE_ONCE(flush_rq->state, MQ_RQ_IDLE); if (fq->rq_status != BLK_STS_OK) { error = fq->rq_status; fq->rq_status = BLK_STS_OK; } if (!q->elevator) { flush_rq->tag = BLK_MQ_NO_TAG; } else { blk_mq_put_driver_tag(flush_rq); flush_rq->internal_tag = BLK_MQ_NO_TAG; } running = &fq->flush_queue[fq->flush_running_idx]; BUG_ON(fq->flush_pending_idx == fq->flush_running_idx); /* account completion of the flush request */ fq->flush_running_idx ^= 1; /* and push the waiting requests to the next stage */ list_for_each_entry_safe(rq, n, running, queuelist) { unsigned int seq = blk_flush_cur_seq(rq); BUG_ON(seq != REQ_FSEQ_PREFLUSH && seq != REQ_FSEQ_POSTFLUSH); list_del_init(&rq->queuelist); blk_flush_complete_seq(rq, fq, seq, error); } spin_unlock_irqrestore(&fq->mq_flush_lock, flags); return RQ_END_IO_NONE; } bool is_flush_rq(struct request *rq) { return rq->end_io == flush_end_io; } /** * blk_kick_flush - consider issuing flush request * @q: request_queue being kicked * @fq: flush queue * @flags: cmd_flags of the original request * * Flush related states of @q have changed, consider issuing flush request. * Please read the comment at the top of this file for more info. * * CONTEXT: * spin_lock_irq(fq->mq_flush_lock) * */ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, blk_opf_t flags) { struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx]; struct request *first_rq = list_first_entry(pending, struct request, queuelist); struct request *flush_rq = fq->flush_rq; /* C1 described at the top of this file */ if (fq->flush_pending_idx != fq->flush_running_idx || list_empty(pending)) return; /* C2 and C3 */ if (fq->flush_data_in_flight && time_before(jiffies, fq->flush_pending_since + FLUSH_PENDING_TIMEOUT)) return; /* * Issue flush and toggle pending_idx. This makes pending_idx * different from running_idx, which means flush is in flight. */ fq->flush_pending_idx ^= 1; blk_rq_init(q, flush_rq); /* * In case of none scheduler, borrow tag from the first request * since they can't be in flight at the same time. And acquire * the tag's ownership for flush req. * * In case of IO scheduler, flush rq need to borrow scheduler tag * just for cheating put/get driver tag. */ flush_rq->mq_ctx = first_rq->mq_ctx; flush_rq->mq_hctx = first_rq->mq_hctx; if (!q->elevator) flush_rq->tag = first_rq->tag; else flush_rq->internal_tag = first_rq->internal_tag; flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH; flush_rq->cmd_flags |= (flags & REQ_DRV) | (flags & REQ_FAILFAST_MASK); flush_rq->rq_flags |= RQF_FLUSH_SEQ; flush_rq->end_io = flush_end_io; /* * Order WRITE ->end_io and WRITE rq->ref, and its pair is the one * implied in refcount_inc_not_zero() called from * blk_mq_find_and_get_req(), which orders WRITE/READ flush_rq->ref * and READ flush_rq->end_io */ smp_wmb(); req_ref_set(flush_rq, 1); spin_lock(&q->requeue_lock); list_add_tail(&flush_rq->queuelist, &q->flush_list); spin_unlock(&q->requeue_lock); blk_mq_kick_requeue_list(q); } static enum rq_end_io_ret mq_flush_data_end_io(struct request *rq, blk_status_t error) { struct request_queue *q = rq->q; struct blk_mq_hw_ctx *hctx = rq->mq_hctx; struct blk_mq_ctx *ctx = rq->mq_ctx; unsigned long flags; struct blk_flush_queue *fq = blk_get_flush_queue(ctx); if (q->elevator) { WARN_ON(rq->tag < 0); blk_mq_put_driver_tag(rq); } /* * After populating an empty queue, kick it to avoid stall. Read * the comment in flush_end_io(). */ spin_lock_irqsave(&fq->mq_flush_lock, flags); fq->flush_data_in_flight--; /* * May have been corrupted by rq->rq_next reuse, we need to * re-initialize rq->queuelist before reusing it here. */ INIT_LIST_HEAD(&rq->queuelist); blk_flush_complete_seq(rq, fq, REQ_FSEQ_DATA, error); spin_unlock_irqrestore(&fq->mq_flush_lock, flags); blk_mq_sched_restart(hctx); return RQ_END_IO_NONE; } static void blk_rq_init_flush(struct request *rq) { rq->flush.seq = 0; rq->rq_flags |= RQF_FLUSH_SEQ; rq->flush.saved_end_io = rq->end_io; /* Usually NULL */ rq->end_io = mq_flush_data_end_io; } /* * Insert a PREFLUSH/FUA request into the flush state machine. * Returns true if the request has been consumed by the flush state machine, * or false if the caller should continue to process it. */ bool blk_insert_flush(struct request *rq) { struct request_queue *q = rq->q; struct blk_flush_queue *fq = blk_get_flush_queue(rq->mq_ctx); bool supports_fua = q->limits.features & BLK_FEAT_FUA; unsigned int policy = 0; /* FLUSH/FUA request must never be merged */ WARN_ON_ONCE(rq->bio != rq->biotail); if (blk_rq_sectors(rq)) policy |= REQ_FSEQ_DATA; /* * Check which flushes we need to sequence for this operation. */ if (blk_queue_write_cache(q)) { if (rq->cmd_flags & REQ_PREFLUSH) policy |= REQ_FSEQ_PREFLUSH; if ((rq->cmd_flags & REQ_FUA) && !supports_fua) policy |= REQ_FSEQ_POSTFLUSH; } /* * @policy now records what operations need to be done. Adjust * REQ_PREFLUSH and FUA for the driver. */ rq->cmd_flags &= ~REQ_PREFLUSH; if (!supports_fua) rq->cmd_flags &= ~REQ_FUA; /* * REQ_PREFLUSH|REQ_FUA implies REQ_SYNC, so if we clear any * of those flags, we have to set REQ_SYNC to avoid skewing * the request accounting. */ rq->cmd_flags |= REQ_SYNC; switch (policy) { case 0: /* * An empty flush handed down from a stacking driver may * translate into nothing if the underlying device does not * advertise a write-back cache. In this case, simply * complete the request. */ blk_mq_end_request(rq, 0); return true; case REQ_FSEQ_DATA: /* * If there's data, but no flush is necessary, the request can * be processed directly without going through flush machinery. * Queue for normal execution. */ return false; case REQ_FSEQ_DATA | REQ_FSEQ_POSTFLUSH: /* * Initialize the flush fields and completion handler to trigger * the post flush, and then just pass the command on. */ blk_rq_init_flush(rq); rq->flush.seq |= REQ_FSEQ_PREFLUSH; spin_lock_irq(&fq->mq_flush_lock); fq->flush_data_in_flight++; spin_unlock_irq(&fq->mq_flush_lock); return false; default: /* * Mark the request as part of a flush sequence and submit it * for further processing to the flush state machine. */ blk_rq_init_flush(rq); spin_lock_irq(&fq->mq_flush_lock); blk_flush_complete_seq(rq, fq, REQ_FSEQ_ACTIONS & ~policy, 0); spin_unlock_irq(&fq->mq_flush_lock); return true; } } /** * blkdev_issue_flush - queue a flush * @bdev: blockdev to issue flush for * * Description: * Issue a flush for the block device in question. */ int blkdev_issue_flush(struct block_device *bdev) { struct bio bio; bio_init(&bio, bdev, NULL, 0, REQ_OP_WRITE | REQ_PREFLUSH); return submit_bio_wait(&bio); } EXPORT_SYMBOL(blkdev_issue_flush); struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size, gfp_t flags) { struct blk_flush_queue *fq; int rq_sz = sizeof(struct request); fq = kzalloc_node(sizeof(*fq), flags, node); if (!fq) goto fail; spin_lock_init(&fq->mq_flush_lock); rq_sz = round_up(rq_sz + cmd_size, cache_line_size()); fq->flush_rq = kzalloc_node(rq_sz, flags, node); if (!fq->flush_rq) goto fail_rq; INIT_LIST_HEAD(&fq->flush_queue[0]); INIT_LIST_HEAD(&fq->flush_queue[1]); return fq; fail_rq: kfree(fq); fail: return NULL; } void blk_free_flush_queue(struct blk_flush_queue *fq) { /* bio based request queue hasn't flush queue */ if (!fq) return; kfree(fq->flush_rq); kfree(fq); } /* * Allow driver to set its own lock class to fq->mq_flush_lock for * avoiding lockdep complaint. * * flush_end_io() may be called recursively from some driver, such as * nvme-loop, so lockdep may complain 'possible recursive locking' because * all 'struct blk_flush_queue' instance share same mq_flush_lock lock class * key. We need to assign different lock class for these driver's * fq->mq_flush_lock for avoiding the lockdep warning. * * Use dynamically allocated lock class key for each 'blk_flush_queue' * instance is over-kill, and more worse it introduces horrible boot delay * issue because synchronize_rcu() is implied in lockdep_unregister_key which * is called for each hctx release. SCSI probing may synchronously create and * destroy lots of MQ request_queues for non-existent devices, and some robot * test kernel always enable lockdep option. It is observed that more than half * an hour is taken during SCSI MQ probe with per-fq lock class. */ void blk_mq_hctx_set_fq_lock_class(struct blk_mq_hw_ctx *hctx, struct lock_class_key *key) { lockdep_set_class(&hctx->fq->mq_flush_lock, key); } EXPORT_SYMBOL_GPL(blk_mq_hctx_set_fq_lock_class);
81 81 81 81 81 70 70 70 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc. * * This file contains power management functions related to interrupts. */ #include <linux/irq.h> #include <linux/module.h> #include <linux/interrupt.h> #include <linux/suspend.h> #include <linux/syscore_ops.h> #include "internals.h" void irq_pm_handle_wakeup(struct irq_desc *desc) { irqd_clear(&desc->irq_data, IRQD_WAKEUP_ARMED); desc->istate |= IRQS_SUSPENDED | IRQS_PENDING; desc->depth++; irq_disable(desc); pm_system_irq_wakeup(irq_desc_get_irq(desc)); } /* * Called from __setup_irq() with desc->lock held after @action has * been installed in the action chain. */ void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action) { desc->nr_actions++; if (action->flags & IRQF_FORCE_RESUME) desc->force_resume_depth++; WARN_ON_ONCE(desc->force_resume_depth && desc->force_resume_depth != desc->nr_actions); if (action->flags & IRQF_NO_SUSPEND) desc->no_suspend_depth++; else if (action->flags & IRQF_COND_SUSPEND) desc->cond_suspend_depth++; WARN_ON_ONCE(desc->no_suspend_depth && (desc->no_suspend_depth + desc->cond_suspend_depth) != desc->nr_actions); } /* * Called from __free_irq() with desc->lock held after @action has * been removed from the action chain. */ void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action) { desc->nr_actions--; if (action->flags & IRQF_FORCE_RESUME) desc->force_resume_depth--; if (action->flags & IRQF_NO_SUSPEND) desc->no_suspend_depth--; else if (action->flags & IRQF_COND_SUSPEND) desc->cond_suspend_depth--; } static bool suspend_device_irq(struct irq_desc *desc) { unsigned long chipflags = irq_desc_get_chip(desc)->flags; struct irq_data *irqd = &desc->irq_data; if (!desc->action || irq_desc_is_chained(desc) || desc->no_suspend_depth) return false; if (irqd_is_wakeup_set(irqd)) { irqd_set(irqd, IRQD_WAKEUP_ARMED); if ((chipflags & IRQCHIP_ENABLE_WAKEUP_ON_SUSPEND) && irqd_irq_disabled(irqd)) { /* * Interrupt marked for wakeup is in disabled state. * Enable interrupt here to unmask/enable in irqchip * to be able to resume with such interrupts. */ __enable_irq(desc); irqd_set(irqd, IRQD_IRQ_ENABLED_ON_SUSPEND); } /* * We return true here to force the caller to issue * synchronize_irq(). We need to make sure that the * IRQD_WAKEUP_ARMED is visible before we return from * suspend_device_irqs(). */ return true; } desc->istate |= IRQS_SUSPENDED; __disable_irq(desc); /* * Hardware which has no wakeup source configuration facility * requires that the non wakeup interrupts are masked at the * chip level. The chip implementation indicates that with * IRQCHIP_MASK_ON_SUSPEND. */ if (chipflags & IRQCHIP_MASK_ON_SUSPEND) mask_irq(desc); return true; } /** * suspend_device_irqs - disable all currently enabled interrupt lines * * During system-wide suspend or hibernation device drivers need to be * prevented from receiving interrupts and this function is provided * for this purpose. * * So we disable all interrupts and mark them IRQS_SUSPENDED except * for those which are unused, those which are marked as not * suspendable via an interrupt request with the flag IRQF_NO_SUSPEND * set and those which are marked as active wakeup sources. * * The active wakeup sources are handled by the flow handler entry * code which checks for the IRQD_WAKEUP_ARMED flag, suspends the * interrupt and notifies the pm core about the wakeup. */ void suspend_device_irqs(void) { struct irq_desc *desc; int irq; for_each_irq_desc(irq, desc) { bool sync; if (irq_settings_is_nested_thread(desc)) continue; scoped_guard(raw_spinlock_irqsave, &desc->lock) sync = suspend_device_irq(desc); if (sync) synchronize_irq(irq); } } static void resume_irq(struct irq_desc *desc) { struct irq_data *irqd = &desc->irq_data; irqd_clear(irqd, IRQD_WAKEUP_ARMED); if (irqd_is_enabled_on_suspend(irqd)) { /* * Interrupt marked for wakeup was enabled during suspend * entry. Disable such interrupts to restore them back to * original state. */ __disable_irq(desc); irqd_clear(irqd, IRQD_IRQ_ENABLED_ON_SUSPEND); } if (desc->istate & IRQS_SUSPENDED) goto resume; /* Force resume the interrupt? */ if (!desc->force_resume_depth) return; /* Pretend that it got disabled ! */ desc->depth++; irq_state_set_disabled(desc); irq_state_set_masked(desc); resume: desc->istate &= ~IRQS_SUSPENDED; __enable_irq(desc); } static void resume_irqs(bool want_early) { struct irq_desc *desc; int irq; for_each_irq_desc(irq, desc) { bool is_early = desc->action && desc->action->flags & IRQF_EARLY_RESUME; if (!is_early && want_early) continue; if (irq_settings_is_nested_thread(desc)) continue; guard(raw_spinlock_irqsave)(&desc->lock); resume_irq(desc); } } /** * rearm_wake_irq - rearm a wakeup interrupt line after signaling wakeup * @irq: Interrupt to rearm */ void rearm_wake_irq(unsigned int irq) { scoped_irqdesc_get_and_buslock(irq, IRQ_GET_DESC_CHECK_GLOBAL) { struct irq_desc *desc = scoped_irqdesc; if (!(desc->istate & IRQS_SUSPENDED) || !irqd_is_wakeup_set(&desc->irq_data)) return; desc->istate &= ~IRQS_SUSPENDED; irqd_set(&desc->irq_data, IRQD_WAKEUP_ARMED); __enable_irq(desc); } } /** * irq_pm_syscore_resume - enable interrupt lines early * @data: syscore context * * Enable all interrupt lines with %IRQF_EARLY_RESUME set. */ static void irq_pm_syscore_resume(void *data) { resume_irqs(true); } static const struct syscore_ops irq_pm_syscore_ops = { .resume = irq_pm_syscore_resume, }; static struct syscore irq_pm_syscore = { .ops = &irq_pm_syscore_ops, }; static int __init irq_pm_init_ops(void) { register_syscore(&irq_pm_syscore); return 0; } device_initcall(irq_pm_init_ops); /** * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs() * * Enable all non-%IRQF_EARLY_RESUME interrupt lines previously * disabled by suspend_device_irqs() that have the IRQS_SUSPENDED flag * set as well as those with %IRQF_FORCE_RESUME. */ void resume_device_irqs(void) { resume_irqs(false); }
53 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_IF_MACVLAN_H #define _LINUX_IF_MACVLAN_H #include <linux/if_link.h> #include <linux/if_vlan.h> #include <linux/list.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <net/netlink.h> #include <linux/u64_stats_sync.h> struct macvlan_port; #define MACVLAN_MC_FILTER_BITS 8 #define MACVLAN_MC_FILTER_SZ (1 << MACVLAN_MC_FILTER_BITS) struct macvlan_dev { struct net_device *dev; struct list_head list; struct hlist_node hlist; struct macvlan_port *port; struct net_device *lowerdev; netdevice_tracker dev_tracker; void *accel_priv; struct vlan_pcpu_stats __percpu *pcpu_stats; DECLARE_BITMAP(mc_filter, MACVLAN_MC_FILTER_SZ); netdev_features_t set_features; enum macvlan_mode mode; u16 flags; unsigned int macaddr_count; u32 bc_queue_len_req; #ifdef CONFIG_NET_POLL_CONTROLLER struct netpoll *netpoll; #endif }; static inline void macvlan_count_rx(const struct macvlan_dev *vlan, unsigned int len, bool success, bool multicast) { if (likely(success)) { struct vlan_pcpu_stats *pcpu_stats; pcpu_stats = get_cpu_ptr(vlan->pcpu_stats); u64_stats_update_begin(&pcpu_stats->syncp); u64_stats_inc(&pcpu_stats->rx_packets); u64_stats_add(&pcpu_stats->rx_bytes, len); if (multicast) u64_stats_inc(&pcpu_stats->rx_multicast); u64_stats_update_end(&pcpu_stats->syncp); put_cpu_ptr(vlan->pcpu_stats); } else { this_cpu_inc(vlan->pcpu_stats->rx_errors); } } extern void macvlan_common_setup(struct net_device *dev); struct rtnl_newlink_params; extern int macvlan_common_newlink(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack); extern void macvlan_dellink(struct net_device *dev, struct list_head *head); extern int macvlan_link_register(struct rtnl_link_ops *ops); #if IS_ENABLED(CONFIG_MACVLAN) static inline struct net_device * macvlan_dev_real_dev(const struct net_device *dev) { struct macvlan_dev *macvlan = netdev_priv(dev); return macvlan->lowerdev; } #else static inline struct net_device * macvlan_dev_real_dev(const struct net_device *dev) { BUG(); return NULL; } #endif static inline void *macvlan_accel_priv(struct net_device *dev) { struct macvlan_dev *macvlan = netdev_priv(dev); return macvlan->accel_priv; } static inline bool macvlan_supports_dest_filter(struct net_device *dev) { struct macvlan_dev *macvlan = netdev_priv(dev); return macvlan->mode == MACVLAN_MODE_PRIVATE || macvlan->mode == MACVLAN_MODE_VEPA || macvlan->mode == MACVLAN_MODE_BRIDGE; } static inline int macvlan_release_l2fw_offload(struct net_device *dev) { struct macvlan_dev *macvlan = netdev_priv(dev); macvlan->accel_priv = NULL; return dev_uc_add(macvlan->lowerdev, dev->dev_addr); } #endif /* _LINUX_IF_MACVLAN_H */
33 33 1 32 33 33 105 105 105 100 100 100 100 105 33 172 3 3 3 2 3 30 31 31 31 31 31 30 106 106 106 105 105 105 105 104 1 1 1 1 104 102 99 99 99 98 99 109 21 111 111 33 33 33 33 33 33 33 30 31 31 31 31 2 2 2 2 2 2 26 26 93 93 3 3 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 // SPDX-License-Identifier: GPL-2.0-or-later /* * ALSA sequencer Timer * Copyright (c) 1998-1999 by Frank van de Pol <fvdpol@coil.demon.nl> * Jaroslav Kysela <perex@perex.cz> */ #include <sound/core.h> #include <linux/slab.h> #include "seq_timer.h" #include "seq_queue.h" #include "seq_info.h" /* allowed sequencer timer frequencies, in Hz */ #define MIN_FREQUENCY 10 #define MAX_FREQUENCY 6250 #define DEFAULT_FREQUENCY 1000 #define SKEW_BASE 0x10000 /* 16bit shift */ static void snd_seq_timer_set_tick_resolution(struct snd_seq_timer *tmr) { unsigned int threshold = tmr->tempo_base == 1000 ? 1000000 : 10000; if (tmr->tempo < threshold) tmr->tick.resolution = (tmr->tempo * tmr->tempo_base) / tmr->ppq; else { /* might overflow.. */ unsigned int s; s = tmr->tempo % tmr->ppq; s = (s * tmr->tempo_base) / tmr->ppq; tmr->tick.resolution = (tmr->tempo / tmr->ppq) * tmr->tempo_base; tmr->tick.resolution += s; } if (tmr->tick.resolution <= 0) tmr->tick.resolution = 1; snd_seq_timer_update_tick(&tmr->tick, 0); } /* create new timer (constructor) */ struct snd_seq_timer *snd_seq_timer_new(void) { struct snd_seq_timer *tmr; tmr = kzalloc(sizeof(*tmr), GFP_KERNEL); if (!tmr) return NULL; spin_lock_init(&tmr->lock); /* reset setup to defaults */ snd_seq_timer_defaults(tmr); /* reset time */ snd_seq_timer_reset(tmr); return tmr; } /* delete timer (destructor) */ void snd_seq_timer_delete(struct snd_seq_timer **tmr) { struct snd_seq_timer *t = *tmr; *tmr = NULL; if (t == NULL) { pr_debug("ALSA: seq: snd_seq_timer_delete() called with NULL timer\n"); return; } t->running = 0; /* reset time */ snd_seq_timer_stop(t); snd_seq_timer_reset(t); kfree(t); } void snd_seq_timer_defaults(struct snd_seq_timer * tmr) { guard(spinlock_irqsave)(&tmr->lock); /* setup defaults */ tmr->ppq = 96; /* 96 PPQ */ tmr->tempo = 500000; /* 120 BPM */ tmr->tempo_base = 1000; /* 1us */ snd_seq_timer_set_tick_resolution(tmr); tmr->running = 0; tmr->type = SNDRV_SEQ_TIMER_ALSA; tmr->alsa_id.dev_class = seq_default_timer_class; tmr->alsa_id.dev_sclass = seq_default_timer_sclass; tmr->alsa_id.card = seq_default_timer_card; tmr->alsa_id.device = seq_default_timer_device; tmr->alsa_id.subdevice = seq_default_timer_subdevice; tmr->preferred_resolution = seq_default_timer_resolution; tmr->skew = tmr->skew_base = SKEW_BASE; } static void seq_timer_reset(struct snd_seq_timer *tmr) { /* reset time & songposition */ tmr->cur_time.tv_sec = 0; tmr->cur_time.tv_nsec = 0; tmr->tick.cur_tick = 0; tmr->tick.fraction = 0; } void snd_seq_timer_reset(struct snd_seq_timer *tmr) { guard(spinlock_irqsave)(&tmr->lock); seq_timer_reset(tmr); } /* called by timer interrupt routine. the period time since previous invocation is passed */ static void snd_seq_timer_interrupt(struct snd_timer_instance *timeri, unsigned long resolution, unsigned long ticks) { struct snd_seq_queue *q = timeri->callback_data; struct snd_seq_timer *tmr; if (q == NULL) return; tmr = q->timer; if (tmr == NULL) return; scoped_guard(spinlock_irqsave, &tmr->lock) { if (!tmr->running) return; resolution *= ticks; if (tmr->skew != tmr->skew_base) { /* FIXME: assuming skew_base = 0x10000 */ resolution = (resolution >> 16) * tmr->skew + (((resolution & 0xffff) * tmr->skew) >> 16); } /* update timer */ snd_seq_inc_time_nsec(&tmr->cur_time, resolution); /* calculate current tick */ snd_seq_timer_update_tick(&tmr->tick, resolution); /* register actual time of this timer update */ ktime_get_ts64(&tmr->last_update); } /* check queues and dispatch events */ snd_seq_check_queue(q, 1, 0); } /* set current tempo */ int snd_seq_timer_set_tempo(struct snd_seq_timer * tmr, int tempo) { if (snd_BUG_ON(!tmr)) return -EINVAL; if (tempo <= 0) return -EINVAL; guard(spinlock_irqsave)(&tmr->lock); if ((unsigned int)tempo != tmr->tempo) { tmr->tempo = tempo; snd_seq_timer_set_tick_resolution(tmr); } return 0; } /* set current tempo, ppq and base in a shot */ int snd_seq_timer_set_tempo_ppq(struct snd_seq_timer *tmr, int tempo, int ppq, unsigned int tempo_base) { int changed; if (snd_BUG_ON(!tmr)) return -EINVAL; if (tempo <= 0 || ppq <= 0) return -EINVAL; /* allow only 10ns or 1us tempo base for now */ if (tempo_base && tempo_base != 10 && tempo_base != 1000) return -EINVAL; guard(spinlock_irqsave)(&tmr->lock); if (tmr->running && (ppq != tmr->ppq)) { /* refuse to change ppq on running timers */ /* because it will upset the song position (ticks) */ pr_debug("ALSA: seq: cannot change ppq of a running timer\n"); return -EBUSY; } changed = (tempo != tmr->tempo) || (ppq != tmr->ppq); tmr->tempo = tempo; tmr->ppq = ppq; tmr->tempo_base = tempo_base ? tempo_base : 1000; if (changed) snd_seq_timer_set_tick_resolution(tmr); return 0; } /* set current tick position */ int snd_seq_timer_set_position_tick(struct snd_seq_timer *tmr, snd_seq_tick_time_t position) { if (snd_BUG_ON(!tmr)) return -EINVAL; guard(spinlock_irqsave)(&tmr->lock); tmr->tick.cur_tick = position; tmr->tick.fraction = 0; return 0; } /* set current real-time position */ int snd_seq_timer_set_position_time(struct snd_seq_timer *tmr, snd_seq_real_time_t position) { if (snd_BUG_ON(!tmr)) return -EINVAL; snd_seq_sanity_real_time(&position); guard(spinlock_irqsave)(&tmr->lock); tmr->cur_time = position; return 0; } /* set timer skew */ int snd_seq_timer_set_skew(struct snd_seq_timer *tmr, unsigned int skew, unsigned int base) { if (snd_BUG_ON(!tmr)) return -EINVAL; /* FIXME */ if (base != SKEW_BASE) { pr_debug("ALSA: seq: invalid skew base 0x%x\n", base); return -EINVAL; } guard(spinlock_irqsave)(&tmr->lock); tmr->skew = skew; return 0; } int snd_seq_timer_open(struct snd_seq_queue *q) { struct snd_timer_instance *t; struct snd_seq_timer *tmr; char str[32]; int err; tmr = q->timer; if (snd_BUG_ON(!tmr)) return -EINVAL; if (tmr->timeri) return -EBUSY; sprintf(str, "sequencer queue %i", q->queue); if (tmr->type != SNDRV_SEQ_TIMER_ALSA) /* standard ALSA timer */ return -EINVAL; if (tmr->alsa_id.dev_class != SNDRV_TIMER_CLASS_SLAVE) tmr->alsa_id.dev_sclass = SNDRV_TIMER_SCLASS_SEQUENCER; t = snd_timer_instance_new(str); if (!t) return -ENOMEM; t->callback = snd_seq_timer_interrupt; t->callback_data = q; t->flags |= SNDRV_TIMER_IFLG_AUTO; err = snd_timer_open(t, &tmr->alsa_id, q->queue); if (err < 0 && tmr->alsa_id.dev_class != SNDRV_TIMER_CLASS_SLAVE) { if (tmr->alsa_id.dev_class != SNDRV_TIMER_CLASS_GLOBAL || tmr->alsa_id.device != SNDRV_TIMER_GLOBAL_SYSTEM) { struct snd_timer_id tid; memset(&tid, 0, sizeof(tid)); tid.dev_class = SNDRV_TIMER_CLASS_GLOBAL; tid.dev_sclass = SNDRV_TIMER_SCLASS_SEQUENCER; tid.card = -1; tid.device = SNDRV_TIMER_GLOBAL_SYSTEM; err = snd_timer_open(t, &tid, q->queue); } } if (err < 0) { pr_err("ALSA: seq fatal error: cannot create timer (%i)\n", err); snd_timer_instance_free(t); return err; } scoped_guard(spinlock_irq, &tmr->lock) { if (tmr->timeri) err = -EBUSY; else tmr->timeri = t; } if (err < 0) { snd_timer_close(t); snd_timer_instance_free(t); return err; } return 0; } int snd_seq_timer_close(struct snd_seq_queue *q) { struct snd_seq_timer *tmr; struct snd_timer_instance *t; tmr = q->timer; if (snd_BUG_ON(!tmr)) return -EINVAL; scoped_guard(spinlock_irq, &tmr->lock) { t = tmr->timeri; tmr->timeri = NULL; } if (t) { snd_timer_close(t); snd_timer_instance_free(t); } return 0; } static int seq_timer_stop(struct snd_seq_timer *tmr) { if (! tmr->timeri) return -EINVAL; if (!tmr->running) return 0; tmr->running = 0; snd_timer_pause(tmr->timeri); return 0; } int snd_seq_timer_stop(struct snd_seq_timer *tmr) { guard(spinlock_irqsave)(&tmr->lock); return seq_timer_stop(tmr); } static int initialize_timer(struct snd_seq_timer *tmr) { struct snd_timer *t; unsigned long freq; t = tmr->timeri->timer; if (!t) return -EINVAL; freq = tmr->preferred_resolution; if (!freq) freq = DEFAULT_FREQUENCY; else if (freq < MIN_FREQUENCY) freq = MIN_FREQUENCY; else if (freq > MAX_FREQUENCY) freq = MAX_FREQUENCY; tmr->ticks = 1; if (!(t->hw.flags & SNDRV_TIMER_HW_SLAVE)) { unsigned long r = snd_timer_resolution(tmr->timeri); if (r) { tmr->ticks = (unsigned int)(1000000000uL / (r * freq)); if (! tmr->ticks) tmr->ticks = 1; } } tmr->initialized = 1; return 0; } static int seq_timer_start(struct snd_seq_timer *tmr) { if (! tmr->timeri) return -EINVAL; if (tmr->running) seq_timer_stop(tmr); seq_timer_reset(tmr); if (initialize_timer(tmr) < 0) return -EINVAL; snd_timer_start(tmr->timeri, tmr->ticks); tmr->running = 1; ktime_get_ts64(&tmr->last_update); return 0; } int snd_seq_timer_start(struct snd_seq_timer *tmr) { guard(spinlock_irqsave)(&tmr->lock); return seq_timer_start(tmr); } static int seq_timer_continue(struct snd_seq_timer *tmr) { if (! tmr->timeri) return -EINVAL; if (tmr->running) return -EBUSY; if (! tmr->initialized) { seq_timer_reset(tmr); if (initialize_timer(tmr) < 0) return -EINVAL; } snd_timer_start(tmr->timeri, tmr->ticks); tmr->running = 1; ktime_get_ts64(&tmr->last_update); return 0; } int snd_seq_timer_continue(struct snd_seq_timer *tmr) { guard(spinlock_irqsave)(&tmr->lock); return seq_timer_continue(tmr); } /* return current 'real' time. use timeofday() to get better granularity. */ snd_seq_real_time_t snd_seq_timer_get_cur_time(struct snd_seq_timer *tmr, bool adjust_ktime) { snd_seq_real_time_t cur_time; guard(spinlock_irqsave)(&tmr->lock); cur_time = tmr->cur_time; if (adjust_ktime && tmr->running) { struct timespec64 tm; ktime_get_ts64(&tm); tm = timespec64_sub(tm, tmr->last_update); cur_time.tv_nsec += tm.tv_nsec; cur_time.tv_sec += tm.tv_sec; snd_seq_sanity_real_time(&cur_time); } return cur_time; } /* TODO: use interpolation on tick queue (will only be useful for very high PPQ values) */ snd_seq_tick_time_t snd_seq_timer_get_cur_tick(struct snd_seq_timer *tmr) { guard(spinlock_irqsave)(&tmr->lock); return tmr->tick.cur_tick; } #ifdef CONFIG_SND_PROC_FS /* exported to seq_info.c */ void snd_seq_info_timer_read(struct snd_info_entry *entry, struct snd_info_buffer *buffer) { int idx; struct snd_seq_timer *tmr; struct snd_timer_instance *ti; unsigned long resolution; for (idx = 0; idx < SNDRV_SEQ_MAX_QUEUES; idx++) { struct snd_seq_queue *q __free(snd_seq_queue) = queueptr(idx); if (q == NULL) continue; scoped_guard(mutex, &q->timer_mutex) { tmr = q->timer; if (!tmr) break; ti = tmr->timeri; if (!ti) break; snd_iprintf(buffer, "Timer for queue %i : %s\n", q->queue, ti->timer->name); resolution = snd_timer_resolution(ti) * tmr->ticks; snd_iprintf(buffer, " Period time : %lu.%09lu\n", resolution / 1000000000, resolution % 1000000000); snd_iprintf(buffer, " Skew : %u / %u\n", tmr->skew, tmr->skew_base); } } } #endif /* CONFIG_SND_PROC_FS */
5 5 2 5 5 3 3 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 // SPDX-License-Identifier: GPL-2.0 #include <linux/cpumask.h> #include <linux/fs.h> #include <linux/init.h> #include <linux/interrupt.h> #include <linux/kernel_stat.h> #include <linux/proc_fs.h> #include <linux/sched.h> #include <linux/sched/stat.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/time.h> #include <linux/time_namespace.h> #include <linux/irqnr.h> #include <linux/sched/cputime.h> #include <linux/tick.h> #ifndef arch_irq_stat_cpu #define arch_irq_stat_cpu(cpu) 0 #endif #ifndef arch_irq_stat #define arch_irq_stat() 0 #endif u64 get_idle_time(struct kernel_cpustat *kcs, int cpu) { u64 idle, idle_usecs = -1ULL; if (cpu_online(cpu)) idle_usecs = get_cpu_idle_time_us(cpu, NULL); if (idle_usecs == -1ULL) /* !NO_HZ or cpu offline so we can rely on cpustat.idle */ idle = kcs->cpustat[CPUTIME_IDLE]; else idle = idle_usecs * NSEC_PER_USEC; return idle; } static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu) { u64 iowait, iowait_usecs = -1ULL; if (cpu_online(cpu)) iowait_usecs = get_cpu_iowait_time_us(cpu, NULL); if (iowait_usecs == -1ULL) /* !NO_HZ or cpu offline so we can rely on cpustat.iowait */ iowait = kcs->cpustat[CPUTIME_IOWAIT]; else iowait = iowait_usecs * NSEC_PER_USEC; return iowait; } static void show_irq_gap(struct seq_file *p, unsigned int gap) { static const char zeros[] = " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0"; while (gap > 0) { unsigned int inc; inc = min_t(unsigned int, gap, ARRAY_SIZE(zeros) / 2); seq_write(p, zeros, 2 * inc); gap -= inc; } } static void show_all_irqs(struct seq_file *p) { unsigned int i, next = 0; for_each_active_irq(i) { show_irq_gap(p, i - next); seq_put_decimal_ull(p, " ", kstat_irqs_usr(i)); next = i + 1; } show_irq_gap(p, irq_get_nr_irqs() - next); } static int show_stat(struct seq_file *p, void *v) { int i, j; u64 user, nice, system, idle, iowait, irq, softirq, steal; u64 guest, guest_nice; u64 sum = 0; u64 sum_softirq = 0; unsigned int per_softirq_sums[NR_SOFTIRQS] = {0}; struct timespec64 boottime; user = nice = system = idle = iowait = irq = softirq = steal = 0; guest = guest_nice = 0; getboottime64(&boottime); /* shift boot timestamp according to the timens offset */ timens_sub_boottime(&boottime); for_each_possible_cpu(i) { struct kernel_cpustat kcpustat; u64 *cpustat = kcpustat.cpustat; kcpustat_cpu_fetch(&kcpustat, i); user += cpustat[CPUTIME_USER]; nice += cpustat[CPUTIME_NICE]; system += cpustat[CPUTIME_SYSTEM]; idle += get_idle_time(&kcpustat, i); iowait += get_iowait_time(&kcpustat, i); irq += cpustat[CPUTIME_IRQ]; softirq += cpustat[CPUTIME_SOFTIRQ]; steal += cpustat[CPUTIME_STEAL]; guest += cpustat[CPUTIME_GUEST]; guest_nice += cpustat[CPUTIME_GUEST_NICE]; sum += kstat_cpu_irqs_sum(i); sum += arch_irq_stat_cpu(i); for (j = 0; j < NR_SOFTIRQS; j++) { unsigned int softirq_stat = kstat_softirqs_cpu(j, i); per_softirq_sums[j] += softirq_stat; sum_softirq += softirq_stat; } } sum += arch_irq_stat(); seq_put_decimal_ull(p, "cpu ", nsec_to_clock_t(user)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(nice)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(system)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(idle)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(iowait)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(irq)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(softirq)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(steal)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest_nice)); seq_putc(p, '\n'); for_each_online_cpu(i) { struct kernel_cpustat kcpustat; u64 *cpustat = kcpustat.cpustat; kcpustat_cpu_fetch(&kcpustat, i); /* Copy values here to work around gcc-2.95.3, gcc-2.96 */ user = cpustat[CPUTIME_USER]; nice = cpustat[CPUTIME_NICE]; system = cpustat[CPUTIME_SYSTEM]; idle = get_idle_time(&kcpustat, i); iowait = get_iowait_time(&kcpustat, i); irq = cpustat[CPUTIME_IRQ]; softirq = cpustat[CPUTIME_SOFTIRQ]; steal = cpustat[CPUTIME_STEAL]; guest = cpustat[CPUTIME_GUEST]; guest_nice = cpustat[CPUTIME_GUEST_NICE]; seq_printf(p, "cpu%d", i); seq_put_decimal_ull(p, " ", nsec_to_clock_t(user)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(nice)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(system)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(idle)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(iowait)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(irq)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(softirq)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(steal)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest_nice)); seq_putc(p, '\n'); } seq_put_decimal_ull(p, "intr ", (unsigned long long)sum); show_all_irqs(p); seq_printf(p, "\nctxt %llu\n" "btime %llu\n" "processes %lu\n" "procs_running %u\n" "procs_blocked %u\n", nr_context_switches(), (unsigned long long)boottime.tv_sec, total_forks, nr_running(), nr_iowait()); seq_put_decimal_ull(p, "softirq ", (unsigned long long)sum_softirq); for (i = 0; i < NR_SOFTIRQS; i++) seq_put_decimal_ull(p, " ", per_softirq_sums[i]); seq_putc(p, '\n'); return 0; } static int stat_open(struct inode *inode, struct file *file) { unsigned int size = 1024 + 128 * num_online_cpus(); /* minimum size to display an interrupt count : 2 bytes */ size += 2 * irq_get_nr_irqs(); return single_open_size(file, show_stat, NULL, size); } static const struct proc_ops stat_proc_ops = { .proc_flags = PROC_ENTRY_PERMANENT, .proc_open = stat_open, .proc_read_iter = seq_read_iter, .proc_lseek = seq_lseek, .proc_release = single_release, }; static int __init proc_stat_init(void) { proc_create("stat", 0, NULL, &stat_proc_ops); return 0; } fs_initcall(proc_stat_init);
65 30 30 30 30 30 30 30 30 14 30 40 40 40 40 40 40 40 40 24 40 40 40 39 37 40 16 40 40 40 40 40 31 30 30 30 1 30 28 30 30 30 31 31 31 31 30 30 31 2 2 2 39 40 40 40 39 39 39 117 7 4 4 4 4 4 4 3 3 4 1 5 5 5 5 6 6 6 6 6 6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 /* * \author Rickard E. (Rik) Faith <faith@valinux.com> * \author Daryll Strauss <daryll@valinux.com> * \author Gareth Hughes <gareth@valinux.com> */ /* * Created: Mon Jan 4 08:58:31 1999 by faith@valinux.com * * Copyright 1999 Precision Insight, Inc., Cedar Park, Texas. * Copyright 2000 VA Linux Systems, Inc., Sunnyvale, California. * All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice (including the next * paragraph) shall be included in all copies or substantial portions of the * Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * VA LINUX SYSTEMS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * OTHER DEALINGS IN THE SOFTWARE. */ #include <linux/anon_inodes.h> #include <linux/dma-fence.h> #include <linux/export.h> #include <linux/file.h> #include <linux/module.h> #include <linux/pci.h> #include <linux/poll.h> #include <linux/slab.h> #include <linux/vga_switcheroo.h> #include <drm/drm_client_event.h> #include <drm/drm_drv.h> #include <drm/drm_file.h> #include <drm/drm_gem.h> #include <drm/drm_print.h> #include <drm/drm_debugfs.h> #include "drm_crtc_internal.h" #include "drm_internal.h" /* from BKL pushdown */ DEFINE_MUTEX(drm_global_mutex); bool drm_dev_needs_global_mutex(struct drm_device *dev) { /* * The deprecated ->load callback must be called after the driver is * already registered. This means such drivers rely on the BKL to make * sure an open can't proceed until the driver is actually fully set up. * Similar hilarity holds for the unload callback. */ if (dev->driver->load || dev->driver->unload) return true; return false; } /** * DOC: file operations * * Drivers must define the file operations structure that forms the DRM * userspace API entry point, even though most of those operations are * implemented in the DRM core. The resulting &struct file_operations must be * stored in the &drm_driver.fops field. The mandatory functions are drm_open(), * drm_read(), drm_ioctl() and drm_compat_ioctl() if CONFIG_COMPAT is enabled * Note that drm_compat_ioctl will be NULL if CONFIG_COMPAT=n, so there's no * need to sprinkle #ifdef into the code. Drivers which implement private ioctls * that require 32/64 bit compatibility support must provide their own * &file_operations.compat_ioctl handler that processes private ioctls and calls * drm_compat_ioctl() for core ioctls. * * In addition drm_read() and drm_poll() provide support for DRM events. DRM * events are a generic and extensible means to send asynchronous events to * userspace through the file descriptor. They are used to send vblank event and * page flip completions by the KMS API. But drivers can also use it for their * own needs, e.g. to signal completion of rendering. * * For the driver-side event interface see drm_event_reserve_init() and * drm_send_event() as the main starting points. * * The memory mapping implementation will vary depending on how the driver * manages memory. For GEM-based drivers this is drm_gem_mmap(). * * No other file operations are supported by the DRM userspace API. Overall the * following is an example &file_operations structure:: * * static const example_drm_fops = { * .owner = THIS_MODULE, * .open = drm_open, * .release = drm_release, * .unlocked_ioctl = drm_ioctl, * .compat_ioctl = drm_compat_ioctl, // NULL if CONFIG_COMPAT=n * .poll = drm_poll, * .read = drm_read, * .mmap = drm_gem_mmap, * }; * * For plain GEM based drivers there is the DEFINE_DRM_GEM_FOPS() macro, and for * DMA based drivers there is the DEFINE_DRM_GEM_DMA_FOPS() macro to make this * simpler. * * The driver's &file_operations must be stored in &drm_driver.fops. * * For driver-private IOCTL handling see the more detailed discussion in * :ref:`IOCTL support in the userland interfaces chapter<drm_driver_ioctl>`. */ /** * drm_file_alloc - allocate file context * @minor: minor to allocate on * * This allocates a new DRM file context. It is not linked into any context and * can be used by the caller freely. Note that the context keeps a pointer to * @minor, so it must be freed before @minor is. * * RETURNS: * Pointer to newly allocated context, ERR_PTR on failure. */ struct drm_file *drm_file_alloc(struct drm_minor *minor) { static atomic64_t ident = ATOMIC64_INIT(0); struct drm_device *dev = minor->dev; struct drm_file *file; int ret; file = kzalloc(sizeof(*file), GFP_KERNEL); if (!file) return ERR_PTR(-ENOMEM); /* Get a unique identifier for fdinfo: */ file->client_id = atomic64_inc_return(&ident); rcu_assign_pointer(file->pid, get_pid(task_tgid(current))); file->minor = minor; /* for compatibility root is always authenticated */ file->authenticated = capable(CAP_SYS_ADMIN); INIT_LIST_HEAD(&file->lhead); INIT_LIST_HEAD(&file->fbs); mutex_init(&file->fbs_lock); INIT_LIST_HEAD(&file->blobs); INIT_LIST_HEAD(&file->pending_event_list); INIT_LIST_HEAD(&file->event_list); init_waitqueue_head(&file->event_wait); file->event_space = 4096; /* set aside 4k for event buffer */ spin_lock_init(&file->master_lookup_lock); mutex_init(&file->event_read_lock); mutex_init(&file->client_name_lock); if (drm_core_check_feature(dev, DRIVER_GEM)) drm_gem_open(dev, file); if (drm_core_check_feature(dev, DRIVER_SYNCOBJ)) drm_syncobj_open(file); drm_prime_init_file_private(&file->prime); if (!drm_core_check_feature(dev, DRIVER_COMPUTE_ACCEL)) drm_debugfs_clients_add(file); if (dev->driver->open) { ret = dev->driver->open(dev, file); if (ret < 0) goto out_prime_destroy; } return file; out_prime_destroy: drm_prime_destroy_file_private(&file->prime); if (drm_core_check_feature(dev, DRIVER_SYNCOBJ)) drm_syncobj_release(file); if (drm_core_check_feature(dev, DRIVER_GEM)) drm_gem_release(dev, file); if (!drm_core_check_feature(dev, DRIVER_COMPUTE_ACCEL)) drm_debugfs_clients_remove(file); put_pid(rcu_access_pointer(file->pid)); kfree(file); return ERR_PTR(ret); } static void drm_events_release(struct drm_file *file_priv) { struct drm_device *dev = file_priv->minor->dev; struct drm_pending_event *e, *et; unsigned long flags; spin_lock_irqsave(&dev->event_lock, flags); /* Unlink pending events */ list_for_each_entry_safe(e, et, &file_priv->pending_event_list, pending_link) { list_del(&e->pending_link); e->file_priv = NULL; } /* Remove unconsumed events */ list_for_each_entry_safe(e, et, &file_priv->event_list, link) { list_del(&e->link); kfree(e); } spin_unlock_irqrestore(&dev->event_lock, flags); } /** * drm_file_free - free file context * @file: context to free, or NULL * * This destroys and deallocates a DRM file context previously allocated via * drm_file_alloc(). The caller must make sure to unlink it from any contexts * before calling this. * * If NULL is passed, this is a no-op. */ void drm_file_free(struct drm_file *file) { struct drm_device *dev; if (!file) return; dev = file->minor->dev; drm_dbg_core(dev, "comm=\"%s\", pid=%d, dev=0x%lx, open_count=%d\n", current->comm, task_pid_nr(current), (long)old_encode_dev(file->minor->kdev->devt), atomic_read(&dev->open_count)); if (!drm_core_check_feature(dev, DRIVER_COMPUTE_ACCEL)) drm_debugfs_clients_remove(file); drm_events_release(file); if (drm_core_check_feature(dev, DRIVER_MODESET)) { drm_fb_release(file); drm_property_destroy_user_blobs(dev, file); } if (drm_core_check_feature(dev, DRIVER_SYNCOBJ)) drm_syncobj_release(file); if (drm_core_check_feature(dev, DRIVER_GEM)) drm_gem_release(dev, file); if (drm_is_primary_client(file)) drm_master_release(file); if (dev->driver->postclose) dev->driver->postclose(dev, file); drm_prime_destroy_file_private(&file->prime); WARN_ON(!list_empty(&file->event_list)); put_pid(rcu_access_pointer(file->pid)); mutex_destroy(&file->client_name_lock); kfree(file->client_name); kfree(file); } static void drm_close_helper(struct file *filp) { struct drm_file *file_priv = filp->private_data; struct drm_device *dev = file_priv->minor->dev; mutex_lock(&dev->filelist_mutex); list_del(&file_priv->lhead); mutex_unlock(&dev->filelist_mutex); drm_file_free(file_priv); } /* * Check whether DRI will run on this CPU. * * \return non-zero if the DRI will run on this CPU, or zero otherwise. */ static int drm_cpu_valid(void) { #if defined(__sparc__) && !defined(__sparc_v9__) return 0; /* No cmpxchg before v9 sparc. */ #endif return 1; } /* * Called whenever a process opens a drm node * * \param filp file pointer. * \param minor acquired minor-object. * \return zero on success or a negative number on failure. * * Creates and initializes a drm_file structure for the file private data in \p * filp and add it into the double linked list in \p dev. */ int drm_open_helper(struct file *filp, struct drm_minor *minor) { struct drm_device *dev = minor->dev; struct drm_file *priv; int ret; if (filp->f_flags & O_EXCL) return -EBUSY; /* No exclusive opens */ if (!drm_cpu_valid()) return -EINVAL; if (dev->switch_power_state != DRM_SWITCH_POWER_ON && dev->switch_power_state != DRM_SWITCH_POWER_DYNAMIC_OFF) return -EINVAL; if (WARN_ON_ONCE(!(filp->f_op->fop_flags & FOP_UNSIGNED_OFFSET))) return -EINVAL; drm_dbg_core(dev, "comm=\"%s\", pid=%d, minor=%d\n", current->comm, task_pid_nr(current), minor->index); priv = drm_file_alloc(minor); if (IS_ERR(priv)) return PTR_ERR(priv); if (drm_is_primary_client(priv)) { ret = drm_master_open(priv); if (ret) { drm_file_free(priv); return ret; } } filp->private_data = priv; priv->filp = filp; mutex_lock(&dev->filelist_mutex); list_add(&priv->lhead, &dev->filelist); mutex_unlock(&dev->filelist_mutex); return 0; } /** * drm_open - open method for DRM file * @inode: device inode * @filp: file pointer. * * This function must be used by drivers as their &file_operations.open method. * It looks up the correct DRM device and instantiates all the per-file * resources for it. It also calls the &drm_driver.open driver callback. * * RETURNS: * 0 on success or negative errno value on failure. */ int drm_open(struct inode *inode, struct file *filp) { struct drm_device *dev; struct drm_minor *minor; int retcode; minor = drm_minor_acquire(&drm_minors_xa, iminor(inode)); if (IS_ERR(minor)) return PTR_ERR(minor); dev = minor->dev; if (drm_dev_needs_global_mutex(dev)) mutex_lock(&drm_global_mutex); atomic_fetch_inc(&dev->open_count); /* share address_space across all char-devs of a single device */ filp->f_mapping = dev->anon_inode->i_mapping; retcode = drm_open_helper(filp, minor); if (retcode) goto err_undo; if (drm_dev_needs_global_mutex(dev)) mutex_unlock(&drm_global_mutex); return 0; err_undo: atomic_dec(&dev->open_count); if (drm_dev_needs_global_mutex(dev)) mutex_unlock(&drm_global_mutex); drm_minor_release(minor); return retcode; } EXPORT_SYMBOL(drm_open); static void drm_lastclose(struct drm_device *dev) { drm_client_dev_restore(dev, false); if (dev_is_pci(dev->dev)) vga_switcheroo_process_delayed_switch(); } /** * drm_release - release method for DRM file * @inode: device inode * @filp: file pointer. * * This function must be used by drivers as their &file_operations.release * method. It frees any resources associated with the open file. If this * is the last open file for the DRM device, it also restores the active * in-kernel DRM client. * * RETURNS: * Always succeeds and returns 0. */ int drm_release(struct inode *inode, struct file *filp) { struct drm_file *file_priv = filp->private_data; struct drm_minor *minor = file_priv->minor; struct drm_device *dev = minor->dev; if (drm_dev_needs_global_mutex(dev)) mutex_lock(&drm_global_mutex); drm_dbg_core(dev, "open_count = %d\n", atomic_read(&dev->open_count)); drm_close_helper(filp); if (atomic_dec_and_test(&dev->open_count)) drm_lastclose(dev); if (drm_dev_needs_global_mutex(dev)) mutex_unlock(&drm_global_mutex); drm_minor_release(minor); return 0; } EXPORT_SYMBOL(drm_release); void drm_file_update_pid(struct drm_file *filp) { struct drm_device *dev; struct pid *pid, *old; /* * Master nodes need to keep the original ownership in order for * drm_master_check_perm to keep working correctly. (See comment in * drm_auth.c.) */ if (filp->was_master) return; pid = task_tgid(current); /* * Quick unlocked check since the model is a single handover followed by * exclusive repeated use. */ if (pid == rcu_access_pointer(filp->pid)) return; dev = filp->minor->dev; mutex_lock(&dev->filelist_mutex); get_pid(pid); old = rcu_replace_pointer(filp->pid, pid, 1); mutex_unlock(&dev->filelist_mutex); synchronize_rcu(); put_pid(old); } /** * drm_release_noglobal - release method for DRM file * @inode: device inode * @filp: file pointer. * * This function may be used by drivers as their &file_operations.release * method. It frees any resources associated with the open file prior to taking * the drm_global_mutex. If this is the last open file for the DRM device, it * then restores the active in-kernel DRM client. * * RETURNS: * Always succeeds and returns 0. */ int drm_release_noglobal(struct inode *inode, struct file *filp) { struct drm_file *file_priv = filp->private_data; struct drm_minor *minor = file_priv->minor; struct drm_device *dev = minor->dev; drm_close_helper(filp); if (atomic_dec_and_mutex_lock(&dev->open_count, &drm_global_mutex)) { drm_lastclose(dev); mutex_unlock(&drm_global_mutex); } drm_minor_release(minor); return 0; } EXPORT_SYMBOL(drm_release_noglobal); /** * drm_read - read method for DRM file * @filp: file pointer * @buffer: userspace destination pointer for the read * @count: count in bytes to read * @offset: offset to read * * This function must be used by drivers as their &file_operations.read * method if they use DRM events for asynchronous signalling to userspace. * Since events are used by the KMS API for vblank and page flip completion this * means all modern display drivers must use it. * * @offset is ignored, DRM events are read like a pipe. Polling support is * provided by drm_poll(). * * This function will only ever read a full event. Therefore userspace must * supply a big enough buffer to fit any event to ensure forward progress. Since * the maximum event space is currently 4K it's recommended to just use that for * safety. * * RETURNS: * Number of bytes read (always aligned to full events, and can be 0) or a * negative error code on failure. */ ssize_t drm_read(struct file *filp, char __user *buffer, size_t count, loff_t *offset) { struct drm_file *file_priv = filp->private_data; struct drm_device *dev = file_priv->minor->dev; ssize_t ret; ret = mutex_lock_interruptible(&file_priv->event_read_lock); if (ret) return ret; for (;;) { struct drm_pending_event *e = NULL; spin_lock_irq(&dev->event_lock); if (!list_empty(&file_priv->event_list)) { e = list_first_entry(&file_priv->event_list, struct drm_pending_event, link); file_priv->event_space += e->event->length; list_del(&e->link); } spin_unlock_irq(&dev->event_lock); if (e == NULL) { if (ret) break; if (filp->f_flags & O_NONBLOCK) { ret = -EAGAIN; break; } mutex_unlock(&file_priv->event_read_lock); ret = wait_event_interruptible(file_priv->event_wait, !list_empty(&file_priv->event_list)); if (ret >= 0) ret = mutex_lock_interruptible(&file_priv->event_read_lock); if (ret) return ret; } else { unsigned length = e->event->length; if (length > count - ret) { put_back_event: spin_lock_irq(&dev->event_lock); file_priv->event_space -= length; list_add(&e->link, &file_priv->event_list); spin_unlock_irq(&dev->event_lock); wake_up_interruptible_poll(&file_priv->event_wait, EPOLLIN | EPOLLRDNORM); break; } if (copy_to_user(buffer + ret, e->event, length)) { if (ret == 0) ret = -EFAULT; goto put_back_event; } ret += length; kfree(e); } } mutex_unlock(&file_priv->event_read_lock); return ret; } EXPORT_SYMBOL(drm_read); /** * drm_poll - poll method for DRM file * @filp: file pointer * @wait: poll waiter table * * This function must be used by drivers as their &file_operations.read method * if they use DRM events for asynchronous signalling to userspace. Since * events are used by the KMS API for vblank and page flip completion this means * all modern display drivers must use it. * * See also drm_read(). * * RETURNS: * Mask of POLL flags indicating the current status of the file. */ __poll_t drm_poll(struct file *filp, struct poll_table_struct *wait) { struct drm_file *file_priv = filp->private_data; __poll_t mask = 0; poll_wait(filp, &file_priv->event_wait, wait); if (!list_empty(&file_priv->event_list)) mask |= EPOLLIN | EPOLLRDNORM; return mask; } EXPORT_SYMBOL(drm_poll); /** * drm_event_reserve_init_locked - init a DRM event and reserve space for it * @dev: DRM device * @file_priv: DRM file private data * @p: tracking structure for the pending event * @e: actual event data to deliver to userspace * * This function prepares the passed in event for eventual delivery. If the event * doesn't get delivered (because the IOCTL fails later on, before queuing up * anything) then the even must be cancelled and freed using * drm_event_cancel_free(). Successfully initialized events should be sent out * using drm_send_event() or drm_send_event_locked() to signal completion of the * asynchronous event to userspace. * * If callers embedded @p into a larger structure it must be allocated with * kmalloc and @p must be the first member element. * * This is the locked version of drm_event_reserve_init() for callers which * already hold &drm_device.event_lock. * * RETURNS: * 0 on success or a negative error code on failure. */ int drm_event_reserve_init_locked(struct drm_device *dev, struct drm_file *file_priv, struct drm_pending_event *p, struct drm_event *e) { if (file_priv->event_space < e->length) return -ENOMEM; file_priv->event_space -= e->length; p->event = e; list_add(&p->pending_link, &file_priv->pending_event_list); p->file_priv = file_priv; return 0; } EXPORT_SYMBOL(drm_event_reserve_init_locked); /** * drm_event_reserve_init - init a DRM event and reserve space for it * @dev: DRM device * @file_priv: DRM file private data * @p: tracking structure for the pending event * @e: actual event data to deliver to userspace * * This function prepares the passed in event for eventual delivery. If the event * doesn't get delivered (because the IOCTL fails later on, before queuing up * anything) then the even must be cancelled and freed using * drm_event_cancel_free(). Successfully initialized events should be sent out * using drm_send_event() or drm_send_event_locked() to signal completion of the * asynchronous event to userspace. * * If callers embedded @p into a larger structure it must be allocated with * kmalloc and @p must be the first member element. * * Callers which already hold &drm_device.event_lock should use * drm_event_reserve_init_locked() instead. * * RETURNS: * 0 on success or a negative error code on failure. */ int drm_event_reserve_init(struct drm_device *dev, struct drm_file *file_priv, struct drm_pending_event *p, struct drm_event *e) { unsigned long flags; int ret; spin_lock_irqsave(&dev->event_lock, flags); ret = drm_event_reserve_init_locked(dev, file_priv, p, e); spin_unlock_irqrestore(&dev->event_lock, flags); return ret; } EXPORT_SYMBOL(drm_event_reserve_init); /** * drm_event_cancel_free - free a DRM event and release its space * @dev: DRM device * @p: tracking structure for the pending event * * This function frees the event @p initialized with drm_event_reserve_init() * and releases any allocated space. It is used to cancel an event when the * nonblocking operation could not be submitted and needed to be aborted. */ void drm_event_cancel_free(struct drm_device *dev, struct drm_pending_event *p) { unsigned long flags; spin_lock_irqsave(&dev->event_lock, flags); if (p->file_priv) { p->file_priv->event_space += p->event->length; list_del(&p->pending_link); } spin_unlock_irqrestore(&dev->event_lock, flags); if (p->fence) dma_fence_put(p->fence); kfree(p); } EXPORT_SYMBOL(drm_event_cancel_free); static void drm_send_event_helper(struct drm_device *dev, struct drm_pending_event *e, ktime_t timestamp) { assert_spin_locked(&dev->event_lock); if (e->completion) { complete_all(e->completion); e->completion_release(e->completion); e->completion = NULL; } if (e->fence) { if (timestamp) dma_fence_signal_timestamp(e->fence, timestamp); else dma_fence_signal(e->fence); dma_fence_put(e->fence); } if (!e->file_priv) { kfree(e); return; } list_del(&e->pending_link); list_add_tail(&e->link, &e->file_priv->event_list); wake_up_interruptible_poll(&e->file_priv->event_wait, EPOLLIN | EPOLLRDNORM); } /** * drm_send_event_timestamp_locked - send DRM event to file descriptor * @dev: DRM device * @e: DRM event to deliver * @timestamp: timestamp to set for the fence event in kernel's CLOCK_MONOTONIC * time domain * * This function sends the event @e, initialized with drm_event_reserve_init(), * to its associated userspace DRM file. Callers must already hold * &drm_device.event_lock. * * Note that the core will take care of unlinking and disarming events when the * corresponding DRM file is closed. Drivers need not worry about whether the * DRM file for this event still exists and can call this function upon * completion of the asynchronous work unconditionally. */ void drm_send_event_timestamp_locked(struct drm_device *dev, struct drm_pending_event *e, ktime_t timestamp) { drm_send_event_helper(dev, e, timestamp); } EXPORT_SYMBOL(drm_send_event_timestamp_locked); /** * drm_send_event_locked - send DRM event to file descriptor * @dev: DRM device * @e: DRM event to deliver * * This function sends the event @e, initialized with drm_event_reserve_init(), * to its associated userspace DRM file. Callers must already hold * &drm_device.event_lock, see drm_send_event() for the unlocked version. * * Note that the core will take care of unlinking and disarming events when the * corresponding DRM file is closed. Drivers need not worry about whether the * DRM file for this event still exists and can call this function upon * completion of the asynchronous work unconditionally. */ void drm_send_event_locked(struct drm_device *dev, struct drm_pending_event *e) { drm_send_event_helper(dev, e, 0); } EXPORT_SYMBOL(drm_send_event_locked); /** * drm_send_event - send DRM event to file descriptor * @dev: DRM device * @e: DRM event to deliver * * This function sends the event @e, initialized with drm_event_reserve_init(), * to its associated userspace DRM file. This function acquires * &drm_device.event_lock, see drm_send_event_locked() for callers which already * hold this lock. * * Note that the core will take care of unlinking and disarming events when the * corresponding DRM file is closed. Drivers need not worry about whether the * DRM file for this event still exists and can call this function upon * completion of the asynchronous work unconditionally. */ void drm_send_event(struct drm_device *dev, struct drm_pending_event *e) { unsigned long irqflags; spin_lock_irqsave(&dev->event_lock, irqflags); drm_send_event_helper(dev, e, 0); spin_unlock_irqrestore(&dev->event_lock, irqflags); } EXPORT_SYMBOL(drm_send_event); void drm_fdinfo_print_size(struct drm_printer *p, const char *prefix, const char *stat, const char *region, u64 sz) { const char *units[] = {"", " KiB", " MiB"}; unsigned u; for (u = 0; u < ARRAY_SIZE(units) - 1; u++) { if (sz == 0 || !IS_ALIGNED(sz, SZ_1K)) break; sz = div_u64(sz, SZ_1K); } drm_printf(p, "%s-%s-%s:\t%llu%s\n", prefix, stat, region, sz, units[u]); } EXPORT_SYMBOL(drm_fdinfo_print_size); int drm_memory_stats_is_zero(const struct drm_memory_stats *stats) { return (stats->shared == 0 && stats->private == 0 && stats->resident == 0 && stats->purgeable == 0 && stats->active == 0); } EXPORT_SYMBOL(drm_memory_stats_is_zero); /** * drm_print_memory_stats - A helper to print memory stats * @p: The printer to print output to * @stats: The collected memory stats * @supported_status: Bitmask of optional stats which are available * @region: The memory region * */ void drm_print_memory_stats(struct drm_printer *p, const struct drm_memory_stats *stats, enum drm_gem_object_status supported_status, const char *region) { const char *prefix = "drm"; drm_fdinfo_print_size(p, prefix, "total", region, stats->private + stats->shared); drm_fdinfo_print_size(p, prefix, "shared", region, stats->shared); if (supported_status & DRM_GEM_OBJECT_ACTIVE) drm_fdinfo_print_size(p, prefix, "active", region, stats->active); if (supported_status & DRM_GEM_OBJECT_RESIDENT) drm_fdinfo_print_size(p, prefix, "resident", region, stats->resident); if (supported_status & DRM_GEM_OBJECT_PURGEABLE) drm_fdinfo_print_size(p, prefix, "purgeable", region, stats->purgeable); } EXPORT_SYMBOL(drm_print_memory_stats); /** * drm_show_memory_stats - Helper to collect and show standard fdinfo memory stats * @p: the printer to print output to * @file: the DRM file * * Helper to iterate over GEM objects with a handle allocated in the specified * file. */ void drm_show_memory_stats(struct drm_printer *p, struct drm_file *file) { struct drm_gem_object *obj; struct drm_memory_stats status = {}; enum drm_gem_object_status supported_status = 0; int id; spin_lock(&file->table_lock); idr_for_each_entry (&file->object_idr, obj, id) { enum drm_gem_object_status s = 0; size_t add_size = (obj->funcs && obj->funcs->rss) ? obj->funcs->rss(obj) : obj->size; if (obj->funcs && obj->funcs->status) { s = obj->funcs->status(obj); supported_status |= s; } if (drm_gem_object_is_shared_for_memory_stats(obj)) status.shared += obj->size; else status.private += obj->size; if (s & DRM_GEM_OBJECT_RESIDENT) { status.resident += add_size; } else { /* If already purged or not yet backed by pages, don't * count it as purgeable: */ s &= ~DRM_GEM_OBJECT_PURGEABLE; } if (!dma_resv_test_signaled(obj->resv, dma_resv_usage_rw(true))) { status.active += add_size; supported_status |= DRM_GEM_OBJECT_ACTIVE; /* If still active, don't count as purgeable: */ s &= ~DRM_GEM_OBJECT_PURGEABLE; } if (s & DRM_GEM_OBJECT_PURGEABLE) status.purgeable += add_size; } spin_unlock(&file->table_lock); drm_print_memory_stats(p, &status, supported_status, "memory"); } EXPORT_SYMBOL(drm_show_memory_stats); /** * drm_show_fdinfo - helper for drm file fops * @m: output stream * @f: the device file instance * * Helper to implement fdinfo, for userspace to query usage stats, etc, of a * process using the GPU. See also &drm_driver.show_fdinfo. * * For text output format description please see Documentation/gpu/drm-usage-stats.rst */ void drm_show_fdinfo(struct seq_file *m, struct file *f) { struct drm_file *file = f->private_data; struct drm_device *dev = file->minor->dev; struct drm_printer p = drm_seq_file_printer(m); int idx; if (!drm_dev_enter(dev, &idx)) return; drm_printf(&p, "drm-driver:\t%s\n", dev->driver->name); drm_printf(&p, "drm-client-id:\t%llu\n", file->client_id); if (dev_is_pci(dev->dev)) { struct pci_dev *pdev = to_pci_dev(dev->dev); drm_printf(&p, "drm-pdev:\t%04x:%02x:%02x.%d\n", pci_domain_nr(pdev->bus), pdev->bus->number, PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn)); } mutex_lock(&file->client_name_lock); if (file->client_name) drm_printf(&p, "drm-client-name:\t%s\n", file->client_name); mutex_unlock(&file->client_name_lock); if (dev->driver->show_fdinfo) dev->driver->show_fdinfo(&p, file); drm_dev_exit(idx); } EXPORT_SYMBOL(drm_show_fdinfo); /** * drm_file_err - log process name, pid and client_name associated with a drm_file * @file_priv: context of interest for process name and pid * @fmt: printf() like format string * * Helper function for clients which needs to log process details such * as name and pid etc along with user logs. */ void drm_file_err(struct drm_file *file_priv, const char *fmt, ...) { va_list args; struct va_format vaf; struct pid *pid; struct task_struct *task; struct drm_device *dev = file_priv->minor->dev; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; mutex_lock(&file_priv->client_name_lock); rcu_read_lock(); pid = rcu_dereference(file_priv->pid); task = pid_task(pid, PIDTYPE_TGID); drm_err(dev, "comm: %s pid: %d client-id:%llu client: %s ... %pV", task ? task->comm : "Unset", task ? task->pid : 0, file_priv->client_id, file_priv->client_name ?: "Unset", &vaf); va_end(args); rcu_read_unlock(); mutex_unlock(&file_priv->client_name_lock); } EXPORT_SYMBOL(drm_file_err); /** * mock_drm_getfile - Create a new struct file for the drm device * @minor: drm minor to wrap (e.g. #drm_device.primary) * @flags: file creation mode (O_RDWR etc) * * This create a new struct file that wraps a DRM file context around a * DRM minor. This mimicks userspace opening e.g. /dev/dri/card0, but without * invoking userspace. The struct file may be operated on using its f_op * (the drm_device.driver.fops) to mimick userspace operations, or be supplied * to userspace facing functions as an internal/anonymous client. * * RETURNS: * Pointer to newly created struct file, ERR_PTR on failure. */ struct file *mock_drm_getfile(struct drm_minor *minor, unsigned int flags) { struct drm_device *dev = minor->dev; struct drm_file *priv; struct file *file; priv = drm_file_alloc(minor); if (IS_ERR(priv)) return ERR_CAST(priv); file = anon_inode_getfile("drm", dev->driver->fops, priv, flags); if (IS_ERR(file)) { drm_file_free(priv); return file; } /* Everyone shares a single global address space */ file->f_mapping = dev->anon_inode->i_mapping; drm_dev_get(dev); priv->filp = file; return file; } EXPORT_SYMBOL_FOR_TESTS_ONLY(mock_drm_getfile);
2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_IOBITMAP_H #define _ASM_X86_IOBITMAP_H #include <linux/refcount.h> #include <asm/processor.h> struct io_bitmap { u64 sequence; refcount_t refcnt; /* The maximum number of bytes to copy so all zero bits are covered */ unsigned int max; unsigned long bitmap[IO_BITMAP_LONGS]; }; struct task_struct; #ifdef CONFIG_X86_IOPL_IOPERM void io_bitmap_share(struct task_struct *tsk); void io_bitmap_exit(struct task_struct *tsk); static inline void native_tss_invalidate_io_bitmap(void) { /* * Invalidate the I/O bitmap by moving io_bitmap_base outside the * TSS limit so any subsequent I/O access from user space will * trigger a #GP. * * This is correct even when VMEXIT rewrites the TSS limit * to 0x67 as the only requirement is that the base points * outside the limit. */ this_cpu_write(cpu_tss_rw.x86_tss.io_bitmap_base, IO_BITMAP_OFFSET_INVALID); } void native_tss_update_io_bitmap(void); #ifdef CONFIG_PARAVIRT_XXL #include <asm/paravirt.h> #else #define tss_update_io_bitmap native_tss_update_io_bitmap #define tss_invalidate_io_bitmap native_tss_invalidate_io_bitmap #endif #else static inline void io_bitmap_share(struct task_struct *tsk) { } static inline void io_bitmap_exit(struct task_struct *tsk) { } static inline void tss_update_io_bitmap(void) { } #endif #endif
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHE_H #define _BCACHE_H /* * SOME HIGH LEVEL CODE DOCUMENTATION: * * Bcache mostly works with cache sets, cache devices, and backing devices. * * Support for multiple cache devices hasn't quite been finished off yet, but * it's about 95% plumbed through. A cache set and its cache devices is sort of * like a md raid array and its component devices. Most of the code doesn't care * about individual cache devices, the main abstraction is the cache set. * * Multiple cache devices is intended to give us the ability to mirror dirty * cached data and metadata, without mirroring clean cached data. * * Backing devices are different, in that they have a lifetime independent of a * cache set. When you register a newly formatted backing device it'll come up * in passthrough mode, and then you can attach and detach a backing device from * a cache set at runtime - while it's mounted and in use. Detaching implicitly * invalidates any cached data for that backing device. * * A cache set can have multiple (many) backing devices attached to it. * * There's also flash only volumes - this is the reason for the distinction * between struct cached_dev and struct bcache_device. A flash only volume * works much like a bcache device that has a backing device, except the * "cached" data is always dirty. The end result is that we get thin * provisioning with very little additional code. * * Flash only volumes work but they're not production ready because the moving * garbage collector needs more work. More on that later. * * BUCKETS/ALLOCATION: * * Bcache is primarily designed for caching, which means that in normal * operation all of our available space will be allocated. Thus, we need an * efficient way of deleting things from the cache so we can write new things to * it. * * To do this, we first divide the cache device up into buckets. A bucket is the * unit of allocation; they're typically around 1 mb - anywhere from 128k to 2M+ * works efficiently. * * Each bucket has a 16 bit priority, and an 8 bit generation associated with * it. The gens and priorities for all the buckets are stored contiguously and * packed on disk (in a linked list of buckets - aside from the superblock, all * of bcache's metadata is stored in buckets). * * The priority is used to implement an LRU. We reset a bucket's priority when * we allocate it or on cache it, and every so often we decrement the priority * of each bucket. It could be used to implement something more sophisticated, * if anyone ever gets around to it. * * The generation is used for invalidating buckets. Each pointer also has an 8 * bit generation embedded in it; for a pointer to be considered valid, its gen * must match the gen of the bucket it points into. Thus, to reuse a bucket all * we have to do is increment its gen (and write its new gen to disk; we batch * this up). * * Bcache is entirely COW - we never write twice to a bucket, even buckets that * contain metadata (including btree nodes). * * THE BTREE: * * Bcache is in large part design around the btree. * * At a high level, the btree is just an index of key -> ptr tuples. * * Keys represent extents, and thus have a size field. Keys also have a variable * number of pointers attached to them (potentially zero, which is handy for * invalidating the cache). * * The key itself is an inode:offset pair. The inode number corresponds to a * backing device or a flash only volume. The offset is the ending offset of the * extent within the inode - not the starting offset; this makes lookups * slightly more convenient. * * Pointers contain the cache device id, the offset on that device, and an 8 bit * generation number. More on the gen later. * * Index lookups are not fully abstracted - cache lookups in particular are * still somewhat mixed in with the btree code, but things are headed in that * direction. * * Updates are fairly well abstracted, though. There are two different ways of * updating the btree; insert and replace. * * BTREE_INSERT will just take a list of keys and insert them into the btree - * overwriting (possibly only partially) any extents they overlap with. This is * used to update the index after a write. * * BTREE_REPLACE is really cmpxchg(); it inserts a key into the btree iff it is * overwriting a key that matches another given key. This is used for inserting * data into the cache after a cache miss, and for background writeback, and for * the moving garbage collector. * * There is no "delete" operation; deleting things from the index is * accomplished by either by invalidating pointers (by incrementing a bucket's * gen) or by inserting a key with 0 pointers - which will overwrite anything * previously present at that location in the index. * * This means that there are always stale/invalid keys in the btree. They're * filtered out by the code that iterates through a btree node, and removed when * a btree node is rewritten. * * BTREE NODES: * * Our unit of allocation is a bucket, and we can't arbitrarily allocate and * free smaller than a bucket - so, that's how big our btree nodes are. * * (If buckets are really big we'll only use part of the bucket for a btree node * - no less than 1/4th - but a bucket still contains no more than a single * btree node. I'd actually like to change this, but for now we rely on the * bucket's gen for deleting btree nodes when we rewrite/split a node.) * * Anyways, btree nodes are big - big enough to be inefficient with a textbook * btree implementation. * * The way this is solved is that btree nodes are internally log structured; we * can append new keys to an existing btree node without rewriting it. This * means each set of keys we write is sorted, but the node is not. * * We maintain this log structure in memory - keeping 1Mb of keys sorted would * be expensive, and we have to distinguish between the keys we have written and * the keys we haven't. So to do a lookup in a btree node, we have to search * each sorted set. But we do merge written sets together lazily, so the cost of * these extra searches is quite low (normally most of the keys in a btree node * will be in one big set, and then there'll be one or two sets that are much * smaller). * * This log structure makes bcache's btree more of a hybrid between a * conventional btree and a compacting data structure, with some of the * advantages of both. * * GARBAGE COLLECTION: * * We can't just invalidate any bucket - it might contain dirty data or * metadata. If it once contained dirty data, other writes might overwrite it * later, leaving no valid pointers into that bucket in the index. * * Thus, the primary purpose of garbage collection is to find buckets to reuse. * It also counts how much valid data it each bucket currently contains, so that * allocation can reuse buckets sooner when they've been mostly overwritten. * * It also does some things that are really internal to the btree * implementation. If a btree node contains pointers that are stale by more than * some threshold, it rewrites the btree node to avoid the bucket's generation * wrapping around. It also merges adjacent btree nodes if they're empty enough. * * THE JOURNAL: * * Bcache's journal is not necessary for consistency; we always strictly * order metadata writes so that the btree and everything else is consistent on * disk in the event of an unclean shutdown, and in fact bcache had writeback * caching (with recovery from unclean shutdown) before journalling was * implemented. * * Rather, the journal is purely a performance optimization; we can't complete a * write until we've updated the index on disk, otherwise the cache would be * inconsistent in the event of an unclean shutdown. This means that without the * journal, on random write workloads we constantly have to update all the leaf * nodes in the btree, and those writes will be mostly empty (appending at most * a few keys each) - highly inefficient in terms of amount of metadata writes, * and it puts more strain on the various btree resorting/compacting code. * * The journal is just a log of keys we've inserted; on startup we just reinsert * all the keys in the open journal entries. That means that when we're updating * a node in the btree, we can wait until a 4k block of keys fills up before * writing them out. * * For simplicity, we only journal updates to leaf nodes; updates to parent * nodes are rare enough (since our leaf nodes are huge) that it wasn't worth * the complexity to deal with journalling them (in particular, journal replay) * - updates to non leaf nodes just happen synchronously (see btree_split()). */ #define pr_fmt(fmt) "bcache: %s() " fmt, __func__ #include <linux/bio.h> #include <linux/closure.h> #include <linux/kobject.h> #include <linux/list.h> #include <linux/mutex.h> #include <linux/rbtree.h> #include <linux/rwsem.h> #include <linux/refcount.h> #include <linux/types.h> #include <linux/workqueue.h> #include <linux/kthread.h> #include "bcache_ondisk.h" #include "bset.h" #include "util.h" struct bucket { atomic_t pin; uint16_t prio; uint8_t gen; uint8_t last_gc; /* Most out of date gen in the btree */ uint16_t gc_mark; /* Bitfield used by GC. See below for field */ uint16_t reclaimable_in_gc:1; }; /* * I'd use bitfields for these, but I don't trust the compiler not to screw me * as multiple threads touch struct bucket without locking */ BITMASK(GC_MARK, struct bucket, gc_mark, 0, 2); #define GC_MARK_RECLAIMABLE 1 #define GC_MARK_DIRTY 2 #define GC_MARK_METADATA 3 #define GC_SECTORS_USED_SIZE 13 #define MAX_GC_SECTORS_USED (~(~0ULL << GC_SECTORS_USED_SIZE)) BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, GC_SECTORS_USED_SIZE); BITMASK(GC_MOVE, struct bucket, gc_mark, 15, 1); #include "journal.h" #include "stats.h" struct search; struct btree; struct keybuf; struct keybuf_key { struct rb_node node; BKEY_PADDED(key); void *private; }; struct keybuf { struct bkey last_scanned; spinlock_t lock; /* * Beginning and end of range in rb tree - so that we can skip taking * lock and checking the rb tree when we need to check for overlapping * keys. */ struct bkey start; struct bkey end; struct rb_root keys; #define KEYBUF_NR 500 DECLARE_ARRAY_ALLOCATOR(struct keybuf_key, freelist, KEYBUF_NR); }; struct bcache_device { struct closure cl; struct kobject kobj; struct cache_set *c; unsigned int id; #define BCACHEDEVNAME_SIZE 12 char name[BCACHEDEVNAME_SIZE]; struct gendisk *disk; unsigned long flags; #define BCACHE_DEV_CLOSING 0 #define BCACHE_DEV_DETACHING 1 #define BCACHE_DEV_UNLINK_DONE 2 #define BCACHE_DEV_WB_RUNNING 3 #define BCACHE_DEV_RATE_DW_RUNNING 4 int nr_stripes; #define BCH_MIN_STRIPE_SZ ((4 << 20) >> SECTOR_SHIFT) unsigned int stripe_size; atomic_t *stripe_sectors_dirty; unsigned long *full_dirty_stripes; struct bio_set bio_split; unsigned int data_csum:1; int (*cache_miss)(struct btree *b, struct search *s, struct bio *bio, unsigned int sectors); int (*ioctl)(struct bcache_device *d, blk_mode_t mode, unsigned int cmd, unsigned long arg); }; struct io { /* Used to track sequential IO so it can be skipped */ struct hlist_node hash; struct list_head lru; unsigned long jiffies; unsigned int sequential; sector_t last; }; enum stop_on_failure { BCH_CACHED_DEV_STOP_AUTO = 0, BCH_CACHED_DEV_STOP_ALWAYS, BCH_CACHED_DEV_STOP_MODE_MAX, }; struct cached_dev { struct list_head list; struct bcache_device disk; struct block_device *bdev; struct file *bdev_file; struct cache_sb sb; struct cache_sb_disk *sb_disk; struct bio sb_bio; struct bio_vec sb_bv[1]; struct closure sb_write; struct semaphore sb_write_mutex; /* Refcount on the cache set. Always nonzero when we're caching. */ refcount_t count; struct work_struct detach; /* * Device might not be running if it's dirty and the cache set hasn't * showed up yet. */ atomic_t running; /* * Writes take a shared lock from start to finish; scanning for dirty * data to refill the rb tree requires an exclusive lock. */ struct rw_semaphore writeback_lock; /* * Nonzero, and writeback has a refcount (d->count), iff there is dirty * data in the cache. Protected by writeback_lock; must have an * shared lock to set and exclusive lock to clear. */ atomic_t has_dirty; #define BCH_CACHE_READA_ALL 0 #define BCH_CACHE_READA_META_ONLY 1 unsigned int cache_readahead_policy; struct bch_ratelimit writeback_rate; struct delayed_work writeback_rate_update; /* Limit number of writeback bios in flight */ struct semaphore in_flight; struct task_struct *writeback_thread; struct workqueue_struct *writeback_write_wq; struct keybuf writeback_keys; struct task_struct *status_update_thread; /* * Order the write-half of writeback operations strongly in dispatch * order. (Maintain LBA order; don't allow reads completing out of * order to re-order the writes...) */ struct closure_waitlist writeback_ordering_wait; atomic_t writeback_sequence_next; /* For tracking sequential IO */ #define RECENT_IO_BITS 7 #define RECENT_IO (1 << RECENT_IO_BITS) struct io io[RECENT_IO]; struct hlist_head io_hash[RECENT_IO + 1]; struct list_head io_lru; spinlock_t io_lock; struct cache_accounting accounting; /* The rest of this all shows up in sysfs */ unsigned int sequential_cutoff; unsigned int io_disable:1; unsigned int verify:1; unsigned int bypass_torture_test:1; unsigned int partial_stripes_expensive:1; unsigned int writeback_metadata:1; unsigned int writeback_running:1; unsigned int writeback_consider_fragment:1; unsigned char writeback_percent; unsigned int writeback_delay; uint64_t writeback_rate_target; int64_t writeback_rate_proportional; int64_t writeback_rate_integral; int64_t writeback_rate_integral_scaled; int32_t writeback_rate_change; unsigned int writeback_rate_update_seconds; unsigned int writeback_rate_i_term_inverse; unsigned int writeback_rate_p_term_inverse; unsigned int writeback_rate_fp_term_low; unsigned int writeback_rate_fp_term_mid; unsigned int writeback_rate_fp_term_high; unsigned int writeback_rate_minimum; enum stop_on_failure stop_when_cache_set_failed; #define DEFAULT_CACHED_DEV_ERROR_LIMIT 64 atomic_t io_errors; unsigned int error_limit; unsigned int offline_seconds; /* * Retry to update writeback_rate if contention happens for * down_read(dc->writeback_lock) in update_writeback_rate() */ #define BCH_WBRATE_UPDATE_MAX_SKIPS 15 unsigned int rate_update_retry; }; enum alloc_reserve { RESERVE_BTREE, RESERVE_PRIO, RESERVE_MOVINGGC, RESERVE_NONE, RESERVE_NR, }; struct cache { struct cache_set *set; struct cache_sb sb; struct cache_sb_disk *sb_disk; struct bio sb_bio; struct bio_vec sb_bv[1]; struct kobject kobj; struct block_device *bdev; struct file *bdev_file; struct task_struct *alloc_thread; struct closure prio; struct prio_set *disk_buckets; /* * When allocating new buckets, prio_write() gets first dibs - since we * may not be allocate at all without writing priorities and gens. * prio_last_buckets[] contains the last buckets we wrote priorities to * (so gc can mark them as metadata), prio_buckets[] contains the * buckets allocated for the next prio write. */ uint64_t *prio_buckets; uint64_t *prio_last_buckets; /* * free: Buckets that are ready to be used * * free_inc: Incoming buckets - these are buckets that currently have * cached data in them, and we can't reuse them until after we write * their new gen to disk. After prio_write() finishes writing the new * gens/prios, they'll be moved to the free list. */ DECLARE_FIFO(long, free)[RESERVE_NR]; DECLARE_FIFO(long, free_inc); size_t fifo_last_bucket; /* Allocation stuff: */ struct bucket *buckets; DECLARE_HEAP(struct bucket *, heap); /* * If nonzero, we know we aren't going to find any buckets to invalidate * until a gc finishes - otherwise we could pointlessly burn a ton of * cpu */ unsigned int invalidate_needs_gc; struct journal_device journal; /* The rest of this all shows up in sysfs */ #define IO_ERROR_SHIFT 20 atomic_t io_errors; atomic_t io_count; atomic_long_t meta_sectors_written; atomic_long_t btree_sectors_written; atomic_long_t sectors_written; }; struct gc_stat { size_t nodes; size_t nodes_pre; size_t key_bytes; size_t nkeys; uint64_t data; /* sectors */ unsigned int in_use; /* percent */ }; /* * Flag bits, for how the cache set is shutting down, and what phase it's at: * * CACHE_SET_UNREGISTERING means we're not just shutting down, we're detaching * all the backing devices first (their cached data gets invalidated, and they * won't automatically reattach). * * CACHE_SET_STOPPING always gets set first when we're closing down a cache set; * we'll continue to run normally for awhile with CACHE_SET_STOPPING set (i.e. * flushing dirty data). * * CACHE_SET_RUNNING means all cache devices have been registered and journal * replay is complete. * * CACHE_SET_IO_DISABLE is set when bcache is stopping the whold cache set, all * external and internal I/O should be denied when this flag is set. * */ #define CACHE_SET_UNREGISTERING 0 #define CACHE_SET_STOPPING 1 #define CACHE_SET_RUNNING 2 #define CACHE_SET_IO_DISABLE 3 struct cache_set { struct closure cl; struct list_head list; struct kobject kobj; struct kobject internal; struct dentry *debug; struct cache_accounting accounting; unsigned long flags; atomic_t idle_counter; atomic_t at_max_writeback_rate; struct cache *cache; struct bcache_device **devices; unsigned int devices_max_used; atomic_t attached_dev_nr; struct list_head cached_devs; uint64_t cached_dev_sectors; atomic_long_t flash_dev_dirty_sectors; struct closure caching; struct closure sb_write; struct semaphore sb_write_mutex; mempool_t search; mempool_t bio_meta; struct bio_set bio_split; /* For the btree cache */ struct shrinker *shrink; /* For the btree cache and anything allocation related */ struct mutex bucket_lock; /* log2(bucket_size), in sectors */ unsigned short bucket_bits; /* log2(block_size), in sectors */ unsigned short block_bits; /* * Default number of pages for a new btree node - may be less than a * full bucket */ unsigned int btree_pages; /* * Lists of struct btrees; lru is the list for structs that have memory * allocated for actual btree node, freed is for structs that do not. * * We never free a struct btree, except on shutdown - we just put it on * the btree_cache_freed list and reuse it later. This simplifies the * code, and it doesn't cost us much memory as the memory usage is * dominated by buffers that hold the actual btree node data and those * can be freed - and the number of struct btrees allocated is * effectively bounded. * * btree_cache_freeable effectively is a small cache - we use it because * high order page allocations can be rather expensive, and it's quite * common to delete and allocate btree nodes in quick succession. It * should never grow past ~2-3 nodes in practice. */ struct list_head btree_cache; struct list_head btree_cache_freeable; struct list_head btree_cache_freed; /* Number of elements in btree_cache + btree_cache_freeable lists */ unsigned int btree_cache_used; /* * If we need to allocate memory for a new btree node and that * allocation fails, we can cannibalize another node in the btree cache * to satisfy the allocation - lock to guarantee only one thread does * this at a time: */ wait_queue_head_t btree_cache_wait; struct task_struct *btree_cache_alloc_lock; spinlock_t btree_cannibalize_lock; /* * When we free a btree node, we increment the gen of the bucket the * node is in - but we can't rewrite the prios and gens until we * finished whatever it is we were doing, otherwise after a crash the * btree node would be freed but for say a split, we might not have the * pointers to the new nodes inserted into the btree yet. * * This is a refcount that blocks prio_write() until the new keys are * written. */ atomic_t prio_blocked; wait_queue_head_t bucket_wait; atomic_t bucket_wait_cnt; /* * For any bio we don't skip we subtract the number of sectors from * rescale; when it hits 0 we rescale all the bucket priorities. */ atomic_t rescale; /* * used for GC, identify if any front side I/Os is inflight */ atomic_t search_inflight; /* * When we invalidate buckets, we use both the priority and the amount * of good data to determine which buckets to reuse first - to weight * those together consistently we keep track of the smallest nonzero * priority of any bucket. */ uint16_t min_prio; /* * max(gen - last_gc) for all buckets. When it gets too big we have to * gc to keep gens from wrapping around. */ uint8_t need_gc; struct gc_stat gc_stats; size_t nbuckets; size_t avail_nbuckets; struct task_struct *gc_thread; /* Where in the btree gc currently is */ struct bkey gc_done; /* * For automatical garbage collection after writeback completed, this * varialbe is used as bit fields, * - 0000 0001b (BCH_ENABLE_AUTO_GC): enable gc after writeback * - 0000 0010b (BCH_DO_AUTO_GC): do gc after writeback * This is an optimization for following write request after writeback * finished, but read hit rate dropped due to clean data on cache is * discarded. Unless user explicitly sets it via sysfs, it won't be * enabled. */ #define BCH_ENABLE_AUTO_GC 1 #define BCH_DO_AUTO_GC 2 uint8_t gc_after_writeback; /* * The allocation code needs gc_mark in struct bucket to be correct, but * it's not while a gc is in progress. Protected by bucket_lock. */ int gc_mark_valid; /* Counts how many sectors bio_insert has added to the cache */ atomic_t sectors_to_gc; wait_queue_head_t gc_wait; struct keybuf moving_gc_keys; /* Number of moving GC bios in flight */ struct semaphore moving_in_flight; struct workqueue_struct *moving_gc_wq; struct btree *root; #ifdef CONFIG_BCACHE_DEBUG struct btree *verify_data; struct bset *verify_ondisk; struct mutex verify_lock; #endif uint8_t set_uuid[16]; unsigned int nr_uuids; struct uuid_entry *uuids; BKEY_PADDED(uuid_bucket); struct closure uuid_write; struct semaphore uuid_write_mutex; /* * A btree node on disk could have too many bsets for an iterator to fit * on the stack - have to dynamically allocate them. * bch_cache_set_alloc() will make sure the pool can allocate iterators * equipped with enough room that can host * (sb.bucket_size / sb.block_size) * btree_iter_sets, which is more than static MAX_BSETS. */ mempool_t fill_iter; struct bset_sort_state sort; /* List of buckets we're currently writing data to */ struct list_head data_buckets; spinlock_t data_bucket_lock; struct journal journal; #define CONGESTED_MAX 1024 unsigned int congested_last_us; atomic_t congested; /* The rest of this all shows up in sysfs */ unsigned int congested_read_threshold_us; unsigned int congested_write_threshold_us; struct time_stats btree_gc_time; struct time_stats btree_split_time; struct time_stats btree_read_time; atomic_long_t cache_read_races; atomic_long_t writeback_keys_done; atomic_long_t writeback_keys_failed; atomic_long_t reclaim; atomic_long_t reclaimed_journal_buckets; atomic_long_t flush_write; enum { ON_ERROR_UNREGISTER, ON_ERROR_PANIC, } on_error; #define DEFAULT_IO_ERROR_LIMIT 8 unsigned int error_limit; unsigned int error_decay; unsigned short journal_delay_ms; bool expensive_debug_checks; unsigned int verify:1; unsigned int key_merging_disabled:1; unsigned int gc_always_rewrite:1; unsigned int shrinker_disabled:1; unsigned int copy_gc_enabled:1; unsigned int idle_max_writeback_rate_enabled:1; #define BUCKET_HASH_BITS 12 struct hlist_head bucket_hash[1 << BUCKET_HASH_BITS]; }; struct bbio { unsigned int submit_time_us; union { struct bkey key; uint64_t _pad[3]; /* * We only need pad = 3 here because we only ever carry around a * single pointer - i.e. the pointer we're doing io to/from. */ }; struct bio bio; }; #define BTREE_PRIO USHRT_MAX #define INITIAL_PRIO 32768U #define btree_bytes(c) ((c)->btree_pages * PAGE_SIZE) #define btree_blocks(b) \ ((unsigned int) (KEY_SIZE(&b->key) >> (b)->c->block_bits)) #define btree_default_blocks(c) \ ((unsigned int) ((PAGE_SECTORS * (c)->btree_pages) >> (c)->block_bits)) #define bucket_bytes(ca) ((ca)->sb.bucket_size << 9) #define block_bytes(ca) ((ca)->sb.block_size << 9) static inline unsigned int meta_bucket_pages(struct cache_sb *sb) { unsigned int n, max_pages; max_pages = min_t(unsigned int, __rounddown_pow_of_two(USHRT_MAX) / PAGE_SECTORS, MAX_ORDER_NR_PAGES); n = sb->bucket_size / PAGE_SECTORS; if (n > max_pages) n = max_pages; return n; } static inline unsigned int meta_bucket_bytes(struct cache_sb *sb) { return meta_bucket_pages(sb) << PAGE_SHIFT; } #define prios_per_bucket(ca) \ ((meta_bucket_bytes(&(ca)->sb) - sizeof(struct prio_set)) / \ sizeof(struct bucket_disk)) #define prio_buckets(ca) \ DIV_ROUND_UP((size_t) (ca)->sb.nbuckets, prios_per_bucket(ca)) static inline size_t sector_to_bucket(struct cache_set *c, sector_t s) { return s >> c->bucket_bits; } static inline sector_t bucket_to_sector(struct cache_set *c, size_t b) { return ((sector_t) b) << c->bucket_bits; } static inline sector_t bucket_remainder(struct cache_set *c, sector_t s) { return s & (c->cache->sb.bucket_size - 1); } static inline size_t PTR_BUCKET_NR(struct cache_set *c, const struct bkey *k, unsigned int ptr) { return sector_to_bucket(c, PTR_OFFSET(k, ptr)); } static inline struct bucket *PTR_BUCKET(struct cache_set *c, const struct bkey *k, unsigned int ptr) { return c->cache->buckets + PTR_BUCKET_NR(c, k, ptr); } static inline uint8_t gen_after(uint8_t a, uint8_t b) { uint8_t r = a - b; return r > 128U ? 0 : r; } static inline uint8_t ptr_stale(struct cache_set *c, const struct bkey *k, unsigned int i) { return gen_after(PTR_BUCKET(c, k, i)->gen, PTR_GEN(k, i)); } static inline bool ptr_available(struct cache_set *c, const struct bkey *k, unsigned int i) { return (PTR_DEV(k, i) < MAX_CACHES_PER_SET) && c->cache; } /* Btree key macros */ /* * This is used for various on disk data structures - cache_sb, prio_set, bset, * jset: The checksum is _always_ the first 8 bytes of these structs */ #define csum_set(i) \ bch_crc64(((void *) (i)) + sizeof(uint64_t), \ ((void *) bset_bkey_last(i)) - \ (((void *) (i)) + sizeof(uint64_t))) /* Error handling macros */ #define btree_bug(b, ...) \ do { \ if (bch_cache_set_error((b)->c, __VA_ARGS__)) \ dump_stack(); \ } while (0) #define cache_bug(c, ...) \ do { \ if (bch_cache_set_error(c, __VA_ARGS__)) \ dump_stack(); \ } while (0) #define btree_bug_on(cond, b, ...) \ do { \ if (cond) \ btree_bug(b, __VA_ARGS__); \ } while (0) #define cache_bug_on(cond, c, ...) \ do { \ if (cond) \ cache_bug(c, __VA_ARGS__); \ } while (0) #define cache_set_err_on(cond, c, ...) \ do { \ if (cond) \ bch_cache_set_error(c, __VA_ARGS__); \ } while (0) /* Looping macros */ #define for_each_bucket(b, ca) \ for (b = (ca)->buckets + (ca)->sb.first_bucket; \ b < (ca)->buckets + (ca)->sb.nbuckets; b++) static inline void cached_dev_put(struct cached_dev *dc) { if (refcount_dec_and_test(&dc->count)) schedule_work(&dc->detach); } static inline bool cached_dev_get(struct cached_dev *dc) { if (!refcount_inc_not_zero(&dc->count)) return false; /* Paired with the mb in cached_dev_attach */ smp_mb__after_atomic(); return true; } /* * bucket_gc_gen() returns the difference between the bucket's current gen and * the oldest gen of any pointer into that bucket in the btree (last_gc). */ static inline uint8_t bucket_gc_gen(struct bucket *b) { return b->gen - b->last_gc; } #define BUCKET_GC_GEN_MAX 96U #define kobj_attribute_write(n, fn) \ static struct kobj_attribute ksysfs_##n = __ATTR(n, 0200, NULL, fn) #define kobj_attribute_rw(n, show, store) \ static struct kobj_attribute ksysfs_##n = \ __ATTR(n, 0600, show, store) static inline void wake_up_allocators(struct cache_set *c) { struct cache *ca = c->cache; wake_up_process(ca->alloc_thread); } static inline void closure_bio_submit(struct cache_set *c, struct bio *bio, struct closure *cl) { closure_get(cl); if (unlikely(test_bit(CACHE_SET_IO_DISABLE, &c->flags))) { bio->bi_status = BLK_STS_IOERR; bio_endio(bio); return; } submit_bio_noacct(bio); } /* * Prevent the kthread exits directly, and make sure when kthread_stop() * is called to stop a kthread, it is still alive. If a kthread might be * stopped by CACHE_SET_IO_DISABLE bit set, wait_for_kthread_stop() is * necessary before the kthread returns. */ static inline void wait_for_kthread_stop(void) { while (!kthread_should_stop()) { set_current_state(TASK_INTERRUPTIBLE); schedule(); } } /* Forward declarations */ void bch_count_backing_io_errors(struct cached_dev *dc, struct bio *bio); void bch_count_io_errors(struct cache *ca, blk_status_t error, int is_read, const char *m); void bch_bbio_count_io_errors(struct cache_set *c, struct bio *bio, blk_status_t error, const char *m); void bch_bbio_endio(struct cache_set *c, struct bio *bio, blk_status_t error, const char *m); void bch_bbio_free(struct bio *bio, struct cache_set *c); struct bio *bch_bbio_alloc(struct cache_set *c); void __bch_submit_bbio(struct bio *bio, struct cache_set *c); void bch_submit_bbio(struct bio *bio, struct cache_set *c, struct bkey *k, unsigned int ptr); uint8_t bch_inc_gen(struct cache *ca, struct bucket *b); void bch_rescale_priorities(struct cache_set *c, int sectors); bool bch_can_invalidate_bucket(struct cache *ca, struct bucket *b); void __bch_invalidate_one_bucket(struct cache *ca, struct bucket *b); void __bch_bucket_free(struct cache *ca, struct bucket *b); void bch_bucket_free(struct cache_set *c, struct bkey *k); long bch_bucket_alloc(struct cache *ca, unsigned int reserve, bool wait); int __bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve, struct bkey *k, bool wait); int bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve, struct bkey *k, bool wait); bool bch_alloc_sectors(struct cache_set *c, struct bkey *k, unsigned int sectors, unsigned int write_point, unsigned int write_prio, bool wait); bool bch_cached_dev_error(struct cached_dev *dc); __printf(2, 3) bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...); int bch_prio_write(struct cache *ca, bool wait); void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent); extern struct workqueue_struct *bcache_wq; extern struct workqueue_struct *bch_journal_wq; extern struct workqueue_struct *bch_flush_wq; extern struct mutex bch_register_lock; extern struct list_head bch_cache_sets; extern const struct kobj_type bch_cached_dev_ktype; extern const struct kobj_type bch_flash_dev_ktype; extern const struct kobj_type bch_cache_set_ktype; extern const struct kobj_type bch_cache_set_internal_ktype; extern const struct kobj_type bch_cache_ktype; void bch_cached_dev_release(struct kobject *kobj); void bch_flash_dev_release(struct kobject *kobj); void bch_cache_set_release(struct kobject *kobj); void bch_cache_release(struct kobject *kobj); int bch_uuid_write(struct cache_set *c); void bcache_write_super(struct cache_set *c); int bch_flash_dev_create(struct cache_set *c, uint64_t size); int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, uint8_t *set_uuid); void bch_cached_dev_detach(struct cached_dev *dc); int bch_cached_dev_run(struct cached_dev *dc); void bcache_device_stop(struct bcache_device *d); void bch_cache_set_unregister(struct cache_set *c); void bch_cache_set_stop(struct cache_set *c); struct cache_set *bch_cache_set_alloc(struct cache_sb *sb); void bch_btree_cache_free(struct cache_set *c); int bch_btree_cache_alloc(struct cache_set *c); void bch_moving_init_cache_set(struct cache_set *c); int bch_open_buckets_alloc(struct cache_set *c); void bch_open_buckets_free(struct cache_set *c); int bch_cache_allocator_start(struct cache *ca); void bch_debug_exit(void); void bch_debug_init(void); void bch_request_exit(void); int bch_request_init(void); void bch_btree_exit(void); int bch_btree_init(void); #endif /* _BCACHE_H */
2 36 47 36 52 52 46 40 40 46 13 13 13 13 46 45 13 13 13 9 13 46 13 32 32 32 32 32 12 12 12 12 12 12 12 52 13 37 46 36 47 47 46 46 46 46 46 46 46 6 13 2 46 46 32 32 32 12 12 32 32 32 32 2 2 2 10 32 46 39 32 32 32 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2008-2014 Mathieu Desnoyers */ #include <linux/module.h> #include <linux/mutex.h> #include <linux/types.h> #include <linux/jhash.h> #include <linux/list.h> #include <linux/rcupdate.h> #include <linux/tracepoint.h> #include <linux/err.h> #include <linux/slab.h> #include <linux/sched/signal.h> #include <linux/sched/task.h> #include <linux/static_key.h> enum tp_func_state { TP_FUNC_0, TP_FUNC_1, TP_FUNC_2, TP_FUNC_N, }; extern tracepoint_ptr_t __start___tracepoints_ptrs[]; extern tracepoint_ptr_t __stop___tracepoints_ptrs[]; enum tp_transition_sync { TP_TRANSITION_SYNC_1_0_1, TP_TRANSITION_SYNC_N_2_1, _NR_TP_TRANSITION_SYNC, }; struct tp_transition_snapshot { unsigned long rcu; bool ongoing; }; /* Protected by tracepoints_mutex */ static struct tp_transition_snapshot tp_transition_snapshot[_NR_TP_TRANSITION_SYNC]; static void tp_rcu_get_state(enum tp_transition_sync sync) { struct tp_transition_snapshot *snapshot = &tp_transition_snapshot[sync]; /* Keep the latest get_state snapshot. */ snapshot->rcu = get_state_synchronize_rcu(); snapshot->ongoing = true; } static void tp_rcu_cond_sync(enum tp_transition_sync sync) { struct tp_transition_snapshot *snapshot = &tp_transition_snapshot[sync]; if (!snapshot->ongoing) return; cond_synchronize_rcu(snapshot->rcu); snapshot->ongoing = false; } /* Set to 1 to enable tracepoint debug output */ static const int tracepoint_debug; #ifdef CONFIG_MODULES /* * Tracepoint module list mutex protects the local module list. */ static DEFINE_MUTEX(tracepoint_module_list_mutex); /* Local list of struct tp_module */ static LIST_HEAD(tracepoint_module_list); #endif /* CONFIG_MODULES */ /* * tracepoints_mutex protects the builtin and module tracepoints. * tracepoints_mutex nests inside tracepoint_module_list_mutex. */ static DEFINE_MUTEX(tracepoints_mutex); /* * Note about RCU : * It is used to delay the free of multiple probes array until a quiescent * state is reached. */ struct tp_probes { struct rcu_head rcu; struct tracepoint_func probes[]; }; /* Called in removal of a func but failed to allocate a new tp_funcs */ static void tp_stub_func(void) { return; } static inline void *allocate_probes(int count) { struct tp_probes *p = kmalloc(struct_size(p, probes, count), GFP_KERNEL); return p == NULL ? NULL : p->probes; } static void rcu_free_old_probes(struct rcu_head *head) { kfree(container_of(head, struct tp_probes, rcu)); } static inline void release_probes(struct tracepoint *tp, struct tracepoint_func *old) { if (old) { struct tp_probes *tp_probes = container_of(old, struct tp_probes, probes[0]); if (tracepoint_is_faultable(tp)) call_rcu_tasks_trace(&tp_probes->rcu, rcu_free_old_probes); else call_rcu(&tp_probes->rcu, rcu_free_old_probes); } } static void debug_print_probes(struct tracepoint_func *funcs) { int i; if (!tracepoint_debug || !funcs) return; for (i = 0; funcs[i].func; i++) printk(KERN_DEBUG "Probe %d : %pSb\n", i, funcs[i].func); } static struct tracepoint_func * func_add(struct tracepoint_func **funcs, struct tracepoint_func *tp_func, int prio) { struct tracepoint_func *old, *new; int iter_probes; /* Iterate over old probe array. */ int nr_probes = 0; /* Counter for probes */ int pos = -1; /* Insertion position into new array */ if (WARN_ON(!tp_func->func)) return ERR_PTR(-EINVAL); debug_print_probes(*funcs); old = *funcs; if (old) { /* (N -> N+1), (N != 0, 1) probes */ for (iter_probes = 0; old[iter_probes].func; iter_probes++) { if (old[iter_probes].func == tp_stub_func) continue; /* Skip stub functions. */ if (old[iter_probes].func == tp_func->func && old[iter_probes].data == tp_func->data) return ERR_PTR(-EEXIST); nr_probes++; } } /* + 2 : one for new probe, one for NULL func */ new = allocate_probes(nr_probes + 2); if (new == NULL) return ERR_PTR(-ENOMEM); if (old) { nr_probes = 0; for (iter_probes = 0; old[iter_probes].func; iter_probes++) { if (old[iter_probes].func == tp_stub_func) continue; /* Insert before probes of lower priority */ if (pos < 0 && old[iter_probes].prio < prio) pos = nr_probes++; new[nr_probes++] = old[iter_probes]; } if (pos < 0) pos = nr_probes++; /* nr_probes now points to the end of the new array */ } else { pos = 0; nr_probes = 1; /* must point at end of array */ } new[pos] = *tp_func; new[nr_probes].func = NULL; *funcs = new; debug_print_probes(*funcs); return old; } static void *func_remove(struct tracepoint_func **funcs, struct tracepoint_func *tp_func) { int nr_probes = 0, nr_del = 0, i; struct tracepoint_func *old, *new; old = *funcs; if (!old) return ERR_PTR(-ENOENT); debug_print_probes(*funcs); /* (N -> M), (N > 1, M >= 0) probes */ if (tp_func->func) { for (nr_probes = 0; old[nr_probes].func; nr_probes++) { if ((old[nr_probes].func == tp_func->func && old[nr_probes].data == tp_func->data) || old[nr_probes].func == tp_stub_func) nr_del++; } } /* * If probe is NULL, then nr_probes = nr_del = 0, and then the * entire entry will be removed. */ if (nr_probes - nr_del == 0) { /* N -> 0, (N > 1) */ *funcs = NULL; debug_print_probes(*funcs); return old; } else { int j = 0; /* N -> M, (N > 1, M > 0) */ /* + 1 for NULL */ new = allocate_probes(nr_probes - nr_del + 1); if (new) { for (i = 0; old[i].func; i++) { if ((old[i].func != tp_func->func || old[i].data != tp_func->data) && old[i].func != tp_stub_func) new[j++] = old[i]; } new[nr_probes - nr_del].func = NULL; *funcs = new; } else { /* * Failed to allocate, replace the old function * with calls to tp_stub_func. */ for (i = 0; old[i].func; i++) { if (old[i].func == tp_func->func && old[i].data == tp_func->data) WRITE_ONCE(old[i].func, tp_stub_func); } *funcs = old; } } debug_print_probes(*funcs); return old; } /* * Count the number of functions (enum tp_func_state) in a tp_funcs array. */ static enum tp_func_state nr_func_state(const struct tracepoint_func *tp_funcs) { if (!tp_funcs) return TP_FUNC_0; if (!tp_funcs[1].func) return TP_FUNC_1; if (!tp_funcs[2].func) return TP_FUNC_2; return TP_FUNC_N; /* 3 or more */ } static void tracepoint_update_call(struct tracepoint *tp, struct tracepoint_func *tp_funcs) { void *func = tp->iterator; /* Synthetic events do not have static call sites */ if (!tp->static_call_key) return; if (nr_func_state(tp_funcs) == TP_FUNC_1) func = tp_funcs[0].func; __static_call_update(tp->static_call_key, tp->static_call_tramp, func); } /* * Add the probe function to a tracepoint. */ static int tracepoint_add_func(struct tracepoint *tp, struct tracepoint_func *func, int prio, bool warn) { struct tracepoint_func *old, *tp_funcs; int ret; if (tp->ext && tp->ext->regfunc && !static_key_enabled(&tp->key)) { ret = tp->ext->regfunc(); if (ret < 0) return ret; } tp_funcs = rcu_dereference_protected(tp->funcs, lockdep_is_held(&tracepoints_mutex)); old = func_add(&tp_funcs, func, prio); if (IS_ERR(old)) { WARN_ON_ONCE(warn && PTR_ERR(old) != -ENOMEM); return PTR_ERR(old); } /* * rcu_assign_pointer has as smp_store_release() which makes sure * that the new probe callbacks array is consistent before setting * a pointer to it. This array is referenced by __DO_TRACE from * include/linux/tracepoint.h using rcu_dereference_sched(). */ switch (nr_func_state(tp_funcs)) { case TP_FUNC_1: /* 0->1 */ /* * Make sure new static func never uses old data after a * 1->0->1 transition sequence. */ tp_rcu_cond_sync(TP_TRANSITION_SYNC_1_0_1); /* Set static call to first function */ tracepoint_update_call(tp, tp_funcs); /* Both iterator and static call handle NULL tp->funcs */ rcu_assign_pointer(tp->funcs, tp_funcs); static_branch_enable(&tp->key); break; case TP_FUNC_2: /* 1->2 */ /* Set iterator static call */ tracepoint_update_call(tp, tp_funcs); /* * Iterator callback installed before updating tp->funcs. * Requires ordering between RCU assign/dereference and * static call update/call. */ fallthrough; case TP_FUNC_N: /* N->N+1 (N>1) */ rcu_assign_pointer(tp->funcs, tp_funcs); /* * Make sure static func never uses incorrect data after a * N->...->2->1 (N>1) transition sequence. */ if (tp_funcs[0].data != old[0].data) tp_rcu_get_state(TP_TRANSITION_SYNC_N_2_1); break; default: WARN_ON_ONCE(1); break; } release_probes(tp, old); return 0; } /* * Remove a probe function from a tracepoint. * Note: only waiting an RCU period after setting elem->call to the empty * function insures that the original callback is not used anymore. This insured * by preempt_disable around the call site. */ static int tracepoint_remove_func(struct tracepoint *tp, struct tracepoint_func *func) { struct tracepoint_func *old, *tp_funcs; tp_funcs = rcu_dereference_protected(tp->funcs, lockdep_is_held(&tracepoints_mutex)); old = func_remove(&tp_funcs, func); if (WARN_ON_ONCE(IS_ERR(old))) return PTR_ERR(old); if (tp_funcs == old) /* Failed allocating new tp_funcs, replaced func with stub */ return 0; switch (nr_func_state(tp_funcs)) { case TP_FUNC_0: /* 1->0 */ /* Removed last function */ if (tp->ext && tp->ext->unregfunc && static_key_enabled(&tp->key)) tp->ext->unregfunc(); static_branch_disable(&tp->key); /* Set iterator static call */ tracepoint_update_call(tp, tp_funcs); /* Both iterator and static call handle NULL tp->funcs */ rcu_assign_pointer(tp->funcs, NULL); /* * Make sure new static func never uses old data after a * 1->0->1 transition sequence. */ tp_rcu_get_state(TP_TRANSITION_SYNC_1_0_1); break; case TP_FUNC_1: /* 2->1 */ rcu_assign_pointer(tp->funcs, tp_funcs); /* * Make sure static func never uses incorrect data after a * N->...->2->1 (N>2) transition sequence. If the first * element's data has changed, then force the synchronization * to prevent current readers that have loaded the old data * from calling the new function. */ if (tp_funcs[0].data != old[0].data) tp_rcu_get_state(TP_TRANSITION_SYNC_N_2_1); tp_rcu_cond_sync(TP_TRANSITION_SYNC_N_2_1); /* Set static call to first function */ tracepoint_update_call(tp, tp_funcs); break; case TP_FUNC_2: /* N->N-1 (N>2) */ fallthrough; case TP_FUNC_N: rcu_assign_pointer(tp->funcs, tp_funcs); /* * Make sure static func never uses incorrect data after a * N->...->2->1 (N>2) transition sequence. */ if (tp_funcs[0].data != old[0].data) tp_rcu_get_state(TP_TRANSITION_SYNC_N_2_1); break; default: WARN_ON_ONCE(1); break; } release_probes(tp, old); return 0; } /** * tracepoint_probe_register_prio_may_exist - Connect a probe to a tracepoint with priority * @tp: tracepoint * @probe: probe handler * @data: tracepoint data * @prio: priority of this function over other registered functions * * Same as tracepoint_probe_register_prio() except that it will not warn * if the tracepoint is already registered. */ int tracepoint_probe_register_prio_may_exist(struct tracepoint *tp, void *probe, void *data, int prio) { struct tracepoint_func tp_func; int ret; mutex_lock(&tracepoints_mutex); tp_func.func = probe; tp_func.data = data; tp_func.prio = prio; ret = tracepoint_add_func(tp, &tp_func, prio, false); mutex_unlock(&tracepoints_mutex); return ret; } EXPORT_SYMBOL_GPL(tracepoint_probe_register_prio_may_exist); /** * tracepoint_probe_register_prio - Connect a probe to a tracepoint with priority * @tp: tracepoint * @probe: probe handler * @data: tracepoint data * @prio: priority of this function over other registered functions * * Returns 0 if ok, error value on error. * Note: if @tp is within a module, the caller is responsible for * unregistering the probe before the module is gone. This can be * performed either with a tracepoint module going notifier, or from * within module exit functions. */ int tracepoint_probe_register_prio(struct tracepoint *tp, void *probe, void *data, int prio) { struct tracepoint_func tp_func; int ret; mutex_lock(&tracepoints_mutex); tp_func.func = probe; tp_func.data = data; tp_func.prio = prio; ret = tracepoint_add_func(tp, &tp_func, prio, true); mutex_unlock(&tracepoints_mutex); return ret; } EXPORT_SYMBOL_GPL(tracepoint_probe_register_prio); /** * tracepoint_probe_register - Connect a probe to a tracepoint * @tp: tracepoint * @probe: probe handler * @data: tracepoint data * * Returns 0 if ok, error value on error. * Note: if @tp is within a module, the caller is responsible for * unregistering the probe before the module is gone. This can be * performed either with a tracepoint module going notifier, or from * within module exit functions. */ int tracepoint_probe_register(struct tracepoint *tp, void *probe, void *data) { return tracepoint_probe_register_prio(tp, probe, data, TRACEPOINT_DEFAULT_PRIO); } EXPORT_SYMBOL_GPL(tracepoint_probe_register); /** * tracepoint_probe_unregister - Disconnect a probe from a tracepoint * @tp: tracepoint * @probe: probe function pointer * @data: tracepoint data * * Returns 0 if ok, error value on error. */ int tracepoint_probe_unregister(struct tracepoint *tp, void *probe, void *data) { struct tracepoint_func tp_func; int ret; mutex_lock(&tracepoints_mutex); tp_func.func = probe; tp_func.data = data; ret = tracepoint_remove_func(tp, &tp_func); mutex_unlock(&tracepoints_mutex); return ret; } EXPORT_SYMBOL_GPL(tracepoint_probe_unregister); static void for_each_tracepoint_range( tracepoint_ptr_t *begin, tracepoint_ptr_t *end, void (*fct)(struct tracepoint *tp, void *priv), void *priv) { tracepoint_ptr_t *iter; if (!begin) return; for (iter = begin; iter < end; iter++) fct(tracepoint_ptr_deref(iter), priv); } #ifdef CONFIG_MODULES bool trace_module_has_bad_taint(struct module *mod) { return mod->taints & ~((1 << TAINT_OOT_MODULE) | (1 << TAINT_CRAP) | (1 << TAINT_UNSIGNED_MODULE) | (1 << TAINT_TEST) | (1 << TAINT_LIVEPATCH)); } static BLOCKING_NOTIFIER_HEAD(tracepoint_notify_list); /** * register_tracepoint_module_notifier - register tracepoint coming/going notifier * @nb: notifier block * * Notifiers registered with this function are called on module * coming/going with the tracepoint_module_list_mutex held. * The notifier block callback should expect a "struct tp_module" data * pointer. */ int register_tracepoint_module_notifier(struct notifier_block *nb) { struct tp_module *tp_mod; int ret; mutex_lock(&tracepoint_module_list_mutex); ret = blocking_notifier_chain_register(&tracepoint_notify_list, nb); if (ret) goto end; list_for_each_entry(tp_mod, &tracepoint_module_list, list) (void) nb->notifier_call(nb, MODULE_STATE_COMING, tp_mod); end: mutex_unlock(&tracepoint_module_list_mutex); return ret; } EXPORT_SYMBOL_GPL(register_tracepoint_module_notifier); /** * unregister_tracepoint_module_notifier - unregister tracepoint coming/going notifier * @nb: notifier block * * The notifier block callback should expect a "struct tp_module" data * pointer. */ int unregister_tracepoint_module_notifier(struct notifier_block *nb) { struct tp_module *tp_mod; int ret; mutex_lock(&tracepoint_module_list_mutex); ret = blocking_notifier_chain_unregister(&tracepoint_notify_list, nb); if (ret) goto end; list_for_each_entry(tp_mod, &tracepoint_module_list, list) (void) nb->notifier_call(nb, MODULE_STATE_GOING, tp_mod); end: mutex_unlock(&tracepoint_module_list_mutex); return ret; } EXPORT_SYMBOL_GPL(unregister_tracepoint_module_notifier); /* * Ensure the tracer unregistered the module's probes before the module * teardown is performed. Prevents leaks of probe and data pointers. */ static void tp_module_going_check_quiescent(struct tracepoint *tp, void *priv) { WARN_ON_ONCE(tp->funcs); } static int tracepoint_module_coming(struct module *mod) { struct tp_module *tp_mod; if (!mod->num_tracepoints) return 0; /* * We skip modules that taint the kernel, especially those with different * module headers (for forced load), to make sure we don't cause a crash. * Staging, out-of-tree, unsigned GPL, and test modules are fine. */ if (trace_module_has_bad_taint(mod)) return 0; tp_mod = kmalloc(sizeof(struct tp_module), GFP_KERNEL); if (!tp_mod) return -ENOMEM; tp_mod->mod = mod; mutex_lock(&tracepoint_module_list_mutex); list_add_tail(&tp_mod->list, &tracepoint_module_list); blocking_notifier_call_chain(&tracepoint_notify_list, MODULE_STATE_COMING, tp_mod); mutex_unlock(&tracepoint_module_list_mutex); return 0; } static void tracepoint_module_going(struct module *mod) { struct tp_module *tp_mod; if (!mod->num_tracepoints) return; mutex_lock(&tracepoint_module_list_mutex); list_for_each_entry(tp_mod, &tracepoint_module_list, list) { if (tp_mod->mod == mod) { blocking_notifier_call_chain(&tracepoint_notify_list, MODULE_STATE_GOING, tp_mod); list_del(&tp_mod->list); kfree(tp_mod); /* * Called the going notifier before checking for * quiescence. */ for_each_tracepoint_range(mod->tracepoints_ptrs, mod->tracepoints_ptrs + mod->num_tracepoints, tp_module_going_check_quiescent, NULL); break; } } /* * In the case of modules that were tainted at "coming", we'll simply * walk through the list without finding it. We cannot use the "tainted" * flag on "going", in case a module taints the kernel only after being * loaded. */ mutex_unlock(&tracepoint_module_list_mutex); } static int tracepoint_module_notify(struct notifier_block *self, unsigned long val, void *data) { struct module *mod = data; int ret = 0; switch (val) { case MODULE_STATE_COMING: ret = tracepoint_module_coming(mod); break; case MODULE_STATE_LIVE: break; case MODULE_STATE_GOING: tracepoint_module_going(mod); break; case MODULE_STATE_UNFORMED: break; } return notifier_from_errno(ret); } static struct notifier_block tracepoint_module_nb = { .notifier_call = tracepoint_module_notify, .priority = 0, }; static __init int init_tracepoints(void) { int ret; ret = register_module_notifier(&tracepoint_module_nb); if (ret) pr_warn("Failed to register tracepoint module enter notifier\n"); return ret; } __initcall(init_tracepoints); /** * for_each_tracepoint_in_module - iteration on all tracepoints in a module * @mod: module * @fct: callback * @priv: private data */ void for_each_tracepoint_in_module(struct module *mod, void (*fct)(struct tracepoint *tp, struct module *mod, void *priv), void *priv) { tracepoint_ptr_t *begin, *end, *iter; lockdep_assert_held(&tracepoint_module_list_mutex); if (!mod) return; begin = mod->tracepoints_ptrs; end = mod->tracepoints_ptrs + mod->num_tracepoints; for (iter = begin; iter < end; iter++) fct(tracepoint_ptr_deref(iter), mod, priv); } /** * for_each_module_tracepoint - iteration on all tracepoints in all modules * @fct: callback * @priv: private data */ void for_each_module_tracepoint(void (*fct)(struct tracepoint *tp, struct module *mod, void *priv), void *priv) { struct tp_module *tp_mod; mutex_lock(&tracepoint_module_list_mutex); list_for_each_entry(tp_mod, &tracepoint_module_list, list) for_each_tracepoint_in_module(tp_mod->mod, fct, priv); mutex_unlock(&tracepoint_module_list_mutex); } #endif /* CONFIG_MODULES */ /** * for_each_kernel_tracepoint - iteration on all kernel tracepoints * @fct: callback * @priv: private data */ void for_each_kernel_tracepoint(void (*fct)(struct tracepoint *tp, void *priv), void *priv) { for_each_tracepoint_range(__start___tracepoints_ptrs, __stop___tracepoints_ptrs, fct, priv); } EXPORT_SYMBOL_GPL(for_each_kernel_tracepoint); #ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS /* NB: reg/unreg are called while guarded with the tracepoints_mutex */ static int sys_tracepoint_refcount; int syscall_regfunc(void) { struct task_struct *p, *t; if (!sys_tracepoint_refcount) { read_lock(&tasklist_lock); for_each_process_thread(p, t) { set_task_syscall_work(t, SYSCALL_TRACEPOINT); } read_unlock(&tasklist_lock); } sys_tracepoint_refcount++; return 0; } void syscall_unregfunc(void) { struct task_struct *p, *t; sys_tracepoint_refcount--; if (!sys_tracepoint_refcount) { read_lock(&tasklist_lock); for_each_process_thread(p, t) { clear_task_syscall_work(t, SYSCALL_TRACEPOINT); } read_unlock(&tasklist_lock); } } #endif
56 56 48 48 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef LINUX_IOMAP_H #define LINUX_IOMAP_H 1 #include <linux/atomic.h> #include <linux/bitmap.h> #include <linux/blk_types.h> #include <linux/mm.h> #include <linux/types.h> #include <linux/mm_types.h> #include <linux/blkdev.h> #include <linux/pagevec.h> struct address_space; struct fiemap_extent_info; struct inode; struct iomap_iter; struct iomap_dio; struct iomap_writepage_ctx; struct iomap_read_folio_ctx; struct iov_iter; struct kiocb; struct page; struct vm_area_struct; struct vm_fault; /* * Types of block ranges for iomap mappings: */ #define IOMAP_HOLE 0 /* no blocks allocated, need allocation */ #define IOMAP_DELALLOC 1 /* delayed allocation blocks */ #define IOMAP_MAPPED 2 /* blocks allocated at @addr */ #define IOMAP_UNWRITTEN 3 /* blocks allocated at @addr in unwritten state */ #define IOMAP_INLINE 4 /* data inline in the inode */ /* * Flags reported by the file system from iomap_begin: * * IOMAP_F_NEW indicates that the blocks have been newly allocated and need * zeroing for areas that no data is copied to. * * IOMAP_F_DIRTY indicates the inode has uncommitted metadata needed to access * written data and requires fdatasync to commit them to persistent storage. * This needs to take into account metadata changes that *may* be made at IO * completion, such as file size updates from direct IO. * * IOMAP_F_SHARED indicates that the blocks are shared, and will need to be * unshared as part a write. * * IOMAP_F_MERGED indicates that the iomap contains the merge of multiple block * mappings. * * IOMAP_F_BUFFER_HEAD indicates that the file system requires the use of * buffer heads for this mapping. * * IOMAP_F_XATTR indicates that the iomap is for an extended attribute extent * rather than a file data extent. * * IOMAP_F_BOUNDARY indicates that I/O and I/O completions for this iomap must * never be merged with the mapping before it. * * IOMAP_F_ANON_WRITE indicates that (write) I/O does not have a target block * assigned to it yet and the file system will do that in the bio submission * handler, splitting the I/O as needed. * * IOMAP_F_ATOMIC_BIO indicates that (write) I/O will be issued as an atomic * bio, i.e. set REQ_ATOMIC. */ #define IOMAP_F_NEW (1U << 0) #define IOMAP_F_DIRTY (1U << 1) #define IOMAP_F_SHARED (1U << 2) #define IOMAP_F_MERGED (1U << 3) #ifdef CONFIG_BUFFER_HEAD #define IOMAP_F_BUFFER_HEAD (1U << 4) #else #define IOMAP_F_BUFFER_HEAD 0 #endif /* CONFIG_BUFFER_HEAD */ #define IOMAP_F_XATTR (1U << 5) #define IOMAP_F_BOUNDARY (1U << 6) #define IOMAP_F_ANON_WRITE (1U << 7) #define IOMAP_F_ATOMIC_BIO (1U << 8) /* * Flag reserved for file system specific usage */ #define IOMAP_F_PRIVATE (1U << 12) /* * Flags set by the core iomap code during operations: * * IOMAP_F_SIZE_CHANGED indicates to the iomap_end method that the file size * has changed as the result of this write operation. * * IOMAP_F_STALE indicates that the iomap is not valid any longer and the file * range it covers needs to be remapped by the high level before the operation * can proceed. */ #define IOMAP_F_SIZE_CHANGED (1U << 14) #define IOMAP_F_STALE (1U << 15) /* * Magic value for addr: */ #define IOMAP_NULL_ADDR -1ULL /* addr is not valid */ struct iomap { u64 addr; /* disk offset of mapping, bytes */ loff_t offset; /* file offset of mapping, bytes */ u64 length; /* length of mapping, bytes */ u16 type; /* type of mapping */ u16 flags; /* flags for mapping */ struct block_device *bdev; /* block device for I/O */ struct dax_device *dax_dev; /* dax_dev for dax operations */ void *inline_data; void *private; /* filesystem private */ u64 validity_cookie; /* used with .iomap_valid() */ }; static inline sector_t iomap_sector(const struct iomap *iomap, loff_t pos) { if (iomap->flags & IOMAP_F_ANON_WRITE) return U64_MAX; /* invalid */ return (iomap->addr + pos - iomap->offset) >> SECTOR_SHIFT; } /* * Returns the inline data pointer for logical offset @pos. */ static inline void *iomap_inline_data(const struct iomap *iomap, loff_t pos) { return iomap->inline_data + pos - iomap->offset; } /* * Check if the mapping's length is within the valid range for inline data. * This is used to guard against accessing data beyond the page inline_data * points at. */ static inline bool iomap_inline_data_valid(const struct iomap *iomap) { return iomap->length <= PAGE_SIZE - offset_in_page(iomap->inline_data); } /* * When get_folio succeeds, put_folio will always be called to do any * cleanup work necessary. put_folio is responsible for unlocking and putting * @folio. */ struct iomap_write_ops { struct folio *(*get_folio)(struct iomap_iter *iter, loff_t pos, unsigned len); void (*put_folio)(struct inode *inode, loff_t pos, unsigned copied, struct folio *folio); /* * Check that the cached iomap still maps correctly to the filesystem's * internal extent map. FS internal extent maps can change while iomap * is iterating a cached iomap, so this hook allows iomap to detect that * the iomap needs to be refreshed during a long running write * operation. * * The filesystem can store internal state (e.g. a sequence number) in * iomap->validity_cookie when the iomap is first mapped to be able to * detect changes between mapping time and whenever .iomap_valid() is * called. * * This is called with the folio over the specified file position held * locked by the iomap code. */ bool (*iomap_valid)(struct inode *inode, const struct iomap *iomap); /* * Optional if the filesystem wishes to provide a custom handler for * reading in the contents of a folio, otherwise iomap will default to * submitting a bio read request. * * The read must be done synchronously. */ int (*read_folio_range)(const struct iomap_iter *iter, struct folio *folio, loff_t pos, size_t len); }; /* * Flags for iomap_begin / iomap_end. No flag implies a read. */ #define IOMAP_WRITE (1 << 0) /* writing, must allocate blocks */ #define IOMAP_ZERO (1 << 1) /* zeroing operation, may skip holes */ #define IOMAP_REPORT (1 << 2) /* report extent status, e.g. FIEMAP */ #define IOMAP_FAULT (1 << 3) /* mapping for page fault */ #define IOMAP_DIRECT (1 << 4) /* direct I/O */ #define IOMAP_NOWAIT (1 << 5) /* do not block */ #define IOMAP_OVERWRITE_ONLY (1 << 6) /* only pure overwrites allowed */ #define IOMAP_UNSHARE (1 << 7) /* unshare_file_range */ #ifdef CONFIG_FS_DAX #define IOMAP_DAX (1 << 8) /* DAX mapping */ #else #define IOMAP_DAX 0 #endif /* CONFIG_FS_DAX */ #define IOMAP_ATOMIC (1 << 9) /* torn-write protection */ #define IOMAP_DONTCACHE (1 << 10) struct iomap_ops { /* * Return the existing mapping at pos, or reserve space starting at * pos for up to length, as long as we can do it as a single mapping. * The actual length is returned in iomap->length. */ int (*iomap_begin)(struct inode *inode, loff_t pos, loff_t length, unsigned flags, struct iomap *iomap, struct iomap *srcmap); /* * Commit and/or unreserve space previous allocated using iomap_begin. * Written indicates the length of the successful write operation which * needs to be commited, while the rest needs to be unreserved. * Written might be zero if no data was written. */ int (*iomap_end)(struct inode *inode, loff_t pos, loff_t length, ssize_t written, unsigned flags, struct iomap *iomap); }; /** * struct iomap_iter - Iterate through a range of a file * @inode: Set at the start of the iteration and should not change. * @pos: The current file position we are operating on. It is updated by * calls to iomap_iter(). Treat as read-only in the body. * @len: The remaining length of the file segment we're operating on. * It is updated at the same time as @pos. * @iter_start_pos: The original start pos for the current iomap. Used for * incremental iter advance. * @status: Status of the most recent iteration. Zero on success or a negative * errno on error. * @flags: Zero or more of the iomap_begin flags above. * @iomap: Map describing the I/O iteration * @srcmap: Source map for COW operations */ struct iomap_iter { struct inode *inode; loff_t pos; u64 len; loff_t iter_start_pos; int status; unsigned flags; struct iomap iomap; struct iomap srcmap; struct folio_batch *fbatch; void *private; }; int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops); int iomap_iter_advance(struct iomap_iter *iter, u64 count); /** * iomap_length_trim - trimmed length of the current iomap iteration * @iter: iteration structure * @pos: File position to trim from. * @len: Length of the mapping to trim to. * * Returns a trimmed length that the operation applies to for the current * iteration. */ static inline u64 iomap_length_trim(const struct iomap_iter *iter, loff_t pos, u64 len) { u64 end = iter->iomap.offset + iter->iomap.length; if (iter->srcmap.type != IOMAP_HOLE) end = min(end, iter->srcmap.offset + iter->srcmap.length); return min(len, end - pos); } /** * iomap_length - length of the current iomap iteration * @iter: iteration structure * * Returns the length that the operation applies to for the current iteration. */ static inline u64 iomap_length(const struct iomap_iter *iter) { return iomap_length_trim(iter, iter->pos, iter->len); } /** * iomap_iter_advance_full - advance by the full length of current map */ static inline int iomap_iter_advance_full(struct iomap_iter *iter) { return iomap_iter_advance(iter, iomap_length(iter)); } /** * iomap_iter_srcmap - return the source map for the current iomap iteration * @i: iteration structure * * Write operations on file systems with reflink support might require a * source and a destination map. This function retourns the source map * for a given operation, which may or may no be identical to the destination * map in &i->iomap. */ static inline const struct iomap *iomap_iter_srcmap(const struct iomap_iter *i) { if (i->srcmap.type != IOMAP_HOLE) return &i->srcmap; return &i->iomap; } /* * Return the file offset for the first unchanged block after a short write. * * If nothing was written, round @pos down to point at the first block in * the range, else round up to include the partially written block. */ static inline loff_t iomap_last_written_block(struct inode *inode, loff_t pos, ssize_t written) { if (unlikely(!written)) return round_down(pos, i_blocksize(inode)); return round_up(pos + written, i_blocksize(inode)); } /* * Check if the range needs to be unshared for a FALLOC_FL_UNSHARE_RANGE * operation. * * Don't bother with blocks that are not shared to start with; or mappings that * cannot be shared, such as inline data, delalloc reservations, holes or * unwritten extents. * * Note that we use srcmap directly instead of iomap_iter_srcmap as unsharing * requires providing a separate source map, and the presence of one is a good * indicator that unsharing is needed, unlike IOMAP_F_SHARED which can be set * for any data that goes into the COW fork for XFS. */ static inline bool iomap_want_unshare_iter(const struct iomap_iter *iter) { return (iter->iomap.flags & IOMAP_F_SHARED) && iter->srcmap.type == IOMAP_MAPPED; } ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from, const struct iomap_ops *ops, const struct iomap_write_ops *write_ops, void *private); void iomap_read_folio(const struct iomap_ops *ops, struct iomap_read_folio_ctx *ctx); void iomap_readahead(const struct iomap_ops *ops, struct iomap_read_folio_ctx *ctx); bool iomap_is_partially_uptodate(struct folio *, size_t from, size_t count); struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len); bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags); void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len); bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio); int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len, const struct iomap_ops *ops, const struct iomap_write_ops *write_ops); loff_t iomap_fill_dirty_folios(struct iomap_iter *iter, loff_t offset, loff_t length); int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, const struct iomap_ops *ops, const struct iomap_write_ops *write_ops, void *private); int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, const struct iomap_ops *ops, const struct iomap_write_ops *write_ops, void *private); vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops, void *private); typedef void (*iomap_punch_t)(struct inode *inode, loff_t offset, loff_t length, struct iomap *iomap); void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte, loff_t end_byte, unsigned flags, struct iomap *iomap, iomap_punch_t punch); int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len, const struct iomap_ops *ops); loff_t iomap_seek_hole(struct inode *inode, loff_t offset, const struct iomap_ops *ops); loff_t iomap_seek_data(struct inode *inode, loff_t offset, const struct iomap_ops *ops); sector_t iomap_bmap(struct address_space *mapping, sector_t bno, const struct iomap_ops *ops); /* * Flags for iomap_ioend->io_flags. */ /* shared COW extent */ #define IOMAP_IOEND_SHARED (1U << 0) /* unwritten extent */ #define IOMAP_IOEND_UNWRITTEN (1U << 1) /* don't merge into previous ioend */ #define IOMAP_IOEND_BOUNDARY (1U << 2) /* is direct I/O */ #define IOMAP_IOEND_DIRECT (1U << 3) /* is DONTCACHE I/O */ #define IOMAP_IOEND_DONTCACHE (1U << 4) /* * Flags that if set on either ioend prevent the merge of two ioends. * (IOMAP_IOEND_BOUNDARY also prevents merges, but only one-way) */ #define IOMAP_IOEND_NOMERGE_FLAGS \ (IOMAP_IOEND_SHARED | IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_DIRECT | \ IOMAP_IOEND_DONTCACHE) /* * Structure for writeback I/O completions. * * File systems can split a bio generated by iomap. In that case the parent * ioend it was split from is recorded in ioend->io_parent. */ struct iomap_ioend { struct list_head io_list; /* next ioend in chain */ u16 io_flags; /* IOMAP_IOEND_* */ struct inode *io_inode; /* file being written to */ size_t io_size; /* size of the extent */ atomic_t io_remaining; /* completetion defer count */ int io_error; /* stashed away status */ struct iomap_ioend *io_parent; /* parent for completions */ loff_t io_offset; /* offset in the file */ sector_t io_sector; /* start sector of ioend */ void *io_private; /* file system private data */ struct bio io_bio; /* MUST BE LAST! */ }; static inline struct iomap_ioend *iomap_ioend_from_bio(struct bio *bio) { return container_of(bio, struct iomap_ioend, io_bio); } struct iomap_writeback_ops { /* * Performs writeback on the passed in range * * Can map arbitrarily large regions, but we need to call into it at * least once per folio to allow the file systems to synchronize with * the write path that could be invalidating mappings. * * An existing mapping from a previous call to this method can be reused * by the file system if it is still valid. * * If this succeeds, iomap_finish_folio_write() must be called once * writeback completes for the range, regardless of whether the * writeback succeeded or failed. * * Returns the number of bytes processed or a negative errno. */ ssize_t (*writeback_range)(struct iomap_writepage_ctx *wpc, struct folio *folio, u64 pos, unsigned int len, u64 end_pos); /* * Submit a writeback context previously build up by ->writeback_range. * * Returns 0 if the context was successfully submitted, or a negative * error code if not. If @error is non-zero a failure occurred, and * the writeback context should be completed with an error. */ int (*writeback_submit)(struct iomap_writepage_ctx *wpc, int error); }; struct iomap_writepage_ctx { struct iomap iomap; struct inode *inode; struct writeback_control *wbc; const struct iomap_writeback_ops *ops; u32 nr_folios; /* folios added to the ioend */ void *wb_ctx; /* pending writeback context */ }; struct iomap_ioend *iomap_init_ioend(struct inode *inode, struct bio *bio, loff_t file_offset, u16 ioend_flags); struct iomap_ioend *iomap_split_ioend(struct iomap_ioend *ioend, unsigned int max_len, bool is_append); void iomap_finish_ioends(struct iomap_ioend *ioend, int error); void iomap_ioend_try_merge(struct iomap_ioend *ioend, struct list_head *more_ioends); void iomap_sort_ioends(struct list_head *ioend_list); ssize_t iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, struct folio *folio, loff_t pos, loff_t end_pos, unsigned int dirty_len); int iomap_ioend_writeback_submit(struct iomap_writepage_ctx *wpc, int error); void iomap_finish_folio_read(struct folio *folio, size_t off, size_t len, int error); void iomap_finish_folio_write(struct inode *inode, struct folio *folio, size_t len); int iomap_writeback_folio(struct iomap_writepage_ctx *wpc, struct folio *folio); int iomap_writepages(struct iomap_writepage_ctx *wpc); struct iomap_read_folio_ctx { const struct iomap_read_ops *ops; struct folio *cur_folio; struct readahead_control *rac; void *read_ctx; }; struct iomap_read_ops { /* * Read in a folio range. * * If this succeeds, iomap_finish_folio_read() must be called after the * range is read in, regardless of whether the read succeeded or failed. * * Returns 0 on success or a negative error on failure. */ int (*read_folio_range)(const struct iomap_iter *iter, struct iomap_read_folio_ctx *ctx, size_t len); /* * Submit any pending read requests. * * This is optional. */ void (*submit_read)(struct iomap_read_folio_ctx *ctx); }; /* * Flags for direct I/O ->end_io: */ #define IOMAP_DIO_UNWRITTEN (1 << 0) /* covers unwritten extent(s) */ #define IOMAP_DIO_COW (1 << 1) /* covers COW extent(s) */ struct iomap_dio_ops { int (*end_io)(struct kiocb *iocb, ssize_t size, int error, unsigned flags); void (*submit_io)(const struct iomap_iter *iter, struct bio *bio, loff_t file_offset); /* * Filesystems wishing to attach private information to a direct io bio * must provide a ->submit_io method that attaches the additional * information to the bio and changes the ->bi_end_io callback to a * custom function. This function should, at a minimum, perform any * relevant post-processing of the bio and end with a call to * iomap_dio_bio_end_io. */ struct bio_set *bio_set; }; /* * Wait for the I/O to complete in iomap_dio_rw even if the kiocb is not * synchronous. */ #define IOMAP_DIO_FORCE_WAIT (1 << 0) /* * Do not allocate blocks or zero partial blocks, but instead fall back to * the caller by returning -EAGAIN. Used to optimize direct I/O writes that * are not aligned to the file system block size. */ #define IOMAP_DIO_OVERWRITE_ONLY (1 << 1) /* * When a page fault occurs, return a partial synchronous result and allow * the caller to retry the rest of the operation after dealing with the page * fault. */ #define IOMAP_DIO_PARTIAL (1 << 2) /* * Ensure each bio is aligned to fs block size. * * For filesystems which need to calculate/verify the checksum of each fs * block. Otherwise they may not be able to handle unaligned bios. */ #define IOMAP_DIO_FSBLOCK_ALIGNED (1 << 3) ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops, const struct iomap_dio_ops *dops, unsigned int dio_flags, void *private, size_t done_before); struct iomap_dio *__iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops, const struct iomap_dio_ops *dops, unsigned int dio_flags, void *private, size_t done_before); ssize_t iomap_dio_complete(struct iomap_dio *dio); void iomap_dio_bio_end_io(struct bio *bio); #ifdef CONFIG_SWAP struct file; struct swap_info_struct; int iomap_swapfile_activate(struct swap_info_struct *sis, struct file *swap_file, sector_t *pagespan, const struct iomap_ops *ops); #else # define iomap_swapfile_activate(sis, swapfile, pagespan, ops) (-EIO) #endif /* CONFIG_SWAP */ extern struct bio_set iomap_ioend_bioset; #ifdef CONFIG_BLOCK extern const struct iomap_read_ops iomap_bio_read_ops; static inline void iomap_bio_read_folio(struct folio *folio, const struct iomap_ops *ops) { struct iomap_read_folio_ctx ctx = { .ops = &iomap_bio_read_ops, .cur_folio = folio, }; iomap_read_folio(ops, &ctx); } static inline void iomap_bio_readahead(struct readahead_control *rac, const struct iomap_ops *ops) { struct iomap_read_folio_ctx ctx = { .ops = &iomap_bio_read_ops, .rac = rac, }; iomap_readahead(ops, &ctx); } #endif /* CONFIG_BLOCK */ #endif /* LINUX_IOMAP_H */
7 27 28 4216 530 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_MNT_IDMAPPING_H #define _LINUX_MNT_IDMAPPING_H #include <linux/types.h> #include <linux/uidgid.h> struct mnt_idmap; struct user_namespace; extern struct mnt_idmap nop_mnt_idmap; extern struct mnt_idmap invalid_mnt_idmap; extern struct user_namespace init_user_ns; typedef struct { uid_t val; } vfsuid_t; typedef struct { gid_t val; } vfsgid_t; static_assert(sizeof(vfsuid_t) == sizeof(kuid_t)); static_assert(sizeof(vfsgid_t) == sizeof(kgid_t)); static_assert(offsetof(vfsuid_t, val) == offsetof(kuid_t, val)); static_assert(offsetof(vfsgid_t, val) == offsetof(kgid_t, val)); static inline bool is_valid_mnt_idmap(const struct mnt_idmap *idmap) { return idmap != &nop_mnt_idmap && idmap != &invalid_mnt_idmap; } #ifdef CONFIG_MULTIUSER static inline uid_t __vfsuid_val(vfsuid_t uid) { return uid.val; } static inline gid_t __vfsgid_val(vfsgid_t gid) { return gid.val; } #else static inline uid_t __vfsuid_val(vfsuid_t uid) { return 0; } static inline gid_t __vfsgid_val(vfsgid_t gid) { return 0; } #endif static inline bool vfsuid_valid(vfsuid_t uid) { return __vfsuid_val(uid) != (uid_t)-1; } static inline bool vfsgid_valid(vfsgid_t gid) { return __vfsgid_val(gid) != (gid_t)-1; } static inline bool vfsuid_eq(vfsuid_t left, vfsuid_t right) { return vfsuid_valid(left) && __vfsuid_val(left) == __vfsuid_val(right); } static inline bool vfsgid_eq(vfsgid_t left, vfsgid_t right) { return vfsgid_valid(left) && __vfsgid_val(left) == __vfsgid_val(right); } /** * vfsuid_eq_kuid - check whether kuid and vfsuid have the same value * @vfsuid: the vfsuid to compare * @kuid: the kuid to compare * * Check whether @vfsuid and @kuid have the same values. * * Return: true if @vfsuid and @kuid have the same value, false if not. * Comparison between two invalid uids returns false. */ static inline bool vfsuid_eq_kuid(vfsuid_t vfsuid, kuid_t kuid) { return vfsuid_valid(vfsuid) && __vfsuid_val(vfsuid) == __kuid_val(kuid); } /** * vfsgid_eq_kgid - check whether kgid and vfsgid have the same value * @vfsgid: the vfsgid to compare * @kgid: the kgid to compare * * Check whether @vfsgid and @kgid have the same values. * * Return: true if @vfsgid and @kgid have the same value, false if not. * Comparison between two invalid gids returns false. */ static inline bool vfsgid_eq_kgid(vfsgid_t vfsgid, kgid_t kgid) { return vfsgid_valid(vfsgid) && __vfsgid_val(vfsgid) == __kgid_val(kgid); } /* * vfs{g,u}ids are created from k{g,u}ids. * We don't allow them to be created from regular {u,g}id. */ #define VFSUIDT_INIT(val) (vfsuid_t){ __kuid_val(val) } #define VFSGIDT_INIT(val) (vfsgid_t){ __kgid_val(val) } #define INVALID_VFSUID VFSUIDT_INIT(INVALID_UID) #define INVALID_VFSGID VFSGIDT_INIT(INVALID_GID) /* * Allow a vfs{g,u}id to be used as a k{g,u}id where we want to compare * whether the mapped value is identical to value of a k{g,u}id. */ #define AS_KUIDT(val) (kuid_t){ __vfsuid_val(val) } #define AS_KGIDT(val) (kgid_t){ __vfsgid_val(val) } int vfsgid_in_group_p(vfsgid_t vfsgid); struct mnt_idmap *mnt_idmap_get(struct mnt_idmap *idmap); void mnt_idmap_put(struct mnt_idmap *idmap); vfsuid_t make_vfsuid(struct mnt_idmap *idmap, struct user_namespace *fs_userns, kuid_t kuid); vfsgid_t make_vfsgid(struct mnt_idmap *idmap, struct user_namespace *fs_userns, kgid_t kgid); kuid_t from_vfsuid(struct mnt_idmap *idmap, struct user_namespace *fs_userns, vfsuid_t vfsuid); kgid_t from_vfsgid(struct mnt_idmap *idmap, struct user_namespace *fs_userns, vfsgid_t vfsgid); /** * vfsuid_has_fsmapping - check whether a vfsuid maps into the filesystem * @idmap: the mount's idmapping * @fs_userns: the filesystem's idmapping * @vfsuid: vfsuid to be mapped * * Check whether @vfsuid has a mapping in the filesystem idmapping. Use this * function to check whether the filesystem idmapping has a mapping for * @vfsuid. * * Return: true if @vfsuid has a mapping in the filesystem, false if not. */ static inline bool vfsuid_has_fsmapping(struct mnt_idmap *idmap, struct user_namespace *fs_userns, vfsuid_t vfsuid) { return uid_valid(from_vfsuid(idmap, fs_userns, vfsuid)); } static inline bool vfsuid_has_mapping(struct user_namespace *userns, vfsuid_t vfsuid) { return from_kuid(userns, AS_KUIDT(vfsuid)) != (uid_t)-1; } /** * vfsuid_into_kuid - convert vfsuid into kuid * @vfsuid: the vfsuid to convert * * This can be used when a vfsuid is committed as a kuid. * * Return: a kuid with the value of @vfsuid */ static inline kuid_t vfsuid_into_kuid(vfsuid_t vfsuid) { return AS_KUIDT(vfsuid); } /** * vfsgid_has_fsmapping - check whether a vfsgid maps into the filesystem * @idmap: the mount's idmapping * @fs_userns: the filesystem's idmapping * @vfsgid: vfsgid to be mapped * * Check whether @vfsgid has a mapping in the filesystem idmapping. Use this * function to check whether the filesystem idmapping has a mapping for * @vfsgid. * * Return: true if @vfsgid has a mapping in the filesystem, false if not. */ static inline bool vfsgid_has_fsmapping(struct mnt_idmap *idmap, struct user_namespace *fs_userns, vfsgid_t vfsgid) { return gid_valid(from_vfsgid(idmap, fs_userns, vfsgid)); } static inline bool vfsgid_has_mapping(struct user_namespace *userns, vfsgid_t vfsgid) { return from_kgid(userns, AS_KGIDT(vfsgid)) != (gid_t)-1; } /** * vfsgid_into_kgid - convert vfsgid into kgid * @vfsgid: the vfsgid to convert * * This can be used when a vfsgid is committed as a kgid. * * Return: a kgid with the value of @vfsgid */ static inline kgid_t vfsgid_into_kgid(vfsgid_t vfsgid) { return AS_KGIDT(vfsgid); } /** * mapped_fsuid - return caller's fsuid mapped according to an idmapping * @idmap: the mount's idmapping * @fs_userns: the filesystem's idmapping * * Use this helper to initialize a new vfs or filesystem object based on * the caller's fsuid. A common example is initializing the i_uid field of * a newly allocated inode triggered by a creation event such as mkdir or * O_CREAT. Other examples include the allocation of quotas for a specific * user. * * Return: the caller's current fsuid mapped up according to @idmap. */ static inline kuid_t mapped_fsuid(struct mnt_idmap *idmap, struct user_namespace *fs_userns) { return from_vfsuid(idmap, fs_userns, VFSUIDT_INIT(current_fsuid())); } /** * mapped_fsgid - return caller's fsgid mapped according to an idmapping * @idmap: the mount's idmapping * @fs_userns: the filesystem's idmapping * * Use this helper to initialize a new vfs or filesystem object based on * the caller's fsgid. A common example is initializing the i_gid field of * a newly allocated inode triggered by a creation event such as mkdir or * O_CREAT. Other examples include the allocation of quotas for a specific * user. * * Return: the caller's current fsgid mapped up according to @idmap. */ static inline kgid_t mapped_fsgid(struct mnt_idmap *idmap, struct user_namespace *fs_userns) { return from_vfsgid(idmap, fs_userns, VFSGIDT_INIT(current_fsgid())); } #endif /* _LINUX_MNT_IDMAPPING_H */
979 2 2 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 // SPDX-License-Identifier: GPL-2.0 /* Copyright 2011-2014 Autronica Fire and Security AS * * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se * * Frame handler other utility functions for HSR and PRP. */ #include "hsr_slave.h" #include <linux/etherdevice.h> #include <linux/if_arp.h> #include <linux/if_vlan.h> #include "hsr_main.h" #include "hsr_device.h" #include "hsr_forward.h" #include "hsr_framereg.h" bool hsr_invalid_dan_ingress_frame(__be16 protocol) { return (protocol != htons(ETH_P_PRP) && protocol != htons(ETH_P_HSR)); } static rx_handler_result_t hsr_handle_frame(struct sk_buff **pskb) { struct sk_buff *skb = *pskb; struct hsr_port *port; struct hsr_priv *hsr; __be16 protocol; /* Packets from dev_loopback_xmit() do not have L2 header, bail out */ if (unlikely(skb->pkt_type == PACKET_LOOPBACK)) return RX_HANDLER_PASS; if (!skb_mac_header_was_set(skb)) { WARN_ONCE(1, "%s: skb invalid", __func__); return RX_HANDLER_PASS; } port = hsr_port_get_rcu(skb->dev); if (!port) goto finish_pass; hsr = port->hsr; if (hsr_addr_is_self(port->hsr, eth_hdr(skb)->h_source)) { /* Directly kill frames sent by ourselves */ kfree_skb(skb); goto finish_consume; } /* For HSR, only tagged frames are expected (unless the device offloads * HSR tag removal), but for PRP there could be non tagged frames as * well from Single attached nodes (SANs). */ protocol = eth_hdr(skb)->h_proto; if (!(port->dev->features & NETIF_F_HW_HSR_TAG_RM) && port->type != HSR_PT_INTERLINK && hsr->proto_ops->invalid_dan_ingress_frame && hsr->proto_ops->invalid_dan_ingress_frame(protocol)) goto finish_pass; skb_push(skb, ETH_HLEN); skb_reset_mac_header(skb); if ((!hsr->prot_version && protocol == htons(ETH_P_PRP)) || protocol == htons(ETH_P_HSR)) { if (!pskb_may_pull(skb, ETH_HLEN + HSR_HLEN)) { kfree_skb(skb); goto finish_consume; } skb_set_network_header(skb, ETH_HLEN + HSR_HLEN); } skb_reset_mac_len(skb); /* Only the frames received over the interlink port will assign a * sequence number and require synchronisation vs other sender. */ if (port->type == HSR_PT_INTERLINK) { spin_lock_bh(&hsr->seqnr_lock); hsr_forward_skb(skb, port); spin_unlock_bh(&hsr->seqnr_lock); } else { hsr_forward_skb(skb, port); } finish_consume: return RX_HANDLER_CONSUMED; finish_pass: return RX_HANDLER_PASS; } bool hsr_port_exists(const struct net_device *dev) { return rcu_access_pointer(dev->rx_handler) == hsr_handle_frame; } static int hsr_check_dev_ok(struct net_device *dev, struct netlink_ext_ack *extack) { /* Don't allow HSR on non-ethernet like devices */ if ((dev->flags & IFF_LOOPBACK) || dev->type != ARPHRD_ETHER || dev->addr_len != ETH_ALEN) { NL_SET_ERR_MSG_MOD(extack, "Cannot use loopback or non-ethernet device as HSR slave."); return -EINVAL; } /* Don't allow enslaving hsr devices */ if (is_hsr_master(dev)) { NL_SET_ERR_MSG_MOD(extack, "Cannot create trees of HSR devices."); return -EINVAL; } if (hsr_port_exists(dev)) { NL_SET_ERR_MSG_MOD(extack, "This device is already a HSR slave."); return -EINVAL; } if (is_vlan_dev(dev)) { NL_SET_ERR_MSG_MOD(extack, "HSR on top of VLAN is not yet supported in this driver."); return -EINVAL; } if (dev->priv_flags & IFF_DONT_BRIDGE) { NL_SET_ERR_MSG_MOD(extack, "This device does not support bridging."); return -EOPNOTSUPP; } /* HSR over bonded devices has not been tested, but I'm not sure it * won't work... */ return 0; } /* Setup device to be added to the HSR bridge. */ static int hsr_portdev_setup(struct hsr_priv *hsr, struct net_device *dev, struct hsr_port *port, struct netlink_ext_ack *extack) { struct netdev_lag_upper_info lag_upper_info; struct net_device *hsr_dev; struct hsr_port *master; int res; /* Don't use promiscuous mode for offload since L2 frame forward * happens at the offloaded hardware. */ if (!port->hsr->fwd_offloaded) { res = dev_set_promiscuity(dev, 1); if (res) return res; } master = hsr_port_get_hsr(hsr, HSR_PT_MASTER); hsr_dev = master->dev; lag_upper_info.tx_type = NETDEV_LAG_TX_TYPE_BROADCAST; lag_upper_info.hash_type = NETDEV_LAG_HASH_UNKNOWN; res = netdev_master_upper_dev_link(dev, hsr_dev, NULL, &lag_upper_info, extack); if (res) goto fail_upper_dev_link; res = netdev_rx_handler_register(dev, hsr_handle_frame, port); if (res) goto fail_rx_handler; dev_disable_lro(dev); return 0; fail_rx_handler: netdev_upper_dev_unlink(dev, hsr_dev); fail_upper_dev_link: if (!port->hsr->fwd_offloaded) dev_set_promiscuity(dev, -1); return res; } int hsr_add_port(struct hsr_priv *hsr, struct net_device *dev, enum hsr_port_type type, struct netlink_ext_ack *extack) { struct hsr_port *port, *master; int res; if (type != HSR_PT_MASTER) { res = hsr_check_dev_ok(dev, extack); if (res) return res; } port = hsr_port_get_hsr(hsr, type); if (port) return -EBUSY; /* This port already exists */ port = kzalloc(sizeof(*port), GFP_KERNEL); if (!port) return -ENOMEM; port->hsr = hsr; port->dev = dev; port->type = type; ether_addr_copy(port->original_macaddress, dev->dev_addr); list_add_tail_rcu(&port->port_list, &hsr->ports); if (type != HSR_PT_MASTER) { res = hsr_portdev_setup(hsr, dev, port, extack); if (res) goto fail_dev_setup; } master = hsr_port_get_hsr(hsr, HSR_PT_MASTER); netdev_update_features(master->dev); dev_set_mtu(master->dev, hsr_get_max_mtu(hsr)); return 0; fail_dev_setup: list_del_rcu(&port->port_list); kfree_rcu(port, rcu); return res; } void hsr_del_port(struct hsr_port *port) { struct hsr_priv *hsr; struct hsr_port *master; hsr = port->hsr; master = hsr_port_get_hsr(hsr, HSR_PT_MASTER); list_del_rcu(&port->port_list); if (port != master) { netdev_update_features(master->dev); dev_set_mtu(master->dev, hsr_get_max_mtu(hsr)); netdev_rx_handler_unregister(port->dev); if (!port->hsr->fwd_offloaded) dev_set_promiscuity(port->dev, -1); netdev_upper_dev_unlink(port->dev, master->dev); eth_hw_addr_set(port->dev, port->original_macaddress); } kfree_rcu(port, rcu); }
982 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_TIMEX_H #define _ASM_X86_TIMEX_H #include <asm/processor.h> #include <asm/tsc.h> static inline unsigned long random_get_entropy(void) { if (!IS_ENABLED(CONFIG_X86_TSC) && !cpu_feature_enabled(X86_FEATURE_TSC)) return random_get_entropy_fallback(); return rdtsc(); } #define random_get_entropy random_get_entropy /* Assume we use the PIT time source for the clock tick */ #define CLOCK_TICK_RATE PIT_TICK_RATE #define ARCH_HAS_READ_CURRENT_TIMER #endif /* _ASM_X86_TIMEX_H */
3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 /* * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/rculist.h> #include <linux/llist.h> #include "rds_single_path.h" #include "ib_mr.h" #include "rds.h" struct workqueue_struct *rds_ib_mr_wq; static void rds_ib_odp_mr_worker(struct work_struct *work); static struct rds_ib_device *rds_ib_get_device(__be32 ipaddr) { struct rds_ib_device *rds_ibdev; struct rds_ib_ipaddr *i_ipaddr; rcu_read_lock(); list_for_each_entry_rcu(rds_ibdev, &rds_ib_devices, list) { list_for_each_entry_rcu(i_ipaddr, &rds_ibdev->ipaddr_list, list) { if (i_ipaddr->ipaddr == ipaddr) { refcount_inc(&rds_ibdev->refcount); rcu_read_unlock(); return rds_ibdev; } } } rcu_read_unlock(); return NULL; } static int rds_ib_add_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) { struct rds_ib_ipaddr *i_ipaddr; i_ipaddr = kmalloc(sizeof *i_ipaddr, GFP_KERNEL); if (!i_ipaddr) return -ENOMEM; i_ipaddr->ipaddr = ipaddr; spin_lock_irq(&rds_ibdev->spinlock); list_add_tail_rcu(&i_ipaddr->list, &rds_ibdev->ipaddr_list); spin_unlock_irq(&rds_ibdev->spinlock); return 0; } static void rds_ib_remove_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) { struct rds_ib_ipaddr *i_ipaddr; struct rds_ib_ipaddr *to_free = NULL; spin_lock_irq(&rds_ibdev->spinlock); list_for_each_entry_rcu(i_ipaddr, &rds_ibdev->ipaddr_list, list) { if (i_ipaddr->ipaddr == ipaddr) { list_del_rcu(&i_ipaddr->list); to_free = i_ipaddr; break; } } spin_unlock_irq(&rds_ibdev->spinlock); if (to_free) kfree_rcu(to_free, rcu); } int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, struct in6_addr *ipaddr) { struct rds_ib_device *rds_ibdev_old; rds_ibdev_old = rds_ib_get_device(ipaddr->s6_addr32[3]); if (!rds_ibdev_old) return rds_ib_add_ipaddr(rds_ibdev, ipaddr->s6_addr32[3]); if (rds_ibdev_old != rds_ibdev) { rds_ib_remove_ipaddr(rds_ibdev_old, ipaddr->s6_addr32[3]); rds_ib_dev_put(rds_ibdev_old); return rds_ib_add_ipaddr(rds_ibdev, ipaddr->s6_addr32[3]); } rds_ib_dev_put(rds_ibdev_old); return 0; } void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn) { struct rds_ib_connection *ic = conn->c_transport_data; /* conn was previously on the nodev_conns_list */ spin_lock_irq(&ib_nodev_conns_lock); BUG_ON(list_empty(&ib_nodev_conns)); BUG_ON(list_empty(&ic->ib_node)); list_del(&ic->ib_node); spin_lock(&rds_ibdev->spinlock); list_add_tail(&ic->ib_node, &rds_ibdev->conn_list); spin_unlock(&rds_ibdev->spinlock); spin_unlock_irq(&ib_nodev_conns_lock); ic->rds_ibdev = rds_ibdev; refcount_inc(&rds_ibdev->refcount); } void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn) { struct rds_ib_connection *ic = conn->c_transport_data; /* place conn on nodev_conns_list */ spin_lock(&ib_nodev_conns_lock); spin_lock_irq(&rds_ibdev->spinlock); BUG_ON(list_empty(&ic->ib_node)); list_del(&ic->ib_node); spin_unlock_irq(&rds_ibdev->spinlock); list_add_tail(&ic->ib_node, &ib_nodev_conns); spin_unlock(&ib_nodev_conns_lock); ic->rds_ibdev = NULL; rds_ib_dev_put(rds_ibdev); } void rds_ib_destroy_nodev_conns(void) { struct rds_ib_connection *ic, *_ic; LIST_HEAD(tmp_list); /* avoid calling conn_destroy with irqs off */ spin_lock_irq(&ib_nodev_conns_lock); list_splice(&ib_nodev_conns, &tmp_list); spin_unlock_irq(&ib_nodev_conns_lock); list_for_each_entry_safe(ic, _ic, &tmp_list, ib_node) rds_conn_destroy(ic->conn); } void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo) { struct rds_ib_mr_pool *pool_1m = rds_ibdev->mr_1m_pool; iinfo->rdma_mr_max = pool_1m->max_items; iinfo->rdma_mr_size = pool_1m->max_pages; } #if IS_ENABLED(CONFIG_IPV6) void rds6_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds6_info_rdma_connection *iinfo6) { struct rds_ib_mr_pool *pool_1m = rds_ibdev->mr_1m_pool; iinfo6->rdma_mr_max = pool_1m->max_items; iinfo6->rdma_mr_size = pool_1m->max_pages; } #endif struct rds_ib_mr *rds_ib_reuse_mr(struct rds_ib_mr_pool *pool) { struct rds_ib_mr *ibmr = NULL; struct llist_node *ret; unsigned long flags; spin_lock_irqsave(&pool->clean_lock, flags); ret = llist_del_first(&pool->clean_list); spin_unlock_irqrestore(&pool->clean_lock, flags); if (ret) { ibmr = llist_entry(ret, struct rds_ib_mr, llnode); if (pool->pool_type == RDS_IB_MR_8K_POOL) rds_ib_stats_inc(s_ib_rdma_mr_8k_reused); else rds_ib_stats_inc(s_ib_rdma_mr_1m_reused); } return ibmr; } void rds_ib_sync_mr(void *trans_private, int direction) { struct rds_ib_mr *ibmr = trans_private; struct rds_ib_device *rds_ibdev = ibmr->device; if (ibmr->odp) return; switch (direction) { case DMA_FROM_DEVICE: ib_dma_sync_sg_for_cpu(rds_ibdev->dev, ibmr->sg, ibmr->sg_dma_len, DMA_BIDIRECTIONAL); break; case DMA_TO_DEVICE: ib_dma_sync_sg_for_device(rds_ibdev->dev, ibmr->sg, ibmr->sg_dma_len, DMA_BIDIRECTIONAL); break; } } void __rds_ib_teardown_mr(struct rds_ib_mr *ibmr) { struct rds_ib_device *rds_ibdev = ibmr->device; if (ibmr->sg_dma_len) { ib_dma_unmap_sg(rds_ibdev->dev, ibmr->sg, ibmr->sg_len, DMA_BIDIRECTIONAL); ibmr->sg_dma_len = 0; } /* Release the s/g list */ if (ibmr->sg_len) { unsigned int i; for (i = 0; i < ibmr->sg_len; ++i) { struct page *page = sg_page(&ibmr->sg[i]); /* FIXME we need a way to tell a r/w MR * from a r/o MR */ WARN_ON(!page->mapping && irqs_disabled()); set_page_dirty(page); put_page(page); } kfree(ibmr->sg); ibmr->sg = NULL; ibmr->sg_len = 0; } } void rds_ib_teardown_mr(struct rds_ib_mr *ibmr) { unsigned int pinned = ibmr->sg_len; __rds_ib_teardown_mr(ibmr); if (pinned) { struct rds_ib_mr_pool *pool = ibmr->pool; atomic_sub(pinned, &pool->free_pinned); } } static inline unsigned int rds_ib_flush_goal(struct rds_ib_mr_pool *pool, int free_all) { unsigned int item_count; item_count = atomic_read(&pool->item_count); if (free_all) return item_count; return 0; } /* * given an llist of mrs, put them all into the list_head for more processing */ static unsigned int llist_append_to_list(struct llist_head *llist, struct list_head *list) { struct rds_ib_mr *ibmr; struct llist_node *node; struct llist_node *next; unsigned int count = 0; node = llist_del_all(llist); while (node) { next = node->next; ibmr = llist_entry(node, struct rds_ib_mr, llnode); list_add_tail(&ibmr->unmap_list, list); node = next; count++; } return count; } /* * this takes a list head of mrs and turns it into linked llist nodes * of clusters. Each cluster has linked llist nodes of * MR_CLUSTER_SIZE mrs that are ready for reuse. */ static void list_to_llist_nodes(struct list_head *list, struct llist_node **nodes_head, struct llist_node **nodes_tail) { struct rds_ib_mr *ibmr; struct llist_node *cur = NULL; struct llist_node **next = nodes_head; list_for_each_entry(ibmr, list, unmap_list) { cur = &ibmr->llnode; *next = cur; next = &cur->next; } *next = NULL; *nodes_tail = cur; } /* * Flush our pool of MRs. * At a minimum, all currently unused MRs are unmapped. * If the number of MRs allocated exceeds the limit, we also try * to free as many MRs as needed to get back to this limit. */ int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all, struct rds_ib_mr **ibmr_ret) { struct rds_ib_mr *ibmr; struct llist_node *clean_nodes; struct llist_node *clean_tail; LIST_HEAD(unmap_list); unsigned long unpinned = 0; unsigned int nfreed = 0, dirty_to_clean = 0, free_goal; if (pool->pool_type == RDS_IB_MR_8K_POOL) rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_flush); else rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_flush); if (ibmr_ret) { DEFINE_WAIT(wait); while (!mutex_trylock(&pool->flush_lock)) { ibmr = rds_ib_reuse_mr(pool); if (ibmr) { *ibmr_ret = ibmr; finish_wait(&pool->flush_wait, &wait); goto out_nolock; } prepare_to_wait(&pool->flush_wait, &wait, TASK_UNINTERRUPTIBLE); if (llist_empty(&pool->clean_list)) schedule(); ibmr = rds_ib_reuse_mr(pool); if (ibmr) { *ibmr_ret = ibmr; finish_wait(&pool->flush_wait, &wait); goto out_nolock; } } finish_wait(&pool->flush_wait, &wait); } else mutex_lock(&pool->flush_lock); if (ibmr_ret) { ibmr = rds_ib_reuse_mr(pool); if (ibmr) { *ibmr_ret = ibmr; goto out; } } /* Get the list of all MRs to be dropped. Ordering matters - * we want to put drop_list ahead of free_list. */ dirty_to_clean = llist_append_to_list(&pool->drop_list, &unmap_list); dirty_to_clean += llist_append_to_list(&pool->free_list, &unmap_list); if (free_all) { unsigned long flags; spin_lock_irqsave(&pool->clean_lock, flags); llist_append_to_list(&pool->clean_list, &unmap_list); spin_unlock_irqrestore(&pool->clean_lock, flags); } free_goal = rds_ib_flush_goal(pool, free_all); if (list_empty(&unmap_list)) goto out; rds_ib_unreg_frmr(&unmap_list, &nfreed, &unpinned, free_goal); if (!list_empty(&unmap_list)) { unsigned long flags; list_to_llist_nodes(&unmap_list, &clean_nodes, &clean_tail); if (ibmr_ret) { *ibmr_ret = llist_entry(clean_nodes, struct rds_ib_mr, llnode); clean_nodes = clean_nodes->next; } /* more than one entry in llist nodes */ if (clean_nodes) { spin_lock_irqsave(&pool->clean_lock, flags); llist_add_batch(clean_nodes, clean_tail, &pool->clean_list); spin_unlock_irqrestore(&pool->clean_lock, flags); } } atomic_sub(unpinned, &pool->free_pinned); atomic_sub(dirty_to_clean, &pool->dirty_count); atomic_sub(nfreed, &pool->item_count); out: mutex_unlock(&pool->flush_lock); if (waitqueue_active(&pool->flush_wait)) wake_up(&pool->flush_wait); out_nolock: return 0; } struct rds_ib_mr *rds_ib_try_reuse_ibmr(struct rds_ib_mr_pool *pool) { struct rds_ib_mr *ibmr = NULL; int iter = 0; while (1) { ibmr = rds_ib_reuse_mr(pool); if (ibmr) return ibmr; if (atomic_inc_return(&pool->item_count) <= pool->max_items) break; atomic_dec(&pool->item_count); if (++iter > 2) { if (pool->pool_type == RDS_IB_MR_8K_POOL) rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_depleted); else rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_depleted); break; } /* We do have some empty MRs. Flush them out. */ if (pool->pool_type == RDS_IB_MR_8K_POOL) rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_wait); else rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_wait); rds_ib_flush_mr_pool(pool, 0, &ibmr); if (ibmr) return ibmr; } return NULL; } static void rds_ib_mr_pool_flush_worker(struct work_struct *work) { struct rds_ib_mr_pool *pool = container_of(work, struct rds_ib_mr_pool, flush_worker.work); rds_ib_flush_mr_pool(pool, 0, NULL); } void rds_ib_free_mr(void *trans_private, int invalidate) { struct rds_ib_mr *ibmr = trans_private; struct rds_ib_mr_pool *pool = ibmr->pool; struct rds_ib_device *rds_ibdev = ibmr->device; rdsdebug("RDS/IB: free_mr nents %u\n", ibmr->sg_len); if (ibmr->odp) { /* A MR created and marked as use_once. We use delayed work, * because there is a change that we are in interrupt and can't * call to ib_dereg_mr() directly. */ INIT_DELAYED_WORK(&ibmr->work, rds_ib_odp_mr_worker); queue_delayed_work(rds_ib_mr_wq, &ibmr->work, 0); return; } /* Return it to the pool's free list */ rds_ib_free_frmr_list(ibmr); atomic_add(ibmr->sg_len, &pool->free_pinned); atomic_inc(&pool->dirty_count); /* If we've pinned too many pages, request a flush */ if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned || atomic_read(&pool->dirty_count) >= pool->max_items / 5) queue_delayed_work(rds_ib_mr_wq, &pool->flush_worker, 10); if (invalidate) { if (likely(!in_interrupt())) { rds_ib_flush_mr_pool(pool, 0, NULL); } else { /* We get here if the user created a MR marked * as use_once and invalidate at the same time. */ queue_delayed_work(rds_ib_mr_wq, &pool->flush_worker, 10); } } rds_ib_dev_put(rds_ibdev); } void rds_ib_flush_mrs(void) { struct rds_ib_device *rds_ibdev; down_read(&rds_ib_devices_lock); list_for_each_entry(rds_ibdev, &rds_ib_devices, list) { if (rds_ibdev->mr_8k_pool) rds_ib_flush_mr_pool(rds_ibdev->mr_8k_pool, 0, NULL); if (rds_ibdev->mr_1m_pool) rds_ib_flush_mr_pool(rds_ibdev->mr_1m_pool, 0, NULL); } up_read(&rds_ib_devices_lock); } u32 rds_ib_get_lkey(void *trans_private) { struct rds_ib_mr *ibmr = trans_private; return ibmr->u.mr->lkey; } void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, struct rds_sock *rs, u32 *key_ret, struct rds_connection *conn, u64 start, u64 length, int need_odp) { struct rds_ib_device *rds_ibdev; struct rds_ib_mr *ibmr = NULL; struct rds_ib_connection *ic = NULL; int ret; rds_ibdev = rds_ib_get_device(rs->rs_bound_addr.s6_addr32[3]); if (!rds_ibdev) { ret = -ENODEV; goto out; } if (need_odp == ODP_ZEROBASED || need_odp == ODP_VIRTUAL) { u64 virt_addr = need_odp == ODP_ZEROBASED ? 0 : start; int access_flags = (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ | IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_ATOMIC | IB_ACCESS_ON_DEMAND); struct ib_sge sge = {}; struct ib_mr *ib_mr; if (!rds_ibdev->odp_capable) { ret = -EOPNOTSUPP; goto out; } ib_mr = ib_reg_user_mr(rds_ibdev->pd, start, length, virt_addr, access_flags); if (IS_ERR(ib_mr)) { rdsdebug("rds_ib_get_user_mr returned %d\n", IS_ERR(ib_mr)); ret = PTR_ERR(ib_mr); goto out; } if (key_ret) *key_ret = ib_mr->rkey; ibmr = kzalloc(sizeof(*ibmr), GFP_KERNEL); if (!ibmr) { ib_dereg_mr(ib_mr); ret = -ENOMEM; goto out; } ibmr->u.mr = ib_mr; ibmr->odp = 1; sge.addr = virt_addr; sge.length = length; sge.lkey = ib_mr->lkey; ib_advise_mr(rds_ibdev->pd, IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE, IB_UVERBS_ADVISE_MR_FLAG_FLUSH, &sge, 1); return ibmr; } if (conn) ic = conn->c_transport_data; if (!rds_ibdev->mr_8k_pool || !rds_ibdev->mr_1m_pool) { ret = -ENODEV; goto out; } ibmr = rds_ib_reg_frmr(rds_ibdev, ic, sg, nents, key_ret); if (IS_ERR(ibmr)) { ret = PTR_ERR(ibmr); pr_warn("RDS/IB: rds_ib_get_mr failed (errno=%d)\n", ret); } else { return ibmr; } out: if (rds_ibdev) rds_ib_dev_put(rds_ibdev); return ERR_PTR(ret); } void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool) { cancel_delayed_work_sync(&pool->flush_worker); rds_ib_flush_mr_pool(pool, 1, NULL); WARN_ON(atomic_read(&pool->item_count)); WARN_ON(atomic_read(&pool->free_pinned)); kfree(pool); } struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev, int pool_type) { struct rds_ib_mr_pool *pool; pool = kzalloc(sizeof(*pool), GFP_KERNEL); if (!pool) return ERR_PTR(-ENOMEM); pool->pool_type = pool_type; init_llist_head(&pool->free_list); init_llist_head(&pool->drop_list); init_llist_head(&pool->clean_list); spin_lock_init(&pool->clean_lock); mutex_init(&pool->flush_lock); init_waitqueue_head(&pool->flush_wait); INIT_DELAYED_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker); if (pool_type == RDS_IB_MR_1M_POOL) { /* +1 allows for unaligned MRs */ pool->max_pages = RDS_MR_1M_MSG_SIZE + 1; pool->max_items = rds_ibdev->max_1m_mrs; } else { /* pool_type == RDS_IB_MR_8K_POOL */ pool->max_pages = RDS_MR_8K_MSG_SIZE + 1; pool->max_items = rds_ibdev->max_8k_mrs; } pool->max_free_pinned = pool->max_items * pool->max_pages / 4; pool->max_items_soft = rds_ibdev->max_mrs * 3 / 4; return pool; } int rds_ib_mr_init(void) { rds_ib_mr_wq = alloc_workqueue("rds_mr_flushd", WQ_MEM_RECLAIM | WQ_PERCPU, 0); if (!rds_ib_mr_wq) return -ENOMEM; return 0; } /* By the time this is called all the IB devices should have been torn down and * had their pools freed. As each pool is freed its work struct is waited on, * so the pool flushing work queue should be idle by the time we get here. */ void rds_ib_mr_exit(void) { destroy_workqueue(rds_ib_mr_wq); } static void rds_ib_odp_mr_worker(struct work_struct *work) { struct rds_ib_mr *ibmr; ibmr = container_of(work, struct rds_ib_mr, work.work); ib_dereg_mr(ibmr->u.mr); kfree(ibmr); }
7 7 7 2 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 // SPDX-License-Identifier: GPL-2.0-only /* * mac80211 ethtool hooks for cfg80211 * * Copied from cfg.c - originally * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2014 Intel Corporation (Author: Johannes Berg) * Copyright (C) 2018, 2022-2023 Intel Corporation */ #include <linux/types.h> #include <net/cfg80211.h> #include "ieee80211_i.h" #include "sta_info.h" #include "driver-ops.h" static int ieee80211_set_ringparam(struct net_device *dev, struct ethtool_ringparam *rp, struct kernel_ethtool_ringparam *kernel_rp, struct netlink_ext_ack *extack) { struct ieee80211_local *local = wiphy_priv(dev->ieee80211_ptr->wiphy); if (rp->rx_mini_pending != 0 || rp->rx_jumbo_pending != 0) return -EINVAL; guard(wiphy)(local->hw.wiphy); return drv_set_ringparam(local, rp->tx_pending, rp->rx_pending); } static void ieee80211_get_ringparam(struct net_device *dev, struct ethtool_ringparam *rp, struct kernel_ethtool_ringparam *kernel_rp, struct netlink_ext_ack *extack) { struct ieee80211_local *local = wiphy_priv(dev->ieee80211_ptr->wiphy); memset(rp, 0, sizeof(*rp)); guard(wiphy)(local->hw.wiphy); drv_get_ringparam(local, &rp->tx_pending, &rp->tx_max_pending, &rp->rx_pending, &rp->rx_max_pending); } static const char ieee80211_gstrings_sta_stats[][ETH_GSTRING_LEN] = { "rx_packets", "rx_bytes", "rx_duplicates", "rx_fragments", "rx_dropped", "tx_packets", "tx_bytes", "tx_filtered", "tx_retry_failed", "tx_retries", "tx_handlers_drop", "sta_state", "txrate", "rxrate", "signal", "channel", "noise", "ch_time", "ch_time_busy", "ch_time_ext_busy", "ch_time_rx", "ch_time_tx" }; #define STA_STATS_LEN ARRAY_SIZE(ieee80211_gstrings_sta_stats) static int ieee80211_get_sset_count(struct net_device *dev, int sset) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); int rv = 0; if (sset == ETH_SS_STATS) rv += STA_STATS_LEN; rv += drv_get_et_sset_count(sdata, sset); if (rv == 0) return -EOPNOTSUPP; return rv; } static void ieee80211_get_stats(struct net_device *dev, struct ethtool_stats *stats, u64 *data) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_chanctx_conf *chanctx_conf; struct ieee80211_channel *channel; struct sta_info *sta; struct ieee80211_local *local = sdata->local; struct station_info sinfo; struct survey_info survey; int i, q; #define STA_STATS_SURVEY_LEN 7 memset(data, 0, sizeof(u64) * STA_STATS_LEN); #define ADD_STA_STATS(sta) \ do { \ data[i++] += sinfo.rx_packets; \ data[i++] += sinfo.rx_bytes; \ data[i++] += (sta)->rx_stats.num_duplicates; \ data[i++] += (sta)->rx_stats.fragments; \ data[i++] += sinfo.rx_dropped_misc; \ \ data[i++] += sinfo.tx_packets; \ data[i++] += sinfo.tx_bytes; \ data[i++] += (sta)->status_stats.filtered; \ data[i++] += sinfo.tx_failed; \ data[i++] += sinfo.tx_retries; \ } while (0) /* For Managed stations, find the single station based on BSSID * and use that. For interface types, iterate through all available * stations and add stats for any station that is assigned to this * network device. */ guard(wiphy)(local->hw.wiphy); if (sdata->vif.type == NL80211_IFTYPE_STATION) { sta = sta_info_get_bss(sdata, sdata->deflink.u.mgd.bssid); if (!(sta && !WARN_ON(sta->sdata->dev != dev))) goto do_survey; memset(&sinfo, 0, sizeof(sinfo)); sta_set_sinfo(sta, &sinfo, false); i = 0; ADD_STA_STATS(&sta->deflink); data[i++] = sdata->tx_handlers_drop; data[i++] = sta->sta_state; if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_TX_BITRATE)) data[i] = 100000ULL * cfg80211_calculate_bitrate(&sinfo.txrate); i++; if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_RX_BITRATE)) data[i] = 100000ULL * cfg80211_calculate_bitrate(&sinfo.rxrate); i++; if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_SIGNAL_AVG)) data[i] = (u8)sinfo.signal_avg; i++; } else { list_for_each_entry(sta, &local->sta_list, list) { /* Make sure this station belongs to the proper dev */ if (sta->sdata->dev != dev) continue; memset(&sinfo, 0, sizeof(sinfo)); sta_set_sinfo(sta, &sinfo, false); i = 0; ADD_STA_STATS(&sta->deflink); data[i++] = sdata->tx_handlers_drop; } } do_survey: i = STA_STATS_LEN - STA_STATS_SURVEY_LEN; /* Get survey stats for current channel */ survey.filled = 0; rcu_read_lock(); chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf); if (chanctx_conf) channel = chanctx_conf->def.chan; else if (local->open_count > 0 && local->open_count == local->virt_monitors && sdata->vif.type == NL80211_IFTYPE_MONITOR) channel = local->monitor_chanreq.oper.chan; else channel = NULL; rcu_read_unlock(); if (channel) { q = 0; do { survey.filled = 0; if (drv_get_survey(local, q, &survey) != 0) { survey.filled = 0; break; } q++; } while (channel != survey.channel); } if (survey.filled) data[i++] = survey.channel->center_freq; else data[i++] = 0; if (survey.filled & SURVEY_INFO_NOISE_DBM) data[i++] = (u8)survey.noise; else data[i++] = -1LL; if (survey.filled & SURVEY_INFO_TIME) data[i++] = survey.time; else data[i++] = -1LL; if (survey.filled & SURVEY_INFO_TIME_BUSY) data[i++] = survey.time_busy; else data[i++] = -1LL; if (survey.filled & SURVEY_INFO_TIME_EXT_BUSY) data[i++] = survey.time_ext_busy; else data[i++] = -1LL; if (survey.filled & SURVEY_INFO_TIME_RX) data[i++] = survey.time_rx; else data[i++] = -1LL; if (survey.filled & SURVEY_INFO_TIME_TX) data[i++] = survey.time_tx; else data[i++] = -1LL; if (WARN_ON(i != STA_STATS_LEN)) return; drv_get_et_stats(sdata, stats, &(data[STA_STATS_LEN])); } static void ieee80211_get_strings(struct net_device *dev, u32 sset, u8 *data) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); int sz_sta_stats = 0; if (sset == ETH_SS_STATS) { sz_sta_stats = sizeof(ieee80211_gstrings_sta_stats); memcpy(data, ieee80211_gstrings_sta_stats, sz_sta_stats); } drv_get_et_strings(sdata, sset, &(data[sz_sta_stats])); } static int ieee80211_get_regs_len(struct net_device *dev) { return 0; } static void ieee80211_get_regs(struct net_device *dev, struct ethtool_regs *regs, void *data) { struct wireless_dev *wdev = dev->ieee80211_ptr; regs->version = wdev->wiphy->hw_version; regs->len = 0; } const struct ethtool_ops ieee80211_ethtool_ops = { .get_drvinfo = cfg80211_get_drvinfo, .get_regs_len = ieee80211_get_regs_len, .get_regs = ieee80211_get_regs, .get_link = ethtool_op_get_link, .get_ringparam = ieee80211_get_ringparam, .set_ringparam = ieee80211_set_ringparam, .get_strings = ieee80211_get_strings, .get_ethtool_stats = ieee80211_get_stats, .get_sset_count = ieee80211_get_sset_count, };
2 2 2 2 2 2 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 // SPDX-License-Identifier: GPL-2.0-or-later /* * Information interface for ALSA driver * Copyright (c) by Jaroslav Kysela <perex@perex.cz> */ #include <linux/slab.h> #include <linux/time.h> #include <linux/string.h> #include <linux/export.h> #include <sound/core.h> #include <sound/minors.h> #include <sound/info.h> #include <linux/utsname.h> #include <linux/mutex.h> /* * OSS compatible part */ static DEFINE_MUTEX(strings); static char *snd_sndstat_strings[SNDRV_CARDS][SNDRV_OSS_INFO_DEV_COUNT]; int snd_oss_info_register(int dev, int num, char *string) { char *x; if (snd_BUG_ON(dev < 0 || dev >= SNDRV_OSS_INFO_DEV_COUNT)) return -ENXIO; if (snd_BUG_ON(num < 0 || num >= SNDRV_CARDS)) return -ENXIO; guard(mutex)(&strings); if (string == NULL) { x = snd_sndstat_strings[num][dev]; kfree(x); x = NULL; } else { x = kstrdup(string, GFP_KERNEL); if (x == NULL) return -ENOMEM; } snd_sndstat_strings[num][dev] = x; return 0; } EXPORT_SYMBOL(snd_oss_info_register); static int snd_sndstat_show_strings(struct snd_info_buffer *buf, char *id, int dev) { int idx, ok = -1; char *str; snd_iprintf(buf, "\n%s:", id); guard(mutex)(&strings); for (idx = 0; idx < SNDRV_CARDS; idx++) { str = snd_sndstat_strings[idx][dev]; if (str) { if (ok < 0) { snd_iprintf(buf, "\n"); ok++; } snd_iprintf(buf, "%i: %s\n", idx, str); } } if (ok < 0) snd_iprintf(buf, " NOT ENABLED IN CONFIG\n"); return ok; } static void snd_sndstat_proc_read(struct snd_info_entry *entry, struct snd_info_buffer *buffer) { snd_iprintf(buffer, "Sound Driver:3.8.1a-980706 (ALSA emulation code)\n"); snd_iprintf(buffer, "Kernel: %s %s %s %s %s\n", init_utsname()->sysname, init_utsname()->nodename, init_utsname()->release, init_utsname()->version, init_utsname()->machine); snd_iprintf(buffer, "Config options: 0\n"); snd_iprintf(buffer, "\nInstalled drivers: \n"); snd_iprintf(buffer, "Type 10: ALSA emulation\n"); snd_iprintf(buffer, "\nCard config: \n"); snd_card_info_read_oss(buffer); snd_sndstat_show_strings(buffer, "Audio devices", SNDRV_OSS_INFO_DEV_AUDIO); snd_sndstat_show_strings(buffer, "Synth devices", SNDRV_OSS_INFO_DEV_SYNTH); snd_sndstat_show_strings(buffer, "Midi devices", SNDRV_OSS_INFO_DEV_MIDI); snd_sndstat_show_strings(buffer, "Timers", SNDRV_OSS_INFO_DEV_TIMERS); snd_sndstat_show_strings(buffer, "Mixers", SNDRV_OSS_INFO_DEV_MIXERS); } int __init snd_info_minor_register(void) { struct snd_info_entry *entry; memset(snd_sndstat_strings, 0, sizeof(snd_sndstat_strings)); entry = snd_info_create_module_entry(THIS_MODULE, "sndstat", snd_oss_root); if (!entry) return -ENOMEM; entry->c.text.read = snd_sndstat_proc_read; return snd_info_register(entry); /* freed in error path */ }
2 2 2 13 13 2 5 10 13 13 13 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 // SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 /****************************************************************************** * * Module Name: utalloc - local memory allocation routines * * Copyright (C) 2000 - 2025, Intel Corp. * *****************************************************************************/ #include <acpi/acpi.h> #include "accommon.h" #include "acdebug.h" #define _COMPONENT ACPI_UTILITIES ACPI_MODULE_NAME("utalloc") #if !defined (USE_NATIVE_ALLOCATE_ZEROED) /******************************************************************************* * * FUNCTION: acpi_os_allocate_zeroed * * PARAMETERS: size - Size of the allocation * * RETURN: Address of the allocated memory on success, NULL on failure. * * DESCRIPTION: Subsystem equivalent of calloc. Allocate and zero memory. * This is the default implementation. Can be overridden via the * USE_NATIVE_ALLOCATE_ZEROED flag. * ******************************************************************************/ void *acpi_os_allocate_zeroed(acpi_size size) { void *allocation; ACPI_FUNCTION_ENTRY(); allocation = acpi_os_allocate(size); if (allocation) { /* Clear the memory block */ memset(allocation, 0, size); } return (allocation); } #endif /* !USE_NATIVE_ALLOCATE_ZEROED */ /******************************************************************************* * * FUNCTION: acpi_ut_create_caches * * PARAMETERS: None * * RETURN: Status * * DESCRIPTION: Create all local caches * ******************************************************************************/ acpi_status acpi_ut_create_caches(void) { acpi_status status; /* Object Caches, for frequently used objects */ status = acpi_os_create_cache("Acpi-Namespace", sizeof(struct acpi_namespace_node), ACPI_MAX_NAMESPACE_CACHE_DEPTH, &acpi_gbl_namespace_cache); if (ACPI_FAILURE(status)) { return (status); } status = acpi_os_create_cache("Acpi-State", sizeof(union acpi_generic_state), ACPI_MAX_STATE_CACHE_DEPTH, &acpi_gbl_state_cache); if (ACPI_FAILURE(status)) { return (status); } status = acpi_os_create_cache("Acpi-Parse", sizeof(struct acpi_parse_obj_common), ACPI_MAX_PARSE_CACHE_DEPTH, &acpi_gbl_ps_node_cache); if (ACPI_FAILURE(status)) { return (status); } status = acpi_os_create_cache("Acpi-ParseExt", sizeof(struct acpi_parse_obj_named), ACPI_MAX_EXTPARSE_CACHE_DEPTH, &acpi_gbl_ps_node_ext_cache); if (ACPI_FAILURE(status)) { return (status); } status = acpi_os_create_cache("Acpi-Operand", sizeof(union acpi_operand_object), ACPI_MAX_OBJECT_CACHE_DEPTH, &acpi_gbl_operand_cache); if (ACPI_FAILURE(status)) { return (status); } #ifdef ACPI_ASL_COMPILER /* * For use with the ASL-/ASL+ option. This cache keeps track of regular * 0xA9 0x01 comments. */ status = acpi_os_create_cache("Acpi-Comment", sizeof(struct acpi_comment_node), ACPI_MAX_COMMENT_CACHE_DEPTH, &acpi_gbl_reg_comment_cache); if (ACPI_FAILURE(status)) { return (status); } /* * This cache keeps track of the starting addresses of where the comments * lie. This helps prevent duplication of comments. */ status = acpi_os_create_cache("Acpi-Comment-Addr", sizeof(struct acpi_comment_addr_node), ACPI_MAX_COMMENT_CACHE_DEPTH, &acpi_gbl_comment_addr_cache); if (ACPI_FAILURE(status)) { return (status); } /* * This cache will be used for nodes that represent files. */ status = acpi_os_create_cache("Acpi-File", sizeof(struct acpi_file_node), ACPI_MAX_COMMENT_CACHE_DEPTH, &acpi_gbl_file_cache); if (ACPI_FAILURE(status)) { return (status); } #endif #ifdef ACPI_DBG_TRACK_ALLOCATIONS /* Memory allocation lists */ status = acpi_ut_create_list("Acpi-Global", 0, &acpi_gbl_global_list); if (ACPI_FAILURE(status)) { return (status); } status = acpi_ut_create_list("Acpi-Namespace", sizeof(struct acpi_namespace_node), &acpi_gbl_ns_node_list); if (ACPI_FAILURE(status)) { return (status); } #endif return (AE_OK); } /******************************************************************************* * * FUNCTION: acpi_ut_delete_caches * * PARAMETERS: None * * RETURN: Status * * DESCRIPTION: Purge and delete all local caches * ******************************************************************************/ acpi_status acpi_ut_delete_caches(void) { #ifdef ACPI_DBG_TRACK_ALLOCATIONS char buffer[7]; if (acpi_gbl_display_final_mem_stats) { strcpy(buffer, "MEMORY"); (void)acpi_db_display_statistics(buffer); } #endif (void)acpi_os_delete_cache(acpi_gbl_namespace_cache); acpi_gbl_namespace_cache = NULL; (void)acpi_os_delete_cache(acpi_gbl_state_cache); acpi_gbl_state_cache = NULL; (void)acpi_os_delete_cache(acpi_gbl_operand_cache); acpi_gbl_operand_cache = NULL; (void)acpi_os_delete_cache(acpi_gbl_ps_node_cache); acpi_gbl_ps_node_cache = NULL; (void)acpi_os_delete_cache(acpi_gbl_ps_node_ext_cache); acpi_gbl_ps_node_ext_cache = NULL; #ifdef ACPI_ASL_COMPILER (void)acpi_os_delete_cache(acpi_gbl_reg_comment_cache); acpi_gbl_reg_comment_cache = NULL; (void)acpi_os_delete_cache(acpi_gbl_comment_addr_cache); acpi_gbl_comment_addr_cache = NULL; (void)acpi_os_delete_cache(acpi_gbl_file_cache); acpi_gbl_file_cache = NULL; #endif #ifdef ACPI_DBG_TRACK_ALLOCATIONS /* Debug only - display leftover memory allocation, if any */ acpi_ut_dump_allocations(ACPI_UINT32_MAX, NULL); /* Free memory lists */ acpi_os_free(acpi_gbl_global_list); acpi_gbl_global_list = NULL; acpi_os_free(acpi_gbl_ns_node_list); acpi_gbl_ns_node_list = NULL; #endif return (AE_OK); } /******************************************************************************* * * FUNCTION: acpi_ut_validate_buffer * * PARAMETERS: buffer - Buffer descriptor to be validated * * RETURN: Status * * DESCRIPTION: Perform parameter validation checks on an struct acpi_buffer * ******************************************************************************/ acpi_status acpi_ut_validate_buffer(struct acpi_buffer *buffer) { /* Obviously, the structure pointer must be valid */ if (!buffer) { return (AE_BAD_PARAMETER); } /* Special semantics for the length */ if ((buffer->length == ACPI_NO_BUFFER) || (buffer->length == ACPI_ALLOCATE_BUFFER) || (buffer->length == ACPI_ALLOCATE_LOCAL_BUFFER)) { return (AE_OK); } /* Length is valid, the buffer pointer must be also */ if (!buffer->pointer) { return (AE_BAD_PARAMETER); } return (AE_OK); } /******************************************************************************* * * FUNCTION: acpi_ut_initialize_buffer * * PARAMETERS: buffer - Buffer to be validated * required_length - Length needed * * RETURN: Status * * DESCRIPTION: Validate that the buffer is of the required length or * allocate a new buffer. Returned buffer is always zeroed. * ******************************************************************************/ acpi_status acpi_ut_initialize_buffer(struct acpi_buffer *buffer, acpi_size required_length) { acpi_size input_buffer_length; /* Parameter validation */ if (!buffer || !required_length) { return (AE_BAD_PARAMETER); } /* * Buffer->Length is used as both an input and output parameter. Get the * input actual length and set the output required buffer length. */ input_buffer_length = buffer->length; buffer->length = required_length; /* * The input buffer length contains the actual buffer length, or the type * of buffer to be allocated by this routine. */ switch (input_buffer_length) { case ACPI_NO_BUFFER: /* Return the exception (and the required buffer length) */ return (AE_BUFFER_OVERFLOW); case ACPI_ALLOCATE_BUFFER: /* * Allocate a new buffer. We directectly call acpi_os_allocate here to * purposefully bypass the (optionally enabled) internal allocation * tracking mechanism since we only want to track internal * allocations. Note: The caller should use acpi_os_free to free this * buffer created via ACPI_ALLOCATE_BUFFER. */ buffer->pointer = acpi_os_allocate(required_length); break; case ACPI_ALLOCATE_LOCAL_BUFFER: /* Allocate a new buffer with local interface to allow tracking */ buffer->pointer = ACPI_ALLOCATE(required_length); break; default: /* Existing buffer: Validate the size of the buffer */ if (input_buffer_length < required_length) { return (AE_BUFFER_OVERFLOW); } break; } /* Validate allocation from above or input buffer pointer */ if (!buffer->pointer) { return (AE_NO_MEMORY); } /* Have a valid buffer, clear it */ memset(buffer->pointer, 0, required_length); return (AE_OK); }
3 3 3 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2013 Davidlohr Bueso <davidlohr.bueso@hp.com> * * Based on the shift-and-subtract algorithm for computing integer * square root from Guy L. Steele. */ #include <linux/export.h> #include <linux/bitops.h> #include <linux/limits.h> #include <linux/math.h> /** * int_sqrt - computes the integer square root * @x: integer of which to calculate the sqrt * * Computes: floor(sqrt(x)) */ unsigned long int_sqrt(unsigned long x) { unsigned long b, m, y = 0; if (x <= 1) return x; m = 1UL << (__fls(x) & ~1UL); while (m != 0) { b = y + m; y >>= 1; if (x >= b) { x -= b; y += m; } m >>= 2; } return y; } EXPORT_SYMBOL(int_sqrt); #if BITS_PER_LONG < 64 /** * int_sqrt64 - strongly typed int_sqrt function when minimum 64 bit input * is expected. * @x: 64bit integer of which to calculate the sqrt */ u32 int_sqrt64(u64 x) { u64 b, m, y = 0; if (x <= ULONG_MAX) return int_sqrt((unsigned long) x); m = 1ULL << ((fls64(x) - 1) & ~1ULL); while (m != 0) { b = y + m; y >>= 1; if (x >= b) { x -= b; y += m; } m >>= 2; } return y; } EXPORT_SYMBOL(int_sqrt64); #endif
3159 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 /* SPDX-License-Identifier: GPL-2.0 */ /* * * Definitions for mount interface. This describes the in the kernel build * linkedlist with mounted filesystems. * * Author: Marco van Wieringen <mvw@planets.elm.net> * */ #ifndef _LINUX_MOUNT_H #define _LINUX_MOUNT_H #include <linux/types.h> #include <asm/barrier.h> struct super_block; struct dentry; struct user_namespace; struct mnt_idmap; struct file_system_type; struct fs_context; struct file; struct path; enum mount_flags { MNT_NOSUID = 0x01, MNT_NODEV = 0x02, MNT_NOEXEC = 0x04, MNT_NOATIME = 0x08, MNT_NODIRATIME = 0x10, MNT_RELATIME = 0x20, MNT_READONLY = 0x40, /* does the user want this to be r/o? */ MNT_NOSYMFOLLOW = 0x80, MNT_SHRINKABLE = 0x100, MNT_INTERNAL = 0x4000, MNT_LOCK_ATIME = 0x040000, MNT_LOCK_NOEXEC = 0x080000, MNT_LOCK_NOSUID = 0x100000, MNT_LOCK_NODEV = 0x200000, MNT_LOCK_READONLY = 0x400000, MNT_LOCKED = 0x800000, MNT_DOOMED = 0x1000000, MNT_SYNC_UMOUNT = 0x2000000, MNT_UMOUNT = 0x8000000, MNT_USER_SETTABLE_MASK = MNT_NOSUID | MNT_NODEV | MNT_NOEXEC | MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME | MNT_READONLY | MNT_NOSYMFOLLOW, MNT_ATIME_MASK = MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME, MNT_INTERNAL_FLAGS = MNT_INTERNAL | MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_LOCKED }; struct vfsmount { struct dentry *mnt_root; /* root of the mounted tree */ struct super_block *mnt_sb; /* pointer to superblock */ int mnt_flags; struct mnt_idmap *mnt_idmap; } __randomize_layout; static inline struct mnt_idmap *mnt_idmap(const struct vfsmount *mnt) { /* Pairs with smp_store_release() in do_idmap_mount(). */ return READ_ONCE(mnt->mnt_idmap); } extern int mnt_want_write(struct vfsmount *mnt); extern int mnt_want_write_file(struct file *file); extern void mnt_drop_write(struct vfsmount *mnt); extern void mnt_drop_write_file(struct file *file); extern void mntput(struct vfsmount *mnt); extern struct vfsmount *mntget(struct vfsmount *mnt); extern void mnt_make_shortterm(struct vfsmount *mnt); extern struct vfsmount *mnt_clone_internal(const struct path *path); extern bool __mnt_is_readonly(const struct vfsmount *mnt); extern bool mnt_may_suid(struct vfsmount *mnt); extern struct vfsmount *clone_private_mount(const struct path *path); int mnt_get_write_access(struct vfsmount *mnt); void mnt_put_write_access(struct vfsmount *mnt); extern struct vfsmount *fc_mount(struct fs_context *fc); extern struct vfsmount *fc_mount_longterm(struct fs_context *fc); extern struct vfsmount *vfs_create_mount(struct fs_context *fc); extern struct vfsmount *vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data); extern void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list); extern void mark_mounts_for_expiry(struct list_head *mounts); extern bool path_is_mountpoint(const struct path *path); extern bool our_mnt(struct vfsmount *mnt); extern struct vfsmount *kern_mount(struct file_system_type *); extern void kern_unmount(struct vfsmount *mnt); extern int may_umount_tree(struct vfsmount *); extern int may_umount(struct vfsmount *); int do_mount(const char *, const char __user *, const char *, unsigned long, void *); extern const struct path *collect_paths(const struct path *, struct path *, unsigned); extern void drop_collected_paths(const struct path *, const struct path *); extern void kern_unmount_array(struct vfsmount *mnt[], unsigned int num); extern int cifs_root_data(char **dev, char **opts); #endif /* _LINUX_MOUNT_H */
8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_NET_TIMESTAMPING_H_ #define _LINUX_NET_TIMESTAMPING_H_ #include <uapi/linux/net_tstamp.h> #include <uapi/linux/ethtool_netlink_generated.h> #define SOF_TIMESTAMPING_SOFTWARE_MASK (SOF_TIMESTAMPING_RX_SOFTWARE | \ SOF_TIMESTAMPING_TX_SOFTWARE | \ SOF_TIMESTAMPING_SOFTWARE) #define SOF_TIMESTAMPING_HARDWARE_MASK (SOF_TIMESTAMPING_RX_HARDWARE | \ SOF_TIMESTAMPING_TX_HARDWARE | \ SOF_TIMESTAMPING_RAW_HARDWARE) /** * struct hwtstamp_provider_desc - hwtstamp provider description * * @index: index of the hwtstamp provider. * @qualifier: hwtstamp provider qualifier. */ struct hwtstamp_provider_desc { int index; enum hwtstamp_provider_qualifier qualifier; }; /** * struct hwtstamp_provider - hwtstamp provider object * * @rcu_head: RCU callback used to free the struct. * @source: source of the hwtstamp provider. * @phydev: pointer of the phydev source in case a PTP coming from phylib * @desc: hwtstamp provider description. */ struct hwtstamp_provider { struct rcu_head rcu_head; enum hwtstamp_source source; struct phy_device *phydev; struct hwtstamp_provider_desc desc; }; /** * struct kernel_hwtstamp_config - Kernel copy of struct hwtstamp_config * * @flags: see struct hwtstamp_config * @tx_type: see struct hwtstamp_config * @rx_filter: see struct hwtstamp_config * @ifr: pointer to ifreq structure from the original ioctl request, to pass to * a legacy implementation of a lower driver * @copied_to_user: request was passed to a legacy implementation which already * copied the ioctl request back to user space * @source: indication whether timestamps should come from the netdev or from * an attached phylib PHY * @qualifier: qualifier of the hwtstamp provider * * Prefer using this structure for in-kernel processing of hardware * timestamping configuration, over the inextensible struct hwtstamp_config * exposed to the %SIOCGHWTSTAMP and %SIOCSHWTSTAMP ioctl UAPI. */ struct kernel_hwtstamp_config { int flags; int tx_type; int rx_filter; struct ifreq *ifr; bool copied_to_user; enum hwtstamp_source source; enum hwtstamp_provider_qualifier qualifier; }; static inline void hwtstamp_config_to_kernel(struct kernel_hwtstamp_config *kernel_cfg, const struct hwtstamp_config *cfg) { kernel_cfg->flags = cfg->flags; kernel_cfg->tx_type = cfg->tx_type; kernel_cfg->rx_filter = cfg->rx_filter; } static inline void hwtstamp_config_from_kernel(struct hwtstamp_config *cfg, const struct kernel_hwtstamp_config *kernel_cfg) { cfg->flags = kernel_cfg->flags; cfg->tx_type = kernel_cfg->tx_type; cfg->rx_filter = kernel_cfg->rx_filter; } static inline bool kernel_hwtstamp_config_changed(const struct kernel_hwtstamp_config *a, const struct kernel_hwtstamp_config *b) { return a->flags != b->flags || a->tx_type != b->tx_type || a->rx_filter != b->rx_filter; } #endif /* _LINUX_NET_TIMESTAMPING_H_ */
7 4 12 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 12 12 12 7 7 7 7 7 7 19 19 19 7 7 7 12 7 12 11 12 12 12 12 12 12 12 7 5 7 7 6 7 6 7 7 7 7 7 7 7 7 7 5 5 5 1 1 1 1 4 4 4 4 4 4 4 4 4 7 7 23 22 23 23 7 7 7 6 7 7 7 7 7 7 6 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 977 978 29 7 29 28 7 28 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 // SPDX-License-Identifier: GPL-2.0-only // Copyright (c) 2020 Facebook Inc. #include <linux/ethtool_netlink.h> #include <linux/netdevice.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/workqueue.h> #include <net/udp_tunnel.h> #include <net/vxlan.h> enum udp_tunnel_nic_table_entry_flags { UDP_TUNNEL_NIC_ENTRY_ADD = BIT(0), UDP_TUNNEL_NIC_ENTRY_DEL = BIT(1), UDP_TUNNEL_NIC_ENTRY_OP_FAIL = BIT(2), UDP_TUNNEL_NIC_ENTRY_FROZEN = BIT(3), }; struct udp_tunnel_nic_table_entry { __be16 port; u8 type; u8 flags; u16 use_cnt; #define UDP_TUNNEL_NIC_USE_CNT_MAX U16_MAX u8 hw_priv; }; /** * struct udp_tunnel_nic - UDP tunnel port offload state * @work: async work for talking to hardware from process context * @dev: netdev pointer * @lock: protects all fields * @need_sync: at least one port start changed * @need_replay: space was freed, we need a replay of all ports * @work_pending: @work is currently scheduled * @n_tables: number of tables under @entries * @missed: bitmap of tables which overflown * @entries: table of tables of ports currently offloaded */ struct udp_tunnel_nic { struct work_struct work; struct net_device *dev; struct mutex lock; u8 need_sync:1; u8 need_replay:1; u8 work_pending:1; unsigned int n_tables; unsigned long missed; struct udp_tunnel_nic_table_entry *entries[] __counted_by(n_tables); }; /* We ensure all work structs are done using driver state, but not the code. * We need a workqueue we can flush before module gets removed. */ static struct workqueue_struct *udp_tunnel_nic_workqueue; static const char *udp_tunnel_nic_tunnel_type_name(unsigned int type) { switch (type) { case UDP_TUNNEL_TYPE_VXLAN: return "vxlan"; case UDP_TUNNEL_TYPE_GENEVE: return "geneve"; case UDP_TUNNEL_TYPE_VXLAN_GPE: return "vxlan-gpe"; default: return "unknown"; } } static bool udp_tunnel_nic_entry_is_free(struct udp_tunnel_nic_table_entry *entry) { return entry->use_cnt == 0 && !entry->flags; } static bool udp_tunnel_nic_entry_is_present(struct udp_tunnel_nic_table_entry *entry) { return entry->use_cnt && !(entry->flags & ~UDP_TUNNEL_NIC_ENTRY_FROZEN); } static bool udp_tunnel_nic_entry_is_frozen(struct udp_tunnel_nic_table_entry *entry) { return entry->flags & UDP_TUNNEL_NIC_ENTRY_FROZEN; } static void udp_tunnel_nic_entry_freeze_used(struct udp_tunnel_nic_table_entry *entry) { if (!udp_tunnel_nic_entry_is_free(entry)) entry->flags |= UDP_TUNNEL_NIC_ENTRY_FROZEN; } static void udp_tunnel_nic_entry_unfreeze(struct udp_tunnel_nic_table_entry *entry) { entry->flags &= ~UDP_TUNNEL_NIC_ENTRY_FROZEN; } static bool udp_tunnel_nic_entry_is_queued(struct udp_tunnel_nic_table_entry *entry) { return entry->flags & (UDP_TUNNEL_NIC_ENTRY_ADD | UDP_TUNNEL_NIC_ENTRY_DEL); } static void udp_tunnel_nic_entry_queue(struct udp_tunnel_nic *utn, struct udp_tunnel_nic_table_entry *entry, unsigned int flag) { entry->flags |= flag; utn->need_sync = 1; } static void udp_tunnel_nic_ti_from_entry(struct udp_tunnel_nic_table_entry *entry, struct udp_tunnel_info *ti) { memset(ti, 0, sizeof(*ti)); ti->port = entry->port; ti->type = entry->type; ti->hw_priv = entry->hw_priv; } static bool udp_tunnel_nic_is_empty(struct net_device *dev, struct udp_tunnel_nic *utn) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; unsigned int i, j; for (i = 0; i < utn->n_tables; i++) for (j = 0; j < info->tables[i].n_entries; j++) if (!udp_tunnel_nic_entry_is_free(&utn->entries[i][j])) return false; return true; } static bool udp_tunnel_nic_should_replay(struct net_device *dev, struct udp_tunnel_nic *utn) { const struct udp_tunnel_nic_table_info *table; unsigned int i, j; if (!utn->missed) return false; for (i = 0; i < utn->n_tables; i++) { table = &dev->udp_tunnel_nic_info->tables[i]; if (!test_bit(i, &utn->missed)) continue; for (j = 0; j < table->n_entries; j++) if (udp_tunnel_nic_entry_is_free(&utn->entries[i][j])) return true; } return false; } static void __udp_tunnel_nic_get_port(struct net_device *dev, unsigned int table, unsigned int idx, struct udp_tunnel_info *ti) { struct udp_tunnel_nic_table_entry *entry; struct udp_tunnel_nic *utn; utn = dev->udp_tunnel_nic; entry = &utn->entries[table][idx]; if (entry->use_cnt) udp_tunnel_nic_ti_from_entry(entry, ti); } static void __udp_tunnel_nic_set_port_priv(struct net_device *dev, unsigned int table, unsigned int idx, u8 priv) { dev->udp_tunnel_nic->entries[table][idx].hw_priv = priv; } static void udp_tunnel_nic_entry_update_done(struct udp_tunnel_nic_table_entry *entry, int err) { bool dodgy = entry->flags & UDP_TUNNEL_NIC_ENTRY_OP_FAIL; WARN_ON_ONCE(entry->flags & UDP_TUNNEL_NIC_ENTRY_ADD && entry->flags & UDP_TUNNEL_NIC_ENTRY_DEL); if (entry->flags & UDP_TUNNEL_NIC_ENTRY_ADD && (!err || (err == -EEXIST && dodgy))) entry->flags &= ~UDP_TUNNEL_NIC_ENTRY_ADD; if (entry->flags & UDP_TUNNEL_NIC_ENTRY_DEL && (!err || (err == -ENOENT && dodgy))) entry->flags &= ~UDP_TUNNEL_NIC_ENTRY_DEL; if (!err) entry->flags &= ~UDP_TUNNEL_NIC_ENTRY_OP_FAIL; else entry->flags |= UDP_TUNNEL_NIC_ENTRY_OP_FAIL; } static void udp_tunnel_nic_device_sync_one(struct net_device *dev, struct udp_tunnel_nic *utn, unsigned int table, unsigned int idx) { struct udp_tunnel_nic_table_entry *entry; struct udp_tunnel_info ti; int err; entry = &utn->entries[table][idx]; if (!udp_tunnel_nic_entry_is_queued(entry)) return; udp_tunnel_nic_ti_from_entry(entry, &ti); if (entry->flags & UDP_TUNNEL_NIC_ENTRY_ADD) err = dev->udp_tunnel_nic_info->set_port(dev, table, idx, &ti); else err = dev->udp_tunnel_nic_info->unset_port(dev, table, idx, &ti); udp_tunnel_nic_entry_update_done(entry, err); if (err) netdev_warn(dev, "UDP tunnel port sync failed port %d type %s: %d\n", be16_to_cpu(entry->port), udp_tunnel_nic_tunnel_type_name(entry->type), err); } static void udp_tunnel_nic_device_sync_by_port(struct net_device *dev, struct udp_tunnel_nic *utn) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; unsigned int i, j; for (i = 0; i < utn->n_tables; i++) for (j = 0; j < info->tables[i].n_entries; j++) udp_tunnel_nic_device_sync_one(dev, utn, i, j); } static void udp_tunnel_nic_device_sync_by_table(struct net_device *dev, struct udp_tunnel_nic *utn) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; unsigned int i, j; int err; for (i = 0; i < utn->n_tables; i++) { /* Find something that needs sync in this table */ for (j = 0; j < info->tables[i].n_entries; j++) if (udp_tunnel_nic_entry_is_queued(&utn->entries[i][j])) break; if (j == info->tables[i].n_entries) continue; err = info->sync_table(dev, i); if (err) netdev_warn(dev, "UDP tunnel port sync failed for table %d: %d\n", i, err); for (j = 0; j < info->tables[i].n_entries; j++) { struct udp_tunnel_nic_table_entry *entry; entry = &utn->entries[i][j]; if (udp_tunnel_nic_entry_is_queued(entry)) udp_tunnel_nic_entry_update_done(entry, err); } } } static void __udp_tunnel_nic_device_sync(struct net_device *dev, struct udp_tunnel_nic *utn) { if (!utn->need_sync) return; if (dev->udp_tunnel_nic_info->sync_table) udp_tunnel_nic_device_sync_by_table(dev, utn); else udp_tunnel_nic_device_sync_by_port(dev, utn); utn->need_sync = 0; /* Can't replay directly here, in case we come from the tunnel driver's * notification - trying to replay may deadlock inside tunnel driver. */ utn->need_replay = udp_tunnel_nic_should_replay(dev, utn); } static void udp_tunnel_nic_device_sync(struct net_device *dev, struct udp_tunnel_nic *utn) { if (!utn->need_sync) return; queue_work(udp_tunnel_nic_workqueue, &utn->work); utn->work_pending = 1; } static bool udp_tunnel_nic_table_is_capable(const struct udp_tunnel_nic_table_info *table, struct udp_tunnel_info *ti) { return table->tunnel_types & ti->type; } static bool udp_tunnel_nic_is_capable(struct net_device *dev, struct udp_tunnel_nic *utn, struct udp_tunnel_info *ti) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; unsigned int i; /* Special case IPv4-only NICs */ if (info->flags & UDP_TUNNEL_NIC_INFO_IPV4_ONLY && ti->sa_family != AF_INET) return false; for (i = 0; i < utn->n_tables; i++) if (udp_tunnel_nic_table_is_capable(&info->tables[i], ti)) return true; return false; } static int udp_tunnel_nic_has_collision(struct net_device *dev, struct udp_tunnel_nic *utn, struct udp_tunnel_info *ti) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; struct udp_tunnel_nic_table_entry *entry; unsigned int i, j; for (i = 0; i < utn->n_tables; i++) for (j = 0; j < info->tables[i].n_entries; j++) { entry = &utn->entries[i][j]; if (!udp_tunnel_nic_entry_is_free(entry) && entry->port == ti->port && entry->type != ti->type) { __set_bit(i, &utn->missed); return true; } } return false; } static void udp_tunnel_nic_entry_adj(struct udp_tunnel_nic *utn, unsigned int table, unsigned int idx, int use_cnt_adj) { struct udp_tunnel_nic_table_entry *entry = &utn->entries[table][idx]; bool dodgy = entry->flags & UDP_TUNNEL_NIC_ENTRY_OP_FAIL; unsigned int from, to; WARN_ON(entry->use_cnt + (u32)use_cnt_adj > U16_MAX); /* If not going from used to unused or vice versa - all done. * For dodgy entries make sure we try to sync again (queue the entry). */ entry->use_cnt += use_cnt_adj; if (!dodgy && !entry->use_cnt == !(entry->use_cnt - use_cnt_adj)) return; /* Cancel the op before it was sent to the device, if possible, * otherwise we'd need to take special care to issue commands * in the same order the ports arrived. */ if (use_cnt_adj < 0) { from = UDP_TUNNEL_NIC_ENTRY_ADD; to = UDP_TUNNEL_NIC_ENTRY_DEL; } else { from = UDP_TUNNEL_NIC_ENTRY_DEL; to = UDP_TUNNEL_NIC_ENTRY_ADD; } if (entry->flags & from) { entry->flags &= ~from; if (!dodgy) return; } udp_tunnel_nic_entry_queue(utn, entry, to); } static bool udp_tunnel_nic_entry_try_adj(struct udp_tunnel_nic *utn, unsigned int table, unsigned int idx, struct udp_tunnel_info *ti, int use_cnt_adj) { struct udp_tunnel_nic_table_entry *entry = &utn->entries[table][idx]; if (udp_tunnel_nic_entry_is_free(entry) || entry->port != ti->port || entry->type != ti->type) return false; if (udp_tunnel_nic_entry_is_frozen(entry)) return true; udp_tunnel_nic_entry_adj(utn, table, idx, use_cnt_adj); return true; } /* Try to find existing matching entry and adjust its use count, instead of * adding a new one. Returns true if entry was found. In case of delete the * entry may have gotten removed in the process, in which case it will be * queued for removal. */ static bool udp_tunnel_nic_try_existing(struct net_device *dev, struct udp_tunnel_nic *utn, struct udp_tunnel_info *ti, int use_cnt_adj) { const struct udp_tunnel_nic_table_info *table; unsigned int i, j; for (i = 0; i < utn->n_tables; i++) { table = &dev->udp_tunnel_nic_info->tables[i]; if (!udp_tunnel_nic_table_is_capable(table, ti)) continue; for (j = 0; j < table->n_entries; j++) if (udp_tunnel_nic_entry_try_adj(utn, i, j, ti, use_cnt_adj)) return true; } return false; } static bool udp_tunnel_nic_add_existing(struct net_device *dev, struct udp_tunnel_nic *utn, struct udp_tunnel_info *ti) { return udp_tunnel_nic_try_existing(dev, utn, ti, +1); } static bool udp_tunnel_nic_del_existing(struct net_device *dev, struct udp_tunnel_nic *utn, struct udp_tunnel_info *ti) { return udp_tunnel_nic_try_existing(dev, utn, ti, -1); } static bool udp_tunnel_nic_add_new(struct net_device *dev, struct udp_tunnel_nic *utn, struct udp_tunnel_info *ti) { const struct udp_tunnel_nic_table_info *table; unsigned int i, j; for (i = 0; i < utn->n_tables; i++) { table = &dev->udp_tunnel_nic_info->tables[i]; if (!udp_tunnel_nic_table_is_capable(table, ti)) continue; for (j = 0; j < table->n_entries; j++) { struct udp_tunnel_nic_table_entry *entry; entry = &utn->entries[i][j]; if (!udp_tunnel_nic_entry_is_free(entry)) continue; entry->port = ti->port; entry->type = ti->type; entry->use_cnt = 1; udp_tunnel_nic_entry_queue(utn, entry, UDP_TUNNEL_NIC_ENTRY_ADD); return true; } /* The different table may still fit this port in, but there * are no devices currently which have multiple tables accepting * the same tunnel type, and false positives are okay. */ __set_bit(i, &utn->missed); } return false; } static void __udp_tunnel_nic_add_port(struct net_device *dev, struct udp_tunnel_info *ti) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; struct udp_tunnel_nic *utn; utn = dev->udp_tunnel_nic; if (!utn) return; if (!netif_running(dev) && info->flags & UDP_TUNNEL_NIC_INFO_OPEN_ONLY) return; if (info->flags & UDP_TUNNEL_NIC_INFO_STATIC_IANA_VXLAN && ti->port == htons(IANA_VXLAN_UDP_PORT)) { if (ti->type != UDP_TUNNEL_TYPE_VXLAN) netdev_warn(dev, "device assumes port 4789 will be used by vxlan tunnels\n"); return; } if (!udp_tunnel_nic_is_capable(dev, utn, ti)) return; /* It may happen that a tunnel of one type is removed and different * tunnel type tries to reuse its port before the device was informed. * Rely on utn->missed to re-add this port later. */ if (udp_tunnel_nic_has_collision(dev, utn, ti)) return; if (!udp_tunnel_nic_add_existing(dev, utn, ti)) udp_tunnel_nic_add_new(dev, utn, ti); udp_tunnel_nic_device_sync(dev, utn); } static void __udp_tunnel_nic_del_port(struct net_device *dev, struct udp_tunnel_info *ti) { struct udp_tunnel_nic *utn; utn = dev->udp_tunnel_nic; if (!utn) return; if (!udp_tunnel_nic_is_capable(dev, utn, ti)) return; udp_tunnel_nic_del_existing(dev, utn, ti); udp_tunnel_nic_device_sync(dev, utn); } static void __udp_tunnel_nic_reset_ntf(struct net_device *dev) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; struct udp_tunnel_nic *utn; unsigned int i, j; utn = dev->udp_tunnel_nic; if (!utn) return; mutex_lock(&utn->lock); utn->need_sync = false; for (i = 0; i < utn->n_tables; i++) for (j = 0; j < info->tables[i].n_entries; j++) { struct udp_tunnel_nic_table_entry *entry; entry = &utn->entries[i][j]; entry->flags &= ~(UDP_TUNNEL_NIC_ENTRY_DEL | UDP_TUNNEL_NIC_ENTRY_OP_FAIL); /* We don't release utn lock across ops */ WARN_ON(entry->flags & UDP_TUNNEL_NIC_ENTRY_FROZEN); if (!entry->use_cnt) continue; udp_tunnel_nic_entry_queue(utn, entry, UDP_TUNNEL_NIC_ENTRY_ADD); } __udp_tunnel_nic_device_sync(dev, utn); mutex_unlock(&utn->lock); } static size_t __udp_tunnel_nic_dump_size(struct net_device *dev, unsigned int table) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; struct udp_tunnel_nic *utn; unsigned int j; size_t size; utn = dev->udp_tunnel_nic; if (!utn) return 0; size = 0; for (j = 0; j < info->tables[table].n_entries; j++) { if (!udp_tunnel_nic_entry_is_present(&utn->entries[table][j])) continue; size += nla_total_size(0) + /* _TABLE_ENTRY */ nla_total_size(sizeof(__be16)) + /* _ENTRY_PORT */ nla_total_size(sizeof(u32)); /* _ENTRY_TYPE */ } return size; } static int __udp_tunnel_nic_dump_write(struct net_device *dev, unsigned int table, struct sk_buff *skb) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; struct udp_tunnel_nic *utn; struct nlattr *nest; unsigned int j; utn = dev->udp_tunnel_nic; if (!utn) return 0; for (j = 0; j < info->tables[table].n_entries; j++) { if (!udp_tunnel_nic_entry_is_present(&utn->entries[table][j])) continue; nest = nla_nest_start(skb, ETHTOOL_A_TUNNEL_UDP_TABLE_ENTRY); if (!nest) return -EMSGSIZE; if (nla_put_be16(skb, ETHTOOL_A_TUNNEL_UDP_ENTRY_PORT, utn->entries[table][j].port) || nla_put_u32(skb, ETHTOOL_A_TUNNEL_UDP_ENTRY_TYPE, ilog2(utn->entries[table][j].type))) goto err_cancel; nla_nest_end(skb, nest); } return 0; err_cancel: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static void __udp_tunnel_nic_assert_locked(struct net_device *dev) { struct udp_tunnel_nic *utn; utn = dev->udp_tunnel_nic; if (utn) lockdep_assert_held(&utn->lock); } static void __udp_tunnel_nic_lock(struct net_device *dev) { struct udp_tunnel_nic *utn; utn = dev->udp_tunnel_nic; if (utn) mutex_lock(&utn->lock); } static void __udp_tunnel_nic_unlock(struct net_device *dev) { struct udp_tunnel_nic *utn; utn = dev->udp_tunnel_nic; if (utn) mutex_unlock(&utn->lock); } static const struct udp_tunnel_nic_ops __udp_tunnel_nic_ops = { .get_port = __udp_tunnel_nic_get_port, .set_port_priv = __udp_tunnel_nic_set_port_priv, .add_port = __udp_tunnel_nic_add_port, .del_port = __udp_tunnel_nic_del_port, .reset_ntf = __udp_tunnel_nic_reset_ntf, .dump_size = __udp_tunnel_nic_dump_size, .dump_write = __udp_tunnel_nic_dump_write, .assert_locked = __udp_tunnel_nic_assert_locked, .lock = __udp_tunnel_nic_lock, .unlock = __udp_tunnel_nic_unlock, }; static void udp_tunnel_nic_flush(struct net_device *dev, struct udp_tunnel_nic *utn) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; unsigned int i, j; for (i = 0; i < utn->n_tables; i++) for (j = 0; j < info->tables[i].n_entries; j++) { int adj_cnt = -utn->entries[i][j].use_cnt; if (adj_cnt) udp_tunnel_nic_entry_adj(utn, i, j, adj_cnt); } __udp_tunnel_nic_device_sync(dev, utn); for (i = 0; i < utn->n_tables; i++) memset(utn->entries[i], 0, array_size(info->tables[i].n_entries, sizeof(**utn->entries))); WARN_ON(utn->need_sync); utn->need_replay = 0; } static void udp_tunnel_nic_replay(struct net_device *dev, struct udp_tunnel_nic *utn) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; struct udp_tunnel_nic_shared_node *node; unsigned int i, j; /* Freeze all the ports we are already tracking so that the replay * does not double up the refcount. */ for (i = 0; i < utn->n_tables; i++) for (j = 0; j < info->tables[i].n_entries; j++) udp_tunnel_nic_entry_freeze_used(&utn->entries[i][j]); utn->missed = 0; utn->need_replay = 0; if (!info->shared) { udp_tunnel_get_rx_info(dev); } else { list_for_each_entry(node, &info->shared->devices, list) udp_tunnel_get_rx_info(node->dev); } for (i = 0; i < utn->n_tables; i++) for (j = 0; j < info->tables[i].n_entries; j++) udp_tunnel_nic_entry_unfreeze(&utn->entries[i][j]); } static void udp_tunnel_nic_device_sync_work(struct work_struct *work) { struct udp_tunnel_nic *utn = container_of(work, struct udp_tunnel_nic, work); rtnl_lock(); mutex_lock(&utn->lock); utn->work_pending = 0; __udp_tunnel_nic_device_sync(utn->dev, utn); if (utn->need_replay) udp_tunnel_nic_replay(utn->dev, utn); mutex_unlock(&utn->lock); rtnl_unlock(); } static struct udp_tunnel_nic * udp_tunnel_nic_alloc(const struct udp_tunnel_nic_info *info, unsigned int n_tables) { struct udp_tunnel_nic *utn; unsigned int i; utn = kzalloc(struct_size(utn, entries, n_tables), GFP_KERNEL); if (!utn) return NULL; utn->n_tables = n_tables; INIT_WORK(&utn->work, udp_tunnel_nic_device_sync_work); mutex_init(&utn->lock); for (i = 0; i < n_tables; i++) { utn->entries[i] = kcalloc(info->tables[i].n_entries, sizeof(*utn->entries[i]), GFP_KERNEL); if (!utn->entries[i]) goto err_free_prev_entries; } return utn; err_free_prev_entries: while (i--) kfree(utn->entries[i]); kfree(utn); return NULL; } static void udp_tunnel_nic_free(struct udp_tunnel_nic *utn) { unsigned int i; for (i = 0; i < utn->n_tables; i++) kfree(utn->entries[i]); kfree(utn); } static int udp_tunnel_nic_register(struct net_device *dev) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; struct udp_tunnel_nic_shared_node *node = NULL; struct udp_tunnel_nic *utn; unsigned int n_tables, i; BUILD_BUG_ON(sizeof(utn->missed) * BITS_PER_BYTE < UDP_TUNNEL_NIC_MAX_TABLES); /* Expect use count of at most 2 (IPv4, IPv6) per device */ BUILD_BUG_ON(UDP_TUNNEL_NIC_USE_CNT_MAX < UDP_TUNNEL_NIC_MAX_SHARING_DEVICES * 2); /* Check that the driver info is sane */ if (WARN_ON(!info->set_port != !info->unset_port) || WARN_ON(!info->set_port == !info->sync_table) || WARN_ON(!info->tables[0].n_entries)) return -EINVAL; if (WARN_ON(info->shared && info->flags & UDP_TUNNEL_NIC_INFO_OPEN_ONLY)) return -EINVAL; n_tables = 1; for (i = 1; i < UDP_TUNNEL_NIC_MAX_TABLES; i++) { if (!info->tables[i].n_entries) continue; n_tables++; if (WARN_ON(!info->tables[i - 1].n_entries)) return -EINVAL; } /* Create UDP tunnel state structures */ if (info->shared) { node = kzalloc(sizeof(*node), GFP_KERNEL); if (!node) return -ENOMEM; node->dev = dev; } if (info->shared && info->shared->udp_tunnel_nic_info) { utn = info->shared->udp_tunnel_nic_info; } else { utn = udp_tunnel_nic_alloc(info, n_tables); if (!utn) { kfree(node); return -ENOMEM; } } if (info->shared) { if (!info->shared->udp_tunnel_nic_info) { INIT_LIST_HEAD(&info->shared->devices); info->shared->udp_tunnel_nic_info = utn; } list_add_tail(&node->list, &info->shared->devices); } utn->dev = dev; dev_hold(dev); dev->udp_tunnel_nic = utn; if (!(info->flags & UDP_TUNNEL_NIC_INFO_OPEN_ONLY)) { udp_tunnel_nic_lock(dev); udp_tunnel_get_rx_info(dev); udp_tunnel_nic_unlock(dev); } return 0; } static void udp_tunnel_nic_unregister(struct net_device *dev, struct udp_tunnel_nic *utn) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; udp_tunnel_nic_lock(dev); /* For a shared table remove this dev from the list of sharing devices * and if there are other devices just detach. */ if (info->shared) { struct udp_tunnel_nic_shared_node *node, *first; list_for_each_entry(node, &info->shared->devices, list) if (node->dev == dev) break; if (list_entry_is_head(node, &info->shared->devices, list)) { udp_tunnel_nic_unlock(dev); return; } list_del(&node->list); kfree(node); first = list_first_entry_or_null(&info->shared->devices, typeof(*first), list); if (first) { udp_tunnel_drop_rx_info(dev); utn->dev = first->dev; udp_tunnel_nic_unlock(dev); goto release_dev; } info->shared->udp_tunnel_nic_info = NULL; } /* Flush before we check work, so we don't waste time adding entries * from the work which we will boot immediately. */ udp_tunnel_nic_flush(dev, utn); udp_tunnel_nic_unlock(dev); /* Wait for the work to be done using the state, netdev core will * retry unregister until we give up our reference on this device. */ if (utn->work_pending) return; udp_tunnel_nic_free(utn); release_dev: dev->udp_tunnel_nic = NULL; dev_put(dev); } static int udp_tunnel_nic_netdevice_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); const struct udp_tunnel_nic_info *info; struct udp_tunnel_nic *utn; info = dev->udp_tunnel_nic_info; if (!info) return NOTIFY_DONE; if (event == NETDEV_REGISTER) { int err; err = udp_tunnel_nic_register(dev); if (err) netdev_warn(dev, "failed to register for UDP tunnel offloads: %d", err); return notifier_from_errno(err); } /* All other events will need the udp_tunnel_nic state */ utn = dev->udp_tunnel_nic; if (!utn) return NOTIFY_DONE; if (event == NETDEV_UNREGISTER) { udp_tunnel_nic_unregister(dev, utn); return NOTIFY_OK; } /* All other events only matter if NIC has to be programmed open */ if (!(info->flags & UDP_TUNNEL_NIC_INFO_OPEN_ONLY)) return NOTIFY_DONE; if (event == NETDEV_UP) { udp_tunnel_nic_lock(dev); WARN_ON(!udp_tunnel_nic_is_empty(dev, utn)); udp_tunnel_get_rx_info(dev); udp_tunnel_nic_unlock(dev); return NOTIFY_OK; } if (event == NETDEV_GOING_DOWN) { udp_tunnel_nic_lock(dev); udp_tunnel_nic_flush(dev, utn); udp_tunnel_nic_unlock(dev); return NOTIFY_OK; } return NOTIFY_DONE; } static struct notifier_block udp_tunnel_nic_notifier_block __read_mostly = { .notifier_call = udp_tunnel_nic_netdevice_event, }; static int __init udp_tunnel_nic_init_module(void) { int err; udp_tunnel_nic_workqueue = alloc_ordered_workqueue("udp_tunnel_nic", 0); if (!udp_tunnel_nic_workqueue) return -ENOMEM; rtnl_lock(); udp_tunnel_nic_ops = &__udp_tunnel_nic_ops; rtnl_unlock(); err = register_netdevice_notifier(&udp_tunnel_nic_notifier_block); if (err) goto err_unset_ops; return 0; err_unset_ops: rtnl_lock(); udp_tunnel_nic_ops = NULL; rtnl_unlock(); destroy_workqueue(udp_tunnel_nic_workqueue); return err; } late_initcall(udp_tunnel_nic_init_module); static void __exit udp_tunnel_nic_cleanup_module(void) { unregister_netdevice_notifier(&udp_tunnel_nic_notifier_block); rtnl_lock(); udp_tunnel_nic_ops = NULL; rtnl_unlock(); destroy_workqueue(udp_tunnel_nic_workqueue); } module_exit(udp_tunnel_nic_cleanup_module); MODULE_LICENSE("GPL");
3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 3 3 1 1 1 1 1 1 3 3 13209 13209 2 2 1 2 2 2 2 2 3 10 3 1 3 10 3 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 // SPDX-License-Identifier: GPL-2.0-only /* * xsave/xrstor support. * * Author: Suresh Siddha <suresh.b.siddha@intel.com> */ #include <linux/bitops.h> #include <linux/compat.h> #include <linux/cpu.h> #include <linux/mman.h> #include <linux/kvm_types.h> #include <linux/nospec.h> #include <linux/pkeys.h> #include <linux/seq_file.h> #include <linux/proc_fs.h> #include <linux/vmalloc.h> #include <linux/coredump.h> #include <linux/sort.h> #include <asm/fpu/api.h> #include <asm/fpu/regset.h> #include <asm/fpu/signal.h> #include <asm/fpu/xcr.h> #include <asm/cpuid/api.h> #include <asm/msr.h> #include <asm/tlbflush.h> #include <asm/prctl.h> #include <asm/elf.h> #include <uapi/asm/elf.h> #include "context.h" #include "internal.h" #include "legacy.h" #include "xstate.h" #define for_each_extended_xfeature(bit, mask) \ (bit) = FIRST_EXTENDED_XFEATURE; \ for_each_set_bit_from(bit, (unsigned long *)&(mask), 8 * sizeof(mask)) /* * Although we spell it out in here, the Processor Trace * xfeature is completely unused. We use other mechanisms * to save/restore PT state in Linux. */ static const char *xfeature_names[] = { "x87 floating point registers", "SSE registers", "AVX registers", "MPX bounds registers", "MPX CSR", "AVX-512 opmask", "AVX-512 Hi256", "AVX-512 ZMM_Hi256", "Processor Trace (unused)", "Protection Keys User registers", "PASID state", "Control-flow User registers", "Control-flow Kernel registers (KVM only)", "unknown xstate feature", "unknown xstate feature", "unknown xstate feature", "unknown xstate feature", "AMX Tile config", "AMX Tile data", "APX registers", "unknown xstate feature", }; static unsigned short xsave_cpuid_features[] __initdata = { [XFEATURE_FP] = X86_FEATURE_FPU, [XFEATURE_SSE] = X86_FEATURE_XMM, [XFEATURE_YMM] = X86_FEATURE_AVX, [XFEATURE_BNDREGS] = X86_FEATURE_MPX, [XFEATURE_BNDCSR] = X86_FEATURE_MPX, [XFEATURE_OPMASK] = X86_FEATURE_AVX512F, [XFEATURE_ZMM_Hi256] = X86_FEATURE_AVX512F, [XFEATURE_Hi16_ZMM] = X86_FEATURE_AVX512F, [XFEATURE_PT_UNIMPLEMENTED_SO_FAR] = X86_FEATURE_INTEL_PT, [XFEATURE_PKRU] = X86_FEATURE_OSPKE, [XFEATURE_PASID] = X86_FEATURE_ENQCMD, [XFEATURE_CET_USER] = X86_FEATURE_SHSTK, [XFEATURE_CET_KERNEL] = X86_FEATURE_SHSTK, [XFEATURE_XTILE_CFG] = X86_FEATURE_AMX_TILE, [XFEATURE_XTILE_DATA] = X86_FEATURE_AMX_TILE, [XFEATURE_APX] = X86_FEATURE_APX, }; static unsigned int xstate_offsets[XFEATURE_MAX] __ro_after_init = { [ 0 ... XFEATURE_MAX - 1] = -1}; static unsigned int xstate_sizes[XFEATURE_MAX] __ro_after_init = { [ 0 ... XFEATURE_MAX - 1] = -1}; static unsigned int xstate_flags[XFEATURE_MAX] __ro_after_init; /* * Ordering of xstate components in uncompacted format: The xfeature * number does not necessarily indicate its position in the XSAVE buffer. * This array defines the traversal order of xstate features. */ static unsigned int xfeature_uncompact_order[XFEATURE_MAX] __ro_after_init = { [ 0 ... XFEATURE_MAX - 1] = -1}; static inline unsigned int next_xfeature_order(unsigned int i, u64 mask) { for (; xfeature_uncompact_order[i] != -1; i++) { if (mask & BIT_ULL(xfeature_uncompact_order[i])) break; } return i; } /* Iterate xstate features in uncompacted order: */ #define for_each_extended_xfeature_in_order(i, mask) \ for (i = 0; \ i = next_xfeature_order(i, mask), \ xfeature_uncompact_order[i] != -1; \ i++) #define XSTATE_FLAG_SUPERVISOR BIT(0) #define XSTATE_FLAG_ALIGNED64 BIT(1) /* * Return whether the system supports a given xfeature. * * Also return the name of the (most advanced) feature that the caller requested: */ int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name) { u64 xfeatures_missing = xfeatures_needed & ~fpu_kernel_cfg.max_features; if (unlikely(feature_name)) { long xfeature_idx, max_idx; u64 xfeatures_print; /* * So we use FLS here to be able to print the most advanced * feature that was requested but is missing. So if a driver * asks about "XFEATURE_MASK_SSE | XFEATURE_MASK_YMM" we'll print the * missing AVX feature - this is the most informative message * to users: */ if (xfeatures_missing) xfeatures_print = xfeatures_missing; else xfeatures_print = xfeatures_needed; xfeature_idx = fls64(xfeatures_print)-1; max_idx = ARRAY_SIZE(xfeature_names)-1; xfeature_idx = min(xfeature_idx, max_idx); *feature_name = xfeature_names[xfeature_idx]; } if (xfeatures_missing) return 0; return 1; } EXPORT_SYMBOL_GPL(cpu_has_xfeatures); static bool xfeature_is_aligned64(int xfeature_nr) { return xstate_flags[xfeature_nr] & XSTATE_FLAG_ALIGNED64; } static bool xfeature_is_supervisor(int xfeature_nr) { return xstate_flags[xfeature_nr] & XSTATE_FLAG_SUPERVISOR; } static unsigned int xfeature_get_offset(u64 xcomp_bv, int xfeature) { unsigned int offs, i; /* * Non-compacted format and legacy features use the cached fixed * offsets. */ if (!cpu_feature_enabled(X86_FEATURE_XCOMPACTED) || xfeature <= XFEATURE_SSE) return xstate_offsets[xfeature]; /* * Compacted format offsets depend on the actual content of the * compacted xsave area which is determined by the xcomp_bv header * field. */ offs = FXSAVE_SIZE + XSAVE_HDR_SIZE; for_each_extended_xfeature(i, xcomp_bv) { if (xfeature_is_aligned64(i)) offs = ALIGN(offs, 64); if (i == xfeature) break; offs += xstate_sizes[i]; } return offs; } /* * Enable the extended processor state save/restore feature. * Called once per CPU onlining. */ void fpu__init_cpu_xstate(void) { if (!boot_cpu_has(X86_FEATURE_XSAVE) || !fpu_kernel_cfg.max_features) return; cr4_set_bits(X86_CR4_OSXSAVE); /* * Must happen after CR4 setup and before xsetbv() to allow KVM * lazy passthrough. Write independent of the dynamic state static * key as that does not work on the boot CPU. This also ensures * that any stale state is wiped out from XFD. Reset the per CPU * xfd cache too. */ if (cpu_feature_enabled(X86_FEATURE_XFD)) xfd_set_state(init_fpstate.xfd); /* * XCR_XFEATURE_ENABLED_MASK (aka. XCR0) sets user features * managed by XSAVE{C, OPT, S} and XRSTOR{S}. Only XSAVE user * states can be set here. */ xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features); /* * MSR_IA32_XSS sets supervisor states managed by XSAVES. */ if (boot_cpu_has(X86_FEATURE_XSAVES)) { wrmsrq(MSR_IA32_XSS, xfeatures_mask_supervisor() | xfeatures_mask_independent()); } } static bool xfeature_enabled(enum xfeature xfeature) { return fpu_kernel_cfg.max_features & BIT_ULL(xfeature); } static int compare_xstate_offsets(const void *xfeature1, const void *xfeature2) { return xstate_offsets[*(unsigned int *)xfeature1] - xstate_offsets[*(unsigned int *)xfeature2]; } /* * Record the offsets and sizes of various xstates contained * in the XSAVE state memory layout. Also, create an ordered * list of xfeatures for handling out-of-order offsets. */ static void __init setup_xstate_cache(void) { u32 eax, ebx, ecx, edx, xfeature, i = 0; /* * The FP xstates and SSE xstates are legacy states. They are always * in the fixed offsets in the xsave area in either compacted form * or standard form. */ xstate_offsets[XFEATURE_FP] = 0; xstate_sizes[XFEATURE_FP] = offsetof(struct fxregs_state, xmm_space); xstate_offsets[XFEATURE_SSE] = xstate_sizes[XFEATURE_FP]; xstate_sizes[XFEATURE_SSE] = sizeof_field(struct fxregs_state, xmm_space); for_each_extended_xfeature(xfeature, fpu_kernel_cfg.max_features) { cpuid_count(CPUID_LEAF_XSTATE, xfeature, &eax, &ebx, &ecx, &edx); xstate_sizes[xfeature] = eax; xstate_flags[xfeature] = ecx; /* * If an xfeature is supervisor state, the offset in EBX is * invalid, leave it to -1. */ if (xfeature_is_supervisor(xfeature)) continue; xstate_offsets[xfeature] = ebx; /* Populate the list of xfeatures before sorting */ xfeature_uncompact_order[i++] = xfeature; } /* * Sort xfeatures by their offsets to support out-of-order * offsets in the uncompacted format. */ sort(xfeature_uncompact_order, i, sizeof(unsigned int), compare_xstate_offsets, NULL); } /* * Print out all the supported xstate features: */ static void __init print_xstate_features(void) { int i; for (i = 0; i < XFEATURE_MAX; i++) { u64 mask = BIT_ULL(i); const char *name; if (cpu_has_xfeatures(mask, &name)) pr_info("x86/fpu: Supporting XSAVE feature 0x%03Lx: '%s'\n", mask, name); } } /* * This check is important because it is easy to get XSTATE_* * confused with XSTATE_BIT_*. */ #define CHECK_XFEATURE(nr) do { \ WARN_ON(nr < FIRST_EXTENDED_XFEATURE); \ WARN_ON(nr >= XFEATURE_MAX); \ } while (0) /* * Print out xstate component offsets and sizes */ static void __init print_xstate_offset_size(void) { int i; for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) { pr_info("x86/fpu: xstate_offset[%d]: %4d, xstate_sizes[%d]: %4d\n", i, xfeature_get_offset(fpu_kernel_cfg.max_features, i), i, xstate_sizes[i]); } } /* * This function is called only during boot time when x86 caps are not set * up and alternative can not be used yet. */ static __init void os_xrstor_booting(struct xregs_state *xstate) { u64 mask = fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSTATE; u32 lmask = mask; u32 hmask = mask >> 32; int err; if (cpu_feature_enabled(X86_FEATURE_XSAVES)) XSTATE_OP(XRSTORS, xstate, lmask, hmask, err); else XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); /* * We should never fault when copying from a kernel buffer, and the FPU * state we set at boot time should be valid. */ WARN_ON_FPU(err); } /* * All supported features have either init state all zeros or are * handled in setup_init_fpu() individually. This is an explicit * feature list and does not use XFEATURE_MASK*SUPPORTED to catch * newly added supported features at build time and make people * actually look at the init state for the new feature. */ #define XFEATURES_INIT_FPSTATE_HANDLED \ (XFEATURE_MASK_FP | \ XFEATURE_MASK_SSE | \ XFEATURE_MASK_YMM | \ XFEATURE_MASK_OPMASK | \ XFEATURE_MASK_ZMM_Hi256 | \ XFEATURE_MASK_Hi16_ZMM | \ XFEATURE_MASK_PKRU | \ XFEATURE_MASK_BNDREGS | \ XFEATURE_MASK_BNDCSR | \ XFEATURE_MASK_PASID | \ XFEATURE_MASK_CET_USER | \ XFEATURE_MASK_CET_KERNEL | \ XFEATURE_MASK_XTILE | \ XFEATURE_MASK_APX) /* * setup the xstate image representing the init state */ static void __init setup_init_fpu_buf(void) { BUILD_BUG_ON((XFEATURE_MASK_USER_SUPPORTED | XFEATURE_MASK_SUPERVISOR_SUPPORTED) != XFEATURES_INIT_FPSTATE_HANDLED); if (!boot_cpu_has(X86_FEATURE_XSAVE)) return; print_xstate_features(); xstate_init_xcomp_bv(&init_fpstate.regs.xsave, init_fpstate.xfeatures); /* * Init all the features state with header.xfeatures being 0x0 */ os_xrstor_booting(&init_fpstate.regs.xsave); /* * All components are now in init state. Read the state back so * that init_fpstate contains all non-zero init state. This only * works with XSAVE, but not with XSAVEOPT and XSAVEC/S because * those use the init optimization which skips writing data for * components in init state. * * XSAVE could be used, but that would require to reshuffle the * data when XSAVEC/S is available because XSAVEC/S uses xstate * compaction. But doing so is a pointless exercise because most * components have an all zeros init state except for the legacy * ones (FP and SSE). Those can be saved with FXSAVE into the * legacy area. Adding new features requires to ensure that init * state is all zeroes or if not to add the necessary handling * here. */ fxsave(&init_fpstate.regs.fxsave); } int xfeature_size(int xfeature_nr) { u32 eax, ebx, ecx, edx; CHECK_XFEATURE(xfeature_nr); cpuid_count(CPUID_LEAF_XSTATE, xfeature_nr, &eax, &ebx, &ecx, &edx); return eax; } /* Validate an xstate header supplied by userspace (ptrace or sigreturn) */ static int validate_user_xstate_header(const struct xstate_header *hdr, struct fpstate *fpstate) { /* No unknown or supervisor features may be set */ if (hdr->xfeatures & ~fpstate->user_xfeatures) return -EINVAL; /* Userspace must use the uncompacted format */ if (hdr->xcomp_bv) return -EINVAL; /* * If 'reserved' is shrunken to add a new field, make sure to validate * that new field here! */ BUILD_BUG_ON(sizeof(hdr->reserved) != 48); /* No reserved bits may be set */ if (memchr_inv(hdr->reserved, 0, sizeof(hdr->reserved))) return -EINVAL; return 0; } static void __init __xstate_dump_leaves(void) { int i; u32 eax, ebx, ecx, edx; static int should_dump = 1; if (!should_dump) return; should_dump = 0; /* * Dump out a few leaves past the ones that we support * just in case there are some goodies up there */ for (i = 0; i < XFEATURE_MAX + 10; i++) { cpuid_count(CPUID_LEAF_XSTATE, i, &eax, &ebx, &ecx, &edx); pr_warn("CPUID[%02x, %02x]: eax=%08x ebx=%08x ecx=%08x edx=%08x\n", CPUID_LEAF_XSTATE, i, eax, ebx, ecx, edx); } } #define XSTATE_WARN_ON(x, fmt, ...) do { \ if (WARN_ONCE(x, "XSAVE consistency problem: " fmt, ##__VA_ARGS__)) { \ __xstate_dump_leaves(); \ } \ } while (0) #define XCHECK_SZ(sz, nr, __struct) ({ \ if (WARN_ONCE(sz != sizeof(__struct), \ "[%s]: struct is %zu bytes, cpu state %d bytes\n", \ xfeature_names[nr], sizeof(__struct), sz)) { \ __xstate_dump_leaves(); \ } \ true; \ }) /** * check_xtile_data_against_struct - Check tile data state size. * * Calculate the state size by multiplying the single tile size which is * recorded in a C struct, and the number of tiles that the CPU informs. * Compare the provided size with the calculation. * * @size: The tile data state size * * Returns: 0 on success, -EINVAL on mismatch. */ static int __init check_xtile_data_against_struct(int size) { u32 max_palid, palid, state_size; u32 eax, ebx, ecx, edx; u16 max_tile; /* * Check the maximum palette id: * eax: the highest numbered palette subleaf. */ cpuid_count(CPUID_LEAF_TILE, 0, &max_palid, &ebx, &ecx, &edx); /* * Cross-check each tile size and find the maximum number of * supported tiles. */ for (palid = 1, max_tile = 0; palid <= max_palid; palid++) { u16 tile_size, max; /* * Check the tile size info: * eax[31:16]: bytes per title * ebx[31:16]: the max names (or max number of tiles) */ cpuid_count(CPUID_LEAF_TILE, palid, &eax, &ebx, &edx, &edx); tile_size = eax >> 16; max = ebx >> 16; if (tile_size != sizeof(struct xtile_data)) { pr_err("%s: struct is %zu bytes, cpu xtile %d bytes\n", __stringify(XFEATURE_XTILE_DATA), sizeof(struct xtile_data), tile_size); __xstate_dump_leaves(); return -EINVAL; } if (max > max_tile) max_tile = max; } state_size = sizeof(struct xtile_data) * max_tile; if (size != state_size) { pr_err("%s: calculated size is %u bytes, cpu state %d bytes\n", __stringify(XFEATURE_XTILE_DATA), state_size, size); __xstate_dump_leaves(); return -EINVAL; } return 0; } /* * We have a C struct for each 'xstate'. We need to ensure * that our software representation matches what the CPU * tells us about the state's size. */ static bool __init check_xstate_against_struct(int nr) { /* * Ask the CPU for the size of the state. */ int sz = xfeature_size(nr); /* * Match each CPU state with the corresponding software * structure. */ switch (nr) { case XFEATURE_YMM: return XCHECK_SZ(sz, nr, struct ymmh_struct); case XFEATURE_BNDREGS: return XCHECK_SZ(sz, nr, struct mpx_bndreg_state); case XFEATURE_BNDCSR: return XCHECK_SZ(sz, nr, struct mpx_bndcsr_state); case XFEATURE_OPMASK: return XCHECK_SZ(sz, nr, struct avx_512_opmask_state); case XFEATURE_ZMM_Hi256: return XCHECK_SZ(sz, nr, struct avx_512_zmm_uppers_state); case XFEATURE_Hi16_ZMM: return XCHECK_SZ(sz, nr, struct avx_512_hi16_state); case XFEATURE_PKRU: return XCHECK_SZ(sz, nr, struct pkru_state); case XFEATURE_PASID: return XCHECK_SZ(sz, nr, struct ia32_pasid_state); case XFEATURE_XTILE_CFG: return XCHECK_SZ(sz, nr, struct xtile_cfg); case XFEATURE_CET_USER: return XCHECK_SZ(sz, nr, struct cet_user_state); case XFEATURE_CET_KERNEL: return XCHECK_SZ(sz, nr, struct cet_supervisor_state); case XFEATURE_APX: return XCHECK_SZ(sz, nr, struct apx_state); case XFEATURE_XTILE_DATA: check_xtile_data_against_struct(sz); return true; default: XSTATE_WARN_ON(1, "No structure for xstate: %d\n", nr); return false; } return true; } static unsigned int xstate_calculate_size(u64 xfeatures, bool compacted) { unsigned int topmost = fls64(xfeatures) - 1; unsigned int offset, i; if (topmost <= XFEATURE_SSE) return sizeof(struct xregs_state); if (compacted) { offset = xfeature_get_offset(xfeatures, topmost); } else { /* Walk through the xfeature order to pick the last */ for_each_extended_xfeature_in_order(i, xfeatures) topmost = xfeature_uncompact_order[i]; offset = xstate_offsets[topmost]; } return offset + xstate_sizes[topmost]; } /* * This essentially double-checks what the cpu told us about * how large the XSAVE buffer needs to be. We are recalculating * it to be safe. * * Independent XSAVE features allocate their own buffers and are not * covered by these checks. Only the size of the buffer for task->fpu * is checked here. */ static bool __init paranoid_xstate_size_valid(unsigned int kernel_size) { bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED); bool xsaves = cpu_feature_enabled(X86_FEATURE_XSAVES); unsigned int size = FXSAVE_SIZE + XSAVE_HDR_SIZE; int i; for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) { if (!check_xstate_against_struct(i)) return false; /* * Supervisor state components can be managed only by * XSAVES. */ if (!xsaves && xfeature_is_supervisor(i)) { XSTATE_WARN_ON(1, "Got supervisor feature %d, but XSAVES not advertised\n", i); return false; } } size = xstate_calculate_size(fpu_kernel_cfg.max_features, compacted); XSTATE_WARN_ON(size != kernel_size, "size %u != kernel_size %u\n", size, kernel_size); return size == kernel_size; } /* * Get total size of enabled xstates in XCR0 | IA32_XSS. * * Note the SDM's wording here. "sub-function 0" only enumerates * the size of the *user* states. If we use it to size a buffer * that we use 'XSAVES' on, we could potentially overflow the * buffer because 'XSAVES' saves system states too. * * This also takes compaction into account. So this works for * XSAVEC as well. */ static unsigned int __init get_compacted_size(void) { unsigned int eax, ebx, ecx, edx; /* * - CPUID function 0DH, sub-function 1: * EBX enumerates the size (in bytes) required by * the XSAVES instruction for an XSAVE area * containing all the state components * corresponding to bits currently set in * XCR0 | IA32_XSS. * * When XSAVES is not available but XSAVEC is (virt), then there * are no supervisor states, but XSAVEC still uses compacted * format. */ cpuid_count(CPUID_LEAF_XSTATE, 1, &eax, &ebx, &ecx, &edx); return ebx; } /* * Get the total size of the enabled xstates without the independent supervisor * features. */ static unsigned int __init get_xsave_compacted_size(void) { u64 mask = xfeatures_mask_independent(); unsigned int size; if (!mask) return get_compacted_size(); /* Disable independent features. */ wrmsrq(MSR_IA32_XSS, xfeatures_mask_supervisor()); /* * Ask the hardware what size is required of the buffer. * This is the size required for the task->fpu buffer. */ size = get_compacted_size(); /* Re-enable independent features so XSAVES will work on them again. */ wrmsrq(MSR_IA32_XSS, xfeatures_mask_supervisor() | mask); return size; } static unsigned int __init get_xsave_size_user(void) { unsigned int eax, ebx, ecx, edx; /* * - CPUID function 0DH, sub-function 0: * EBX enumerates the size (in bytes) required by * the XSAVE instruction for an XSAVE area * containing all the *user* state components * corresponding to bits currently set in XCR0. */ cpuid_count(CPUID_LEAF_XSTATE, 0, &eax, &ebx, &ecx, &edx); return ebx; } static int __init init_xstate_size(void) { /* Recompute the context size for enabled features: */ unsigned int user_size, kernel_size, kernel_default_size; bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED); /* Uncompacted user space size */ user_size = get_xsave_size_user(); /* * XSAVES kernel size includes supervisor states and uses compacted * format. XSAVEC uses compacted format, but does not save * supervisor states. * * XSAVE[OPT] do not support supervisor states so kernel and user * size is identical. */ if (compacted) kernel_size = get_xsave_compacted_size(); else kernel_size = user_size; kernel_default_size = xstate_calculate_size(fpu_kernel_cfg.default_features, compacted); if (!paranoid_xstate_size_valid(kernel_size)) return -EINVAL; fpu_kernel_cfg.max_size = kernel_size; fpu_user_cfg.max_size = user_size; fpu_kernel_cfg.default_size = kernel_default_size; fpu_user_cfg.default_size = xstate_calculate_size(fpu_user_cfg.default_features, false); guest_default_cfg.size = xstate_calculate_size(guest_default_cfg.features, compacted); return 0; } /* * We enabled the XSAVE hardware, but something went wrong and * we can not use it. Disable it. */ static void __init fpu__init_disable_system_xstate(unsigned int legacy_size) { pr_info("x86/fpu: XSAVE disabled\n"); fpu_kernel_cfg.max_features = 0; cr4_clear_bits(X86_CR4_OSXSAVE); setup_clear_cpu_cap(X86_FEATURE_XSAVE); /* Restore the legacy size.*/ fpu_kernel_cfg.max_size = legacy_size; fpu_kernel_cfg.default_size = legacy_size; fpu_user_cfg.max_size = legacy_size; fpu_user_cfg.default_size = legacy_size; guest_default_cfg.size = legacy_size; /* * Prevent enabling the static branch which enables writes to the * XFD MSR. */ init_fpstate.xfd = 0; fpstate_reset(x86_task_fpu(current)); } static u64 __init host_default_mask(void) { /* * Exclude dynamic features (require userspace opt-in) and features * that are supported only for KVM guests. */ return ~((u64)XFEATURE_MASK_USER_DYNAMIC | XFEATURE_MASK_GUEST_SUPERVISOR); } static u64 __init guest_default_mask(void) { /* * Exclude dynamic features, which require userspace opt-in even * for KVM guests. */ return ~(u64)XFEATURE_MASK_USER_DYNAMIC; } /* * Enable and initialize the xsave feature. * Called once per system bootup. */ void __init fpu__init_system_xstate(unsigned int legacy_size) { unsigned int eax, ebx, ecx, edx; u64 xfeatures; int err; int i; if (!boot_cpu_has(X86_FEATURE_FPU)) { pr_info("x86/fpu: No FPU detected\n"); return; } if (!boot_cpu_has(X86_FEATURE_XSAVE)) { pr_info("x86/fpu: x87 FPU will use %s\n", boot_cpu_has(X86_FEATURE_FXSR) ? "FXSAVE" : "FSAVE"); return; } /* * Find user xstates supported by the processor. */ cpuid_count(CPUID_LEAF_XSTATE, 0, &eax, &ebx, &ecx, &edx); fpu_kernel_cfg.max_features = eax + ((u64)edx << 32); /* * Find supervisor xstates supported by the processor. */ cpuid_count(CPUID_LEAF_XSTATE, 1, &eax, &ebx, &ecx, &edx); fpu_kernel_cfg.max_features |= ecx + ((u64)edx << 32); if ((fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) { /* * This indicates that something really unexpected happened * with the enumeration. Disable XSAVE and try to continue * booting without it. This is too early to BUG(). */ pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n", fpu_kernel_cfg.max_features); goto out_disable; } if (fpu_kernel_cfg.max_features & XFEATURE_MASK_APX && fpu_kernel_cfg.max_features & (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR)) { /* * This is a problematic CPU configuration where two * conflicting state components are both enumerated. */ pr_err("x86/fpu: Both APX/MPX present in the CPU's xstate features: 0x%llx.\n", fpu_kernel_cfg.max_features); goto out_disable; } fpu_kernel_cfg.independent_features = fpu_kernel_cfg.max_features & XFEATURE_MASK_INDEPENDENT; /* * Clear XSAVE features that are disabled in the normal CPUID. */ for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) { unsigned short cid = xsave_cpuid_features[i]; /* Careful: X86_FEATURE_FPU is 0! */ if ((i != XFEATURE_FP && !cid) || !boot_cpu_has(cid)) fpu_kernel_cfg.max_features &= ~BIT_ULL(i); } if (!cpu_feature_enabled(X86_FEATURE_XFD)) fpu_kernel_cfg.max_features &= ~XFEATURE_MASK_USER_DYNAMIC; if (!cpu_feature_enabled(X86_FEATURE_XSAVES)) fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED; else fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED | XFEATURE_MASK_SUPERVISOR_SUPPORTED; fpu_user_cfg.max_features = fpu_kernel_cfg.max_features; fpu_user_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED; /* * Now, given maximum feature set, determine default values by * applying default masks. */ fpu_kernel_cfg.default_features = fpu_kernel_cfg.max_features & host_default_mask(); fpu_user_cfg.default_features = fpu_user_cfg.max_features & host_default_mask(); guest_default_cfg.features = fpu_kernel_cfg.max_features & guest_default_mask(); /* Store it for paranoia check at the end */ xfeatures = fpu_kernel_cfg.max_features; /* * Initialize the default XFD state in initfp_state and enable the * dynamic sizing mechanism if dynamic states are available. The * static key cannot be enabled here because this runs before * jump_label_init(). This is delayed to an initcall. */ init_fpstate.xfd = fpu_user_cfg.max_features & XFEATURE_MASK_USER_DYNAMIC; /* Set up compaction feature bit */ if (cpu_feature_enabled(X86_FEATURE_XSAVEC) || cpu_feature_enabled(X86_FEATURE_XSAVES)) setup_force_cpu_cap(X86_FEATURE_XCOMPACTED); /* Enable xstate instructions to be able to continue with initialization: */ fpu__init_cpu_xstate(); /* Cache size, offset and flags for initialization */ setup_xstate_cache(); err = init_xstate_size(); if (err) goto out_disable; /* * Update info used for ptrace frames; use standard-format size and no * supervisor xstates: */ update_regset_xstate_info(fpu_user_cfg.max_size, fpu_user_cfg.max_features); /* * init_fpstate excludes dynamic states as they are large but init * state is zero. */ init_fpstate.size = fpu_kernel_cfg.default_size; init_fpstate.xfeatures = fpu_kernel_cfg.default_features; if (init_fpstate.size > sizeof(init_fpstate.regs)) { pr_warn("x86/fpu: init_fpstate buffer too small (%zu < %d)\n", sizeof(init_fpstate.regs), init_fpstate.size); goto out_disable; } setup_init_fpu_buf(); /* * Paranoia check whether something in the setup modified the * xfeatures mask. */ if (xfeatures != fpu_kernel_cfg.max_features) { pr_err("x86/fpu: xfeatures modified from 0x%016llx to 0x%016llx during init\n", xfeatures, fpu_kernel_cfg.max_features); goto out_disable; } /* * CPU capabilities initialization runs before FPU init. So * X86_FEATURE_OSXSAVE is not set. Now that XSAVE is completely * functional, set the feature bit so depending code works. */ setup_force_cpu_cap(X86_FEATURE_OSXSAVE); print_xstate_offset_size(); pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n", fpu_kernel_cfg.max_features, fpu_kernel_cfg.max_size, boot_cpu_has(X86_FEATURE_XCOMPACTED) ? "compacted" : "standard"); return; out_disable: /* something went wrong, try to boot without any XSAVE support */ fpu__init_disable_system_xstate(legacy_size); } /* * Restore minimal FPU state after suspend: */ void fpu__resume_cpu(void) { /* * Restore XCR0 on xsave capable CPUs: */ if (cpu_feature_enabled(X86_FEATURE_XSAVE)) xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features); /* * Restore IA32_XSS. The same CPUID bit enumerates support * of XSAVES and MSR_IA32_XSS. */ if (cpu_feature_enabled(X86_FEATURE_XSAVES)) { wrmsrq(MSR_IA32_XSS, xfeatures_mask_supervisor() | xfeatures_mask_independent()); } if (fpu_state_size_dynamic()) wrmsrq(MSR_IA32_XFD, x86_task_fpu(current)->fpstate->xfd); } /* * Given an xstate feature nr, calculate where in the xsave * buffer the state is. Callers should ensure that the buffer * is valid. */ static void *__raw_xsave_addr(struct xregs_state *xsave, int xfeature_nr) { u64 xcomp_bv = xsave->header.xcomp_bv; if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr))) return NULL; if (cpu_feature_enabled(X86_FEATURE_XCOMPACTED)) { if (WARN_ON_ONCE(!(xcomp_bv & BIT_ULL(xfeature_nr)))) return NULL; } return (void *)xsave + xfeature_get_offset(xcomp_bv, xfeature_nr); } /* * Given the xsave area and a state inside, this function returns the * address of the state. * * This is the API that is called to get xstate address in either * standard format or compacted format of xsave area. * * Note that if there is no data for the field in the xsave buffer * this will return NULL. * * Inputs: * xstate: the thread's storage area for all FPU data * xfeature_nr: state which is defined in xsave.h (e.g. XFEATURE_FP, * XFEATURE_SSE, etc...) * Output: * address of the state in the xsave area, or NULL if the * field is not present in the xsave buffer. */ void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr) { /* * Do we even *have* xsave state? */ if (!boot_cpu_has(X86_FEATURE_XSAVE)) return NULL; /* * We should not ever be requesting features that we * have not enabled. */ if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr))) return NULL; /* * This assumes the last 'xsave*' instruction to * have requested that 'xfeature_nr' be saved. * If it did not, we might be seeing and old value * of the field in the buffer. * * This can happen because the last 'xsave' did not * request that this feature be saved (unlikely) * or because the "init optimization" caused it * to not be saved. */ if (!(xsave->header.xfeatures & BIT_ULL(xfeature_nr))) return NULL; return __raw_xsave_addr(xsave, xfeature_nr); } EXPORT_SYMBOL_FOR_KVM(get_xsave_addr); /* * Given an xstate feature nr, calculate where in the xsave buffer the state is. * The xsave buffer should be in standard format, not compacted (e.g. user mode * signal frames). */ void __user *get_xsave_addr_user(struct xregs_state __user *xsave, int xfeature_nr) { if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr))) return NULL; return (void __user *)xsave + xstate_offsets[xfeature_nr]; } #ifdef CONFIG_ARCH_HAS_PKEYS /* * This will go out and modify PKRU register to set the access * rights for @pkey to @init_val. */ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long init_val) { u32 old_pkru, new_pkru_bits = 0; int pkey_shift; /* * This check implies XSAVE support. OSPKE only gets * set if we enable XSAVE and we enable PKU in XCR0. */ if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) return -EINVAL; /* * This code should only be called with valid 'pkey' * values originating from in-kernel users. Complain * if a bad value is observed. */ if (WARN_ON_ONCE(pkey >= arch_max_pkey())) return -EINVAL; /* Set the bits we need in PKRU: */ if (init_val & PKEY_DISABLE_ACCESS) new_pkru_bits |= PKRU_AD_BIT; if (init_val & PKEY_DISABLE_WRITE) new_pkru_bits |= PKRU_WD_BIT; /* Shift the bits in to the correct place in PKRU for pkey: */ pkey_shift = pkey * PKRU_BITS_PER_PKEY; new_pkru_bits <<= pkey_shift; /* Get old PKRU and mask off any old bits in place: */ old_pkru = read_pkru(); old_pkru &= ~((PKRU_AD_BIT|PKRU_WD_BIT) << pkey_shift); /* Write old part along with new part: */ write_pkru(old_pkru | new_pkru_bits); return 0; } #endif /* ! CONFIG_ARCH_HAS_PKEYS */ static void copy_feature(bool from_xstate, struct membuf *to, void *xstate, void *init_xstate, unsigned int size) { membuf_write(to, from_xstate ? xstate : init_xstate, size); } /** * __copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer * @to: membuf descriptor * @fpstate: The fpstate buffer from which to copy * @xfeatures: The mask of xfeatures to save (XSAVE mode only) * @pkru_val: The PKRU value to store in the PKRU component * @copy_mode: The requested copy mode * * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming * format, i.e. from the kernel internal hardware dependent storage format * to the requested @mode. UABI XSTATE is always uncompacted! * * It supports partial copy but @to.pos always starts from zero. */ void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate, u64 xfeatures, u32 pkru_val, enum xstate_copy_mode copy_mode) { const unsigned int off_mxcsr = offsetof(struct fxregs_state, mxcsr); struct xregs_state *xinit = &init_fpstate.regs.xsave; struct xregs_state *xsave = &fpstate->regs.xsave; unsigned int zerofrom, i, xfeature; struct xstate_header header; u64 mask; memset(&header, 0, sizeof(header)); header.xfeatures = xsave->header.xfeatures; /* Mask out the feature bits depending on copy mode */ switch (copy_mode) { case XSTATE_COPY_FP: header.xfeatures &= XFEATURE_MASK_FP; break; case XSTATE_COPY_FX: header.xfeatures &= XFEATURE_MASK_FP | XFEATURE_MASK_SSE; break; case XSTATE_COPY_XSAVE: header.xfeatures &= fpstate->user_xfeatures & xfeatures; break; } /* Copy FP state up to MXCSR */ copy_feature(header.xfeatures & XFEATURE_MASK_FP, &to, &xsave->i387, &xinit->i387, off_mxcsr); /* Copy MXCSR when SSE or YMM are set in the feature mask */ copy_feature(header.xfeatures & (XFEATURE_MASK_SSE | XFEATURE_MASK_YMM), &to, &xsave->i387.mxcsr, &xinit->i387.mxcsr, MXCSR_AND_FLAGS_SIZE); /* Copy the remaining FP state */ copy_feature(header.xfeatures & XFEATURE_MASK_FP, &to, &xsave->i387.st_space, &xinit->i387.st_space, sizeof(xsave->i387.st_space)); /* Copy the SSE state - shared with YMM, but independently managed */ copy_feature(header.xfeatures & XFEATURE_MASK_SSE, &to, &xsave->i387.xmm_space, &xinit->i387.xmm_space, sizeof(xsave->i387.xmm_space)); if (copy_mode != XSTATE_COPY_XSAVE) goto out; /* Zero the padding area */ membuf_zero(&to, sizeof(xsave->i387.padding)); /* Copy xsave->i387.sw_reserved */ membuf_write(&to, xstate_fx_sw_bytes, sizeof(xsave->i387.sw_reserved)); /* Copy the user space relevant state of @xsave->header */ membuf_write(&to, &header, sizeof(header)); zerofrom = offsetof(struct xregs_state, extended_state_area); /* * This 'mask' indicates which states to copy from fpstate. * Those extended states that are not present in fpstate are * either disabled or initialized: * * In non-compacted format, disabled features still occupy * state space but there is no state to copy from in the * compacted init_fpstate. The gap tracking will zero these * states. * * The extended features have an all zeroes init state. Thus, * remove them from 'mask' to zero those features in the user * buffer instead of retrieving them from init_fpstate. */ mask = header.xfeatures; for_each_extended_xfeature_in_order(i, mask) { xfeature = xfeature_uncompact_order[i]; /* * If there was a feature or alignment gap, zero the space * in the destination buffer. */ if (zerofrom < xstate_offsets[xfeature]) membuf_zero(&to, xstate_offsets[xfeature] - zerofrom); if (xfeature == XFEATURE_PKRU) { struct pkru_state pkru = {0}; /* * PKRU is not necessarily up to date in the * XSAVE buffer. Use the provided value. */ pkru.pkru = pkru_val; membuf_write(&to, &pkru, sizeof(pkru)); } else { membuf_write(&to, __raw_xsave_addr(xsave, xfeature), xstate_sizes[xfeature]); } /* * Keep track of the last copied state in the non-compacted * target buffer for gap zeroing. */ zerofrom = xstate_offsets[xfeature] + xstate_sizes[xfeature]; } out: if (to.left) membuf_zero(&to, to.left); } /** * copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer * @to: membuf descriptor * @tsk: The task from which to copy the saved xstate * @copy_mode: The requested copy mode * * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming * format, i.e. from the kernel internal hardware dependent storage format * to the requested @mode. UABI XSTATE is always uncompacted! * * It supports partial copy but @to.pos always starts from zero. */ void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk, enum xstate_copy_mode copy_mode) { __copy_xstate_to_uabi_buf(to, x86_task_fpu(tsk)->fpstate, x86_task_fpu(tsk)->fpstate->user_xfeatures, tsk->thread.pkru, copy_mode); } static int copy_from_buffer(void *dst, unsigned int offset, unsigned int size, const void *kbuf, const void __user *ubuf) { if (kbuf) { memcpy(dst, kbuf + offset, size); } else { if (copy_from_user(dst, ubuf + offset, size)) return -EFAULT; } return 0; } /** * copy_uabi_to_xstate - Copy a UABI format buffer to the kernel xstate * @fpstate: The fpstate buffer to copy to * @kbuf: The UABI format buffer, if it comes from the kernel * @ubuf: The UABI format buffer, if it comes from userspace * @pkru: The location to write the PKRU value to * * Converts from the UABI format into the kernel internal hardware * dependent format. * * This function ultimately has three different callers with distinct PKRU * behavior. * 1. When called from sigreturn the PKRU register will be restored from * @fpstate via an XRSTOR. Correctly copying the UABI format buffer to * @fpstate is sufficient to cover this case, but the caller will also * pass a pointer to the thread_struct's pkru field in @pkru and updating * it is harmless. * 2. When called from ptrace the PKRU register will be restored from the * thread_struct's pkru field. A pointer to that is passed in @pkru. * The kernel will restore it manually, so the XRSTOR behavior that resets * the PKRU register to the hardware init value (0) if the corresponding * xfeatures bit is not set is emulated here. * 3. When called from KVM the PKRU register will be restored from the vcpu's * pkru field. A pointer to that is passed in @pkru. KVM hasn't used * XRSTOR and hasn't had the PKRU resetting behavior described above. To * preserve that KVM behavior, it passes NULL for @pkru if the xfeatures * bit is not set. */ static int copy_uabi_to_xstate(struct fpstate *fpstate, const void *kbuf, const void __user *ubuf, u32 *pkru) { struct xregs_state *xsave = &fpstate->regs.xsave; unsigned int offset, size; struct xstate_header hdr; u64 mask; int i; offset = offsetof(struct xregs_state, header); if (copy_from_buffer(&hdr, offset, sizeof(hdr), kbuf, ubuf)) return -EFAULT; if (validate_user_xstate_header(&hdr, fpstate)) return -EINVAL; /* Validate MXCSR when any of the related features is in use */ mask = XFEATURE_MASK_FP | XFEATURE_MASK_SSE | XFEATURE_MASK_YMM; if (hdr.xfeatures & mask) { u32 mxcsr[2]; offset = offsetof(struct fxregs_state, mxcsr); if (copy_from_buffer(mxcsr, offset, sizeof(mxcsr), kbuf, ubuf)) return -EFAULT; /* Reserved bits in MXCSR must be zero. */ if (mxcsr[0] & ~mxcsr_feature_mask) return -EINVAL; /* SSE and YMM require MXCSR even when FP is not in use. */ if (!(hdr.xfeatures & XFEATURE_MASK_FP)) { xsave->i387.mxcsr = mxcsr[0]; xsave->i387.mxcsr_mask = mxcsr[1]; } } for (i = 0; i < XFEATURE_MAX; i++) { mask = BIT_ULL(i); if (hdr.xfeatures & mask) { void *dst = __raw_xsave_addr(xsave, i); offset = xstate_offsets[i]; size = xstate_sizes[i]; if (copy_from_buffer(dst, offset, size, kbuf, ubuf)) return -EFAULT; } } if (hdr.xfeatures & XFEATURE_MASK_PKRU) { struct pkru_state *xpkru; xpkru = __raw_xsave_addr(xsave, XFEATURE_PKRU); *pkru = xpkru->pkru; } else { /* * KVM may pass NULL here to indicate that it does not need * PKRU updated. */ if (pkru) *pkru = 0; } /* * The state that came in from userspace was user-state only. * Mask all the user states out of 'xfeatures': */ xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR_ALL; /* * Add back in the features that came in from userspace: */ xsave->header.xfeatures |= hdr.xfeatures; return 0; } /* * Convert from a ptrace standard-format kernel buffer to kernel XSAVE[S] * format and copy to the target thread. Used by ptrace and KVM. */ int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf, u32 *pkru) { return copy_uabi_to_xstate(fpstate, kbuf, NULL, pkru); } /* * Convert from a sigreturn standard-format user-space buffer to kernel * XSAVE[S] format and copy to the target thread. This is called from the * sigreturn() and rt_sigreturn() system calls. */ int copy_sigframe_from_user_to_xstate(struct task_struct *tsk, const void __user *ubuf) { return copy_uabi_to_xstate(x86_task_fpu(tsk)->fpstate, NULL, ubuf, &tsk->thread.pkru); } static bool validate_independent_components(u64 mask) { u64 xchk; if (WARN_ON_FPU(!cpu_feature_enabled(X86_FEATURE_XSAVES))) return false; xchk = ~xfeatures_mask_independent(); if (WARN_ON_ONCE(!mask || mask & xchk)) return false; return true; } /** * xsaves - Save selected components to a kernel xstate buffer * @xstate: Pointer to the buffer * @mask: Feature mask to select the components to save * * The @xstate buffer must be 64 byte aligned and correctly initialized as * XSAVES does not write the full xstate header. Before first use the * buffer should be zeroed otherwise a consecutive XRSTORS from that buffer * can #GP. * * The feature mask must be a subset of the independent features. */ void xsaves(struct xregs_state *xstate, u64 mask) { int err; if (!validate_independent_components(mask)) return; XSTATE_OP(XSAVES, xstate, (u32)mask, (u32)(mask >> 32), err); WARN_ON_ONCE(err); } /** * xrstors - Restore selected components from a kernel xstate buffer * @xstate: Pointer to the buffer * @mask: Feature mask to select the components to restore * * The @xstate buffer must be 64 byte aligned and correctly initialized * otherwise XRSTORS from that buffer can #GP. * * Proper usage is to restore the state which was saved with * xsaves() into @xstate. * * The feature mask must be a subset of the independent features. */ void xrstors(struct xregs_state *xstate, u64 mask) { int err; if (!validate_independent_components(mask)) return; XSTATE_OP(XRSTORS, xstate, (u32)mask, (u32)(mask >> 32), err); WARN_ON_ONCE(err); } #if IS_ENABLED(CONFIG_KVM) void fpstate_clear_xstate_component(struct fpstate *fpstate, unsigned int xfeature) { void *addr = get_xsave_addr(&fpstate->regs.xsave, xfeature); if (addr) memset(addr, 0, xstate_sizes[xfeature]); } EXPORT_SYMBOL_FOR_KVM(fpstate_clear_xstate_component); #endif #ifdef CONFIG_X86_64 #ifdef CONFIG_X86_DEBUG_FPU /* * Ensure that a subsequent XSAVE* or XRSTOR* instruction with RFBM=@mask * can safely operate on the @fpstate buffer. */ static bool xstate_op_valid(struct fpstate *fpstate, u64 mask, bool rstor) { u64 xfd = __this_cpu_read(xfd_state); if (fpstate->xfd == xfd) return true; /* * The XFD MSR does not match fpstate->xfd. That's invalid when * the passed in fpstate is current's fpstate. */ if (fpstate->xfd == x86_task_fpu(current)->fpstate->xfd) return false; /* * XRSTOR(S) from init_fpstate are always correct as it will just * bring all components into init state and not read from the * buffer. XSAVE(S) raises #PF after init. */ if (fpstate == &init_fpstate) return rstor; /* * XSAVE(S): clone(), fpu_swap_kvm_fpstate() * XRSTORS(S): fpu_swap_kvm_fpstate() */ /* * No XSAVE/XRSTOR instructions (except XSAVE itself) touch * the buffer area for XFD-disabled state components. */ mask &= ~xfd; /* * Remove features which are valid in fpstate. They * have space allocated in fpstate. */ mask &= ~fpstate->xfeatures; /* * Any remaining state components in 'mask' might be written * by XSAVE/XRSTOR. Fail validation it found. */ return !mask; } void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor) { WARN_ON_ONCE(!xstate_op_valid(fpstate, mask, rstor)); } #endif /* CONFIG_X86_DEBUG_FPU */ static int __init xfd_update_static_branch(void) { /* * If init_fpstate.xfd has bits set then dynamic features are * available and the dynamic sizing must be enabled. */ if (init_fpstate.xfd) static_branch_enable(&__fpu_state_size_dynamic); return 0; } arch_initcall(xfd_update_static_branch) void fpstate_free(struct fpu *fpu) { if (fpu->fpstate && fpu->fpstate != &fpu->__fpstate) vfree(fpu->fpstate); } /** * fpstate_realloc - Reallocate struct fpstate for the requested new features * * @xfeatures: A bitmap of xstate features which extend the enabled features * of that task * @ksize: The required size for the kernel buffer * @usize: The required size for user space buffers * @guest_fpu: Pointer to a guest FPU container. NULL for host allocations * * Note vs. vmalloc(): If the task with a vzalloc()-allocated buffer * terminates quickly, vfree()-induced IPIs may be a concern, but tasks * with large states are likely to live longer. * * Returns: 0 on success, -ENOMEM on allocation error. */ static int fpstate_realloc(u64 xfeatures, unsigned int ksize, unsigned int usize, struct fpu_guest *guest_fpu) { struct fpu *fpu = x86_task_fpu(current); struct fpstate *curfps, *newfps = NULL; unsigned int fpsize; bool in_use; fpsize = ksize + ALIGN(offsetof(struct fpstate, regs), 64); newfps = vzalloc(fpsize); if (!newfps) return -ENOMEM; newfps->size = ksize; newfps->user_size = usize; newfps->is_valloc = true; /* * When a guest FPU is supplied, use @guest_fpu->fpstate * as reference independent whether it is in use or not. */ curfps = guest_fpu ? guest_fpu->fpstate : fpu->fpstate; /* Determine whether @curfps is the active fpstate */ in_use = fpu->fpstate == curfps; if (guest_fpu) { newfps->is_guest = true; newfps->is_confidential = curfps->is_confidential; newfps->in_use = curfps->in_use; guest_fpu->xfeatures |= xfeatures; guest_fpu->uabi_size = usize; } fpregs_lock(); /* * If @curfps is in use, ensure that the current state is in the * registers before swapping fpstate as that might invalidate it * due to layout changes. */ if (in_use && test_thread_flag(TIF_NEED_FPU_LOAD)) fpregs_restore_userregs(); newfps->xfeatures = curfps->xfeatures | xfeatures; newfps->user_xfeatures = curfps->user_xfeatures | xfeatures; newfps->xfd = curfps->xfd & ~xfeatures; /* Do the final updates within the locked region */ xstate_init_xcomp_bv(&newfps->regs.xsave, newfps->xfeatures); if (guest_fpu) { guest_fpu->fpstate = newfps; /* If curfps is active, update the FPU fpstate pointer */ if (in_use) fpu->fpstate = newfps; } else { fpu->fpstate = newfps; } if (in_use) xfd_update_state(fpu->fpstate); fpregs_unlock(); /* Only free valloc'ed state */ if (curfps && curfps->is_valloc) vfree(curfps); return 0; } static int validate_sigaltstack(unsigned int usize) { struct task_struct *thread, *leader = current->group_leader; unsigned long framesize = get_sigframe_size(); lockdep_assert_held(&current->sighand->siglock); /* get_sigframe_size() is based on fpu_user_cfg.max_size */ framesize -= fpu_user_cfg.max_size; framesize += usize; for_each_thread(leader, thread) { if (thread->sas_ss_size && thread->sas_ss_size < framesize) return -ENOSPC; } return 0; } static int __xstate_request_perm(u64 permitted, u64 requested, bool guest) { /* * This deliberately does not exclude !XSAVES as we still might * decide to optionally context switch XCR0 or talk the silicon * vendors into extending XFD for the pre AMX states, especially * AVX512. */ bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED); struct fpu *fpu = x86_task_fpu(current->group_leader); struct fpu_state_perm *perm; unsigned int ksize, usize; u64 mask; int ret = 0; /* Check whether fully enabled */ if ((permitted & requested) == requested) return 0; /* * Calculate the resulting kernel state size. Note, @permitted also * contains supervisor xfeatures even though supervisor are always * permitted for kernel and guest FPUs, and never permitted for user * FPUs. */ mask = permitted | requested; ksize = xstate_calculate_size(mask, compacted); /* * Calculate the resulting user state size. Take care not to clobber * the supervisor xfeatures in the new mask! */ usize = xstate_calculate_size(mask & XFEATURE_MASK_USER_SUPPORTED, false); if (!guest) { ret = validate_sigaltstack(usize); if (ret) return ret; } perm = guest ? &fpu->guest_perm : &fpu->perm; /* Pairs with the READ_ONCE() in xstate_get_group_perm() */ WRITE_ONCE(perm->__state_perm, mask); /* Protected by sighand lock */ perm->__state_size = ksize; perm->__user_state_size = usize; return ret; } /* * Permissions array to map facilities with more than one component */ static const u64 xstate_prctl_req[XFEATURE_MAX] = { [XFEATURE_XTILE_DATA] = XFEATURE_MASK_XTILE_DATA, }; static int xstate_request_perm(unsigned long idx, bool guest) { u64 permitted, requested; int ret; if (idx >= XFEATURE_MAX) return -EINVAL; /* * Look up the facility mask which can require more than * one xstate component. */ idx = array_index_nospec(idx, ARRAY_SIZE(xstate_prctl_req)); requested = xstate_prctl_req[idx]; if (!requested) return -EOPNOTSUPP; if ((fpu_user_cfg.max_features & requested) != requested) return -EOPNOTSUPP; /* Lockless quick check */ permitted = xstate_get_group_perm(guest); if ((permitted & requested) == requested) return 0; /* Protect against concurrent modifications */ spin_lock_irq(&current->sighand->siglock); permitted = xstate_get_group_perm(guest); /* First vCPU allocation locks the permissions. */ if (guest && (permitted & FPU_GUEST_PERM_LOCKED)) ret = -EBUSY; else ret = __xstate_request_perm(permitted, requested, guest); spin_unlock_irq(&current->sighand->siglock); return ret; } int __xfd_enable_feature(u64 xfd_err, struct fpu_guest *guest_fpu) { u64 xfd_event = xfd_err & XFEATURE_MASK_USER_DYNAMIC; struct fpu_state_perm *perm; unsigned int ksize, usize; struct fpu *fpu; if (!xfd_event) { if (!guest_fpu) pr_err_once("XFD: Invalid xfd error: %016llx\n", xfd_err); return 0; } /* Protect against concurrent modifications */ spin_lock_irq(&current->sighand->siglock); /* If not permitted let it die */ if ((xstate_get_group_perm(!!guest_fpu) & xfd_event) != xfd_event) { spin_unlock_irq(&current->sighand->siglock); return -EPERM; } fpu = x86_task_fpu(current->group_leader); perm = guest_fpu ? &fpu->guest_perm : &fpu->perm; ksize = perm->__state_size; usize = perm->__user_state_size; /* * The feature is permitted. State size is sufficient. Dropping * the lock is safe here even if more features are added from * another task, the retrieved buffer sizes are valid for the * currently requested feature(s). */ spin_unlock_irq(&current->sighand->siglock); /* * Try to allocate a new fpstate. If that fails there is no way * out. */ if (fpstate_realloc(xfd_event, ksize, usize, guest_fpu)) return -EFAULT; return 0; } int xfd_enable_feature(u64 xfd_err) { return __xfd_enable_feature(xfd_err, NULL); } #else /* CONFIG_X86_64 */ static inline int xstate_request_perm(unsigned long idx, bool guest) { return -EPERM; } #endif /* !CONFIG_X86_64 */ u64 xstate_get_guest_group_perm(void) { return xstate_get_group_perm(true); } EXPORT_SYMBOL_FOR_KVM(xstate_get_guest_group_perm); /** * fpu_xstate_prctl - xstate permission operations * @option: A subfunction of arch_prctl() * @arg2: option argument * Return: 0 if successful; otherwise, an error code * * Option arguments: * * ARCH_GET_XCOMP_SUPP: Pointer to user space u64 to store the info * ARCH_GET_XCOMP_PERM: Pointer to user space u64 to store the info * ARCH_REQ_XCOMP_PERM: Facility number requested * * For facilities which require more than one XSTATE component, the request * must be the highest state component number related to that facility, * e.g. for AMX which requires XFEATURE_XTILE_CFG(17) and * XFEATURE_XTILE_DATA(18) this would be XFEATURE_XTILE_DATA(18). */ long fpu_xstate_prctl(int option, unsigned long arg2) { u64 __user *uptr = (u64 __user *)arg2; u64 permitted, supported; unsigned long idx = arg2; bool guest = false; switch (option) { case ARCH_GET_XCOMP_SUPP: supported = fpu_user_cfg.max_features | fpu_user_cfg.legacy_features; return put_user(supported, uptr); case ARCH_GET_XCOMP_PERM: /* * Lockless snapshot as it can also change right after the * dropping the lock. */ permitted = xstate_get_host_group_perm(); permitted &= XFEATURE_MASK_USER_SUPPORTED; return put_user(permitted, uptr); case ARCH_GET_XCOMP_GUEST_PERM: permitted = xstate_get_guest_group_perm(); permitted &= XFEATURE_MASK_USER_SUPPORTED; return put_user(permitted, uptr); case ARCH_REQ_XCOMP_GUEST_PERM: guest = true; fallthrough; case ARCH_REQ_XCOMP_PERM: if (!IS_ENABLED(CONFIG_X86_64)) return -EOPNOTSUPP; return xstate_request_perm(idx, guest); default: return -EINVAL; } } #ifdef CONFIG_PROC_PID_ARCH_STATUS /* * Report the amount of time elapsed in millisecond since last AVX512 * use in the task. Report -1 if no AVX-512 usage. */ static void avx512_status(struct seq_file *m, struct task_struct *task) { unsigned long timestamp; long delta = -1; /* AVX-512 usage is not tracked for kernel threads. Don't report anything. */ if (task->flags & (PF_KTHREAD | PF_USER_WORKER)) return; timestamp = READ_ONCE(x86_task_fpu(task)->avx512_timestamp); if (timestamp) { delta = (long)(jiffies - timestamp); /* * Cap to LONG_MAX if time difference > LONG_MAX */ if (delta < 0) delta = LONG_MAX; delta = jiffies_to_msecs(delta); } seq_put_decimal_ll(m, "AVX512_elapsed_ms:\t", delta); seq_putc(m, '\n'); } /* * Report architecture specific information */ int proc_pid_arch_status(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { /* * Report AVX512 state if the processor and build option supported. */ if (cpu_feature_enabled(X86_FEATURE_AVX512F)) avx512_status(m, task); return 0; } #endif /* CONFIG_PROC_PID_ARCH_STATUS */ #ifdef CONFIG_COREDUMP static const char owner_name[] = "LINUX"; /* * Dump type, size, offset and flag values for every xfeature that is present. */ static int dump_xsave_layout_desc(struct coredump_params *cprm) { int num_records = 0; int i; for_each_extended_xfeature(i, fpu_user_cfg.max_features) { struct x86_xfeat_component xc = { .type = i, .size = xstate_sizes[i], .offset = xstate_offsets[i], /* reserved for future use */ .flags = 0, }; if (!dump_emit(cprm, &xc, sizeof(xc))) return -1; num_records++; } return num_records; } static u32 get_xsave_desc_size(void) { u32 cnt = 0; u32 i; for_each_extended_xfeature(i, fpu_user_cfg.max_features) cnt++; return cnt * (sizeof(struct x86_xfeat_component)); } int elf_coredump_extra_notes_write(struct coredump_params *cprm) { int num_records = 0; struct elf_note en; if (!fpu_user_cfg.max_features) return 0; en.n_namesz = sizeof(owner_name); en.n_descsz = get_xsave_desc_size(); en.n_type = NT_X86_XSAVE_LAYOUT; if (!dump_emit(cprm, &en, sizeof(en))) return 1; if (!dump_emit(cprm, owner_name, en.n_namesz)) return 1; if (!dump_align(cprm, 4)) return 1; num_records = dump_xsave_layout_desc(cprm); if (num_records < 0) return 1; /* Total size should be equal to the number of records */ if ((sizeof(struct x86_xfeat_component) * num_records) != en.n_descsz) return 1; return 0; } int elf_coredump_extra_notes_size(void) { int size; if (!fpu_user_cfg.max_features) return 0; /* .note header */ size = sizeof(struct elf_note); /* Name plus alignment to 4 bytes */ size += roundup(sizeof(owner_name), 4); size += get_xsave_desc_size(); return size; } #endif /* CONFIG_COREDUMP */
1 1 619 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/bad_inode.c * * Copyright (C) 1997, Stephen Tweedie * * Provide stub functions for unreadable inodes * * Fabian Frederick : August 2003 - All file operations assigned to EIO */ #include <linux/fs.h> #include <linux/export.h> #include <linux/stat.h> #include <linux/time.h> #include <linux/namei.h> #include <linux/poll.h> #include <linux/fiemap.h> static int bad_file_open(struct inode *inode, struct file *filp) { return -EIO; } static const struct file_operations bad_file_ops = { .open = bad_file_open, }; static int bad_inode_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { return -EIO; } static struct dentry *bad_inode_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { return ERR_PTR(-EIO); } static int bad_inode_link (struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { return -EIO; } static int bad_inode_unlink(struct inode *dir, struct dentry *dentry) { return -EIO; } static int bad_inode_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { return -EIO; } static struct dentry *bad_inode_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { return ERR_PTR(-EIO); } static int bad_inode_rmdir (struct inode *dir, struct dentry *dentry) { return -EIO; } static int bad_inode_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { return -EIO; } static int bad_inode_rename2(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { return -EIO; } static int bad_inode_readlink(struct dentry *dentry, char __user *buffer, int buflen) { return -EIO; } static int bad_inode_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { return -EIO; } static int bad_inode_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { return -EIO; } static int bad_inode_setattr(struct mnt_idmap *idmap, struct dentry *direntry, struct iattr *attrs) { return -EIO; } static ssize_t bad_inode_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) { return -EIO; } static const char *bad_inode_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { return ERR_PTR(-EIO); } static struct posix_acl *bad_inode_get_acl(struct inode *inode, int type, bool rcu) { return ERR_PTR(-EIO); } static int bad_inode_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { return -EIO; } static int bad_inode_update_time(struct inode *inode, int flags) { return -EIO; } static int bad_inode_atomic_open(struct inode *inode, struct dentry *dentry, struct file *file, unsigned int open_flag, umode_t create_mode) { return -EIO; } static int bad_inode_tmpfile(struct mnt_idmap *idmap, struct inode *inode, struct file *file, umode_t mode) { return -EIO; } static int bad_inode_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { return -EIO; } static const struct inode_operations bad_inode_ops = { .create = bad_inode_create, .lookup = bad_inode_lookup, .link = bad_inode_link, .unlink = bad_inode_unlink, .symlink = bad_inode_symlink, .mkdir = bad_inode_mkdir, .rmdir = bad_inode_rmdir, .mknod = bad_inode_mknod, .rename = bad_inode_rename2, .readlink = bad_inode_readlink, .permission = bad_inode_permission, .getattr = bad_inode_getattr, .setattr = bad_inode_setattr, .listxattr = bad_inode_listxattr, .get_link = bad_inode_get_link, .get_inode_acl = bad_inode_get_acl, .fiemap = bad_inode_fiemap, .update_time = bad_inode_update_time, .atomic_open = bad_inode_atomic_open, .tmpfile = bad_inode_tmpfile, .set_acl = bad_inode_set_acl, }; /* * When a filesystem is unable to read an inode due to an I/O error in * its read_inode() function, it can call make_bad_inode() to return a * set of stubs which will return EIO errors as required. * * We only need to do limited initialisation: all other fields are * preinitialised to zero automatically. */ /** * make_bad_inode - mark an inode bad due to an I/O error * @inode: Inode to mark bad * * When an inode cannot be read due to a media or remote network * failure this function makes the inode "bad" and causes I/O operations * on it to fail from this point on. */ void make_bad_inode(struct inode *inode) { remove_inode_hash(inode); inode->i_mode = S_IFREG; simple_inode_init_ts(inode); inode->i_op = &bad_inode_ops; inode->i_opflags &= ~IOP_XATTR; inode->i_fop = &bad_file_ops; } EXPORT_SYMBOL(make_bad_inode); /* * This tests whether an inode has been flagged as bad. The test uses * &bad_inode_ops to cover the case of invalidated inodes as well as * those created by make_bad_inode() above. */ /** * is_bad_inode - is an inode errored * @inode: inode to test * * Returns true if the inode in question has been marked as bad. */ bool is_bad_inode(struct inode *inode) { return (inode->i_op == &bad_inode_ops); } EXPORT_SYMBOL(is_bad_inode); /** * iget_failed - Mark an under-construction inode as dead and release it * @inode: The inode to discard * * Mark an under-construction inode as dead and release it. */ void iget_failed(struct inode *inode) { make_bad_inode(inode); unlock_new_inode(inode); iput(inode); } EXPORT_SYMBOL(iget_failed);
1 1 1 1 1 1 1 3 3 3 2 1 11 11 1 10 2 8 1 7 1 1 6 1 5 1 4 3 1 1 10 23 10 10 8 1 8 2 6 3 1 1 23 40 39 39 39 39 1 8 8 1 7 5 7 40 37 28 9 9 9 4 2 2 6 6 3 2 10 7 1 7 1 6 6 6 6 2 6 9 20 17 20 5 7 17 20 20 17 5 4 3 2 2 2 5 3 3 3 3 2 3 2 1 3 11 10 9 2 7 6 5 5 7 4 3 1 1 1 4 260 213 98 65 33 8 8 1 7 88 4 35 118 25 186 3 3 3 3 3 3 3 3 2 3 3 2 4 1 5 5 3 2 3 2 3 3 2 5 1 262 4 3 3 2 5 2 1 3 3 2 1 2 4 2 2 2 1 3 39 6 4 3 43 103 17 16 23 13 6 2 1 5 1 5 5 5 13 1 4 2 1 3 1 1 5 4 1 3 2 1 1 9 3 8 6 4 3 1 2 5 2 1 1 1 3 3 1 2 8 7 6 1 5 5 1 4 10 9 8 2 6 5 2 4 4 8 4 5 4 3 2 1 1 3 4 4 3 10 23 13 12 138 38 9 5 262 261 260 57 58 3 2 1 3 137 137 128 136 129 125 126 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 2 4 4 4 4 2 4 2 2 2 1 2 2 1 2 3 2 2 2 1 2 2 2 4 1 1 1 1 8 1 90 7 89 4 3 2 4 3 3 2 80 31 3 3 1 3 27 27 28 7 7 137 136 82 81 80 81 13 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 // SPDX-License-Identifier: GPL-2.0 /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * The IP to API glue. * * Authors: see ip.c * * Fixes: * Many : Split from ip.c , see ip.c for history. * Martin Mares : TOS setting fixed. * Alan Cox : Fixed a couple of oopses in Martin's * TOS tweaks. * Mike McLagan : Routing by source */ #include <linux/module.h> #include <linux/types.h> #include <linux/mm.h> #include <linux/skbuff.h> #include <linux/ip.h> #include <linux/icmp.h> #include <linux/inetdevice.h> #include <linux/netdevice.h> #include <linux/slab.h> #include <net/sock.h> #include <net/ip.h> #include <net/icmp.h> #include <net/tcp_states.h> #include <linux/udp.h> #include <linux/igmp.h> #include <linux/netfilter.h> #include <linux/route.h> #include <linux/mroute.h> #include <net/inet_ecn.h> #include <net/route.h> #include <net/xfrm.h> #include <net/compat.h> #include <net/checksum.h> #if IS_ENABLED(CONFIG_IPV6) #include <net/transp_v6.h> #endif #include <net/ip_fib.h> #include <linux/errqueue.h> #include <linux/uaccess.h> /* * SOL_IP control messages. */ static void ip_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb) { struct in_pktinfo info = *PKTINFO_SKB_CB(skb); info.ipi_addr.s_addr = ip_hdr(skb)->daddr; put_cmsg(msg, SOL_IP, IP_PKTINFO, sizeof(info), &info); } static void ip_cmsg_recv_ttl(struct msghdr *msg, struct sk_buff *skb) { int ttl = ip_hdr(skb)->ttl; put_cmsg(msg, SOL_IP, IP_TTL, sizeof(int), &ttl); } static void ip_cmsg_recv_tos(struct msghdr *msg, struct sk_buff *skb) { put_cmsg(msg, SOL_IP, IP_TOS, 1, &ip_hdr(skb)->tos); } static void ip_cmsg_recv_opts(struct msghdr *msg, struct sk_buff *skb) { if (IPCB(skb)->opt.optlen == 0) return; put_cmsg(msg, SOL_IP, IP_RECVOPTS, IPCB(skb)->opt.optlen, ip_hdr(skb) + 1); } static void ip_cmsg_recv_retopts(struct net *net, struct msghdr *msg, struct sk_buff *skb) { unsigned char optbuf[sizeof(struct ip_options) + 40]; struct ip_options *opt = (struct ip_options *)optbuf; if (IPCB(skb)->opt.optlen == 0) return; if (ip_options_echo(net, opt, skb)) { msg->msg_flags |= MSG_CTRUNC; return; } ip_options_undo(opt); put_cmsg(msg, SOL_IP, IP_RETOPTS, opt->optlen, opt->__data); } static void ip_cmsg_recv_fragsize(struct msghdr *msg, struct sk_buff *skb) { int val; if (IPCB(skb)->frag_max_size == 0) return; val = IPCB(skb)->frag_max_size; put_cmsg(msg, SOL_IP, IP_RECVFRAGSIZE, sizeof(val), &val); } static void ip_cmsg_recv_checksum(struct msghdr *msg, struct sk_buff *skb, int tlen, int offset) { __wsum csum = skb->csum; if (skb->ip_summed != CHECKSUM_COMPLETE) return; if (offset != 0) { int tend_off = skb_transport_offset(skb) + tlen; csum = csum_sub(csum, skb_checksum(skb, tend_off, offset, 0)); } put_cmsg(msg, SOL_IP, IP_CHECKSUM, sizeof(__wsum), &csum); } static void ip_cmsg_recv_security(struct msghdr *msg, struct sk_buff *skb) { struct lsm_context ctx; u32 secid; int err; err = security_socket_getpeersec_dgram(NULL, skb, &secid); if (err) return; err = security_secid_to_secctx(secid, &ctx); if (err < 0) return; put_cmsg(msg, SOL_IP, SCM_SECURITY, ctx.len, ctx.context); security_release_secctx(&ctx); } static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb) { __be16 _ports[2], *ports; struct sockaddr_in sin; /* All current transport protocols have the port numbers in the * first four bytes of the transport header and this function is * written with this assumption in mind. */ ports = skb_header_pointer(skb, skb_transport_offset(skb), sizeof(_ports), &_ports); if (!ports) return; sin.sin_family = AF_INET; sin.sin_addr.s_addr = ip_hdr(skb)->daddr; sin.sin_port = ports[1]; memset(sin.sin_zero, 0, sizeof(sin.sin_zero)); put_cmsg(msg, SOL_IP, IP_ORIGDSTADDR, sizeof(sin), &sin); } void ip_cmsg_recv_offset(struct msghdr *msg, struct sock *sk, struct sk_buff *skb, int tlen, int offset) { unsigned long flags = inet_cmsg_flags(inet_sk(sk)); if (!flags) return; /* Ordered by supposed usage frequency */ if (flags & IP_CMSG_PKTINFO) { ip_cmsg_recv_pktinfo(msg, skb); flags &= ~IP_CMSG_PKTINFO; if (!flags) return; } if (flags & IP_CMSG_TTL) { ip_cmsg_recv_ttl(msg, skb); flags &= ~IP_CMSG_TTL; if (!flags) return; } if (flags & IP_CMSG_TOS) { ip_cmsg_recv_tos(msg, skb); flags &= ~IP_CMSG_TOS; if (!flags) return; } if (flags & IP_CMSG_RECVOPTS) { ip_cmsg_recv_opts(msg, skb); flags &= ~IP_CMSG_RECVOPTS; if (!flags) return; } if (flags & IP_CMSG_RETOPTS) { ip_cmsg_recv_retopts(sock_net(sk), msg, skb); flags &= ~IP_CMSG_RETOPTS; if (!flags) return; } if (flags & IP_CMSG_PASSSEC) { ip_cmsg_recv_security(msg, skb); flags &= ~IP_CMSG_PASSSEC; if (!flags) return; } if (flags & IP_CMSG_ORIGDSTADDR) { ip_cmsg_recv_dstaddr(msg, skb); flags &= ~IP_CMSG_ORIGDSTADDR; if (!flags) return; } if (flags & IP_CMSG_CHECKSUM) ip_cmsg_recv_checksum(msg, skb, tlen, offset); if (flags & IP_CMSG_RECVFRAGSIZE) ip_cmsg_recv_fragsize(msg, skb); } EXPORT_SYMBOL(ip_cmsg_recv_offset); int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc, bool allow_ipv6) { int err, val; struct cmsghdr *cmsg; struct net *net = sock_net(sk); for_each_cmsghdr(cmsg, msg) { if (!CMSG_OK(msg, cmsg)) return -EINVAL; #if IS_ENABLED(CONFIG_IPV6) if (allow_ipv6 && cmsg->cmsg_level == SOL_IPV6 && cmsg->cmsg_type == IPV6_PKTINFO) { struct in6_pktinfo *src_info; if (cmsg->cmsg_len < CMSG_LEN(sizeof(*src_info))) return -EINVAL; src_info = (struct in6_pktinfo *)CMSG_DATA(cmsg); if (!ipv6_addr_v4mapped(&src_info->ipi6_addr)) return -EINVAL; if (src_info->ipi6_ifindex) ipc->oif = src_info->ipi6_ifindex; ipc->addr = src_info->ipi6_addr.s6_addr32[3]; continue; } #endif if (cmsg->cmsg_level == SOL_SOCKET) { err = __sock_cmsg_send(sk, cmsg, &ipc->sockc); if (err) return err; continue; } if (cmsg->cmsg_level != SOL_IP) continue; switch (cmsg->cmsg_type) { case IP_RETOPTS: err = cmsg->cmsg_len - sizeof(struct cmsghdr); /* Our caller is responsible for freeing ipc->opt */ err = ip_options_get(net, &ipc->opt, KERNEL_SOCKPTR(CMSG_DATA(cmsg)), err < 40 ? err : 40); if (err) return err; break; case IP_PKTINFO: { struct in_pktinfo *info; if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct in_pktinfo))) return -EINVAL; info = (struct in_pktinfo *)CMSG_DATA(cmsg); if (info->ipi_ifindex) ipc->oif = info->ipi_ifindex; ipc->addr = info->ipi_spec_dst.s_addr; break; } case IP_TTL: if (cmsg->cmsg_len != CMSG_LEN(sizeof(int))) return -EINVAL; val = *(int *)CMSG_DATA(cmsg); if (val < 1 || val > 255) return -EINVAL; ipc->ttl = val; break; case IP_TOS: if (cmsg->cmsg_len == CMSG_LEN(sizeof(int))) val = *(int *)CMSG_DATA(cmsg); else if (cmsg->cmsg_len == CMSG_LEN(sizeof(u8))) val = *(u8 *)CMSG_DATA(cmsg); else return -EINVAL; if (val < 0 || val > 255) return -EINVAL; ipc->tos = val; ipc->sockc.priority = rt_tos2priority(ipc->tos); break; case IP_PROTOCOL: if (cmsg->cmsg_len != CMSG_LEN(sizeof(int))) return -EINVAL; val = *(int *)CMSG_DATA(cmsg); if (val < 1 || val > 255) return -EINVAL; ipc->protocol = val; break; default: return -EINVAL; } } return 0; } static void ip_ra_destroy_rcu(struct rcu_head *head) { struct ip_ra_chain *ra = container_of(head, struct ip_ra_chain, rcu); sock_put(ra->saved_sk); kfree(ra); } int ip_ra_control(struct sock *sk, unsigned char on, void (*destructor)(struct sock *)) { struct ip_ra_chain *ra, *new_ra; struct ip_ra_chain __rcu **rap; struct net *net = sock_net(sk); if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num == IPPROTO_RAW) return -EINVAL; new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL; if (on && !new_ra) return -ENOMEM; mutex_lock(&net->ipv4.ra_mutex); for (rap = &net->ipv4.ra_chain; (ra = rcu_dereference_protected(*rap, lockdep_is_held(&net->ipv4.ra_mutex))) != NULL; rap = &ra->next) { if (ra->sk == sk) { if (on) { mutex_unlock(&net->ipv4.ra_mutex); kfree(new_ra); return -EADDRINUSE; } /* dont let ip_call_ra_chain() use sk again */ ra->sk = NULL; RCU_INIT_POINTER(*rap, ra->next); mutex_unlock(&net->ipv4.ra_mutex); if (ra->destructor) ra->destructor(sk); /* * Delay sock_put(sk) and kfree(ra) after one rcu grace * period. This guarantee ip_call_ra_chain() dont need * to mess with socket refcounts. */ ra->saved_sk = sk; call_rcu(&ra->rcu, ip_ra_destroy_rcu); return 0; } } if (!new_ra) { mutex_unlock(&net->ipv4.ra_mutex); return -ENOBUFS; } new_ra->sk = sk; new_ra->destructor = destructor; RCU_INIT_POINTER(new_ra->next, ra); rcu_assign_pointer(*rap, new_ra); sock_hold(sk); mutex_unlock(&net->ipv4.ra_mutex); return 0; } static void ipv4_icmp_error_rfc4884(const struct sk_buff *skb, struct sock_ee_data_rfc4884 *out) { switch (icmp_hdr(skb)->type) { case ICMP_DEST_UNREACH: case ICMP_TIME_EXCEEDED: case ICMP_PARAMETERPROB: ip_icmp_error_rfc4884(skb, out, sizeof(struct icmphdr), icmp_hdr(skb)->un.reserved[1] * 4); } } void ip_icmp_error(struct sock *sk, struct sk_buff *skb, int err, __be16 port, u32 info, u8 *payload) { struct sock_exterr_skb *serr; skb = skb_clone(skb, GFP_ATOMIC); if (!skb) return; serr = SKB_EXT_ERR(skb); serr->ee.ee_errno = err; serr->ee.ee_origin = SO_EE_ORIGIN_ICMP; serr->ee.ee_type = icmp_hdr(skb)->type; serr->ee.ee_code = icmp_hdr(skb)->code; serr->ee.ee_pad = 0; serr->ee.ee_info = info; serr->ee.ee_data = 0; serr->addr_offset = (u8 *)&(((struct iphdr *)(icmp_hdr(skb) + 1))->daddr) - skb_network_header(skb); serr->port = port; if (skb_pull(skb, payload - skb->data)) { if (inet_test_bit(RECVERR_RFC4884, sk)) ipv4_icmp_error_rfc4884(skb, &serr->ee.ee_rfc4884); skb_reset_transport_header(skb); if (sock_queue_err_skb(sk, skb) == 0) return; } kfree_skb(skb); } EXPORT_SYMBOL_GPL(ip_icmp_error); void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 port, u32 info) { struct sock_exterr_skb *serr; struct iphdr *iph; struct sk_buff *skb; if (!inet_test_bit(RECVERR, sk)) return; skb = alloc_skb(sizeof(struct iphdr), GFP_ATOMIC); if (!skb) return; skb_put(skb, sizeof(struct iphdr)); skb_reset_network_header(skb); iph = ip_hdr(skb); iph->daddr = daddr; serr = SKB_EXT_ERR(skb); serr->ee.ee_errno = err; serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL; serr->ee.ee_type = 0; serr->ee.ee_code = 0; serr->ee.ee_pad = 0; serr->ee.ee_info = info; serr->ee.ee_data = 0; serr->addr_offset = (u8 *)&iph->daddr - skb_network_header(skb); serr->port = port; __skb_pull(skb, skb_tail_pointer(skb) - skb->data); skb_reset_transport_header(skb); if (sock_queue_err_skb(sk, skb)) kfree_skb(skb); } /* For some errors we have valid addr_offset even with zero payload and * zero port. Also, addr_offset should be supported if port is set. */ static inline bool ipv4_datagram_support_addr(struct sock_exterr_skb *serr) { return serr->ee.ee_origin == SO_EE_ORIGIN_ICMP || serr->ee.ee_origin == SO_EE_ORIGIN_LOCAL || serr->port; } /* IPv4 supports cmsg on all imcp errors and some timestamps * * Timestamp code paths do not initialize the fields expected by cmsg: * the PKTINFO fields in skb->cb[]. Fill those in here. */ static bool ipv4_datagram_support_cmsg(const struct sock *sk, struct sk_buff *skb, int ee_origin) { struct in_pktinfo *info; if (ee_origin == SO_EE_ORIGIN_ICMP) return true; if (ee_origin == SO_EE_ORIGIN_LOCAL) return false; /* Support IP_PKTINFO on tstamp packets if requested, to correlate * timestamp with egress dev. Not possible for packets without iif * or without payload (SOF_TIMESTAMPING_OPT_TSONLY). */ info = PKTINFO_SKB_CB(skb); if (!(READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_CMSG) || !info->ipi_ifindex) return false; info->ipi_spec_dst.s_addr = ip_hdr(skb)->saddr; return true; } /* * Handle MSG_ERRQUEUE */ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) { struct sock_exterr_skb *serr; struct sk_buff *skb; DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name); struct { struct sock_extended_err ee; struct sockaddr_in offender; } errhdr; int err; int copied; err = -EAGAIN; skb = sock_dequeue_err_skb(sk); if (!skb) goto out; copied = skb->len; if (copied > len) { msg->msg_flags |= MSG_TRUNC; copied = len; } err = skb_copy_datagram_msg(skb, 0, msg, copied); if (unlikely(err)) { kfree_skb(skb); return err; } sock_recv_timestamp(msg, sk, skb); serr = SKB_EXT_ERR(skb); if (sin && ipv4_datagram_support_addr(serr)) { sin->sin_family = AF_INET; sin->sin_addr.s_addr = *(__be32 *)(skb_network_header(skb) + serr->addr_offset); sin->sin_port = serr->port; memset(&sin->sin_zero, 0, sizeof(sin->sin_zero)); *addr_len = sizeof(*sin); } memcpy(&errhdr.ee, &serr->ee, sizeof(struct sock_extended_err)); sin = &errhdr.offender; memset(sin, 0, sizeof(*sin)); if (ipv4_datagram_support_cmsg(sk, skb, serr->ee.ee_origin)) { sin->sin_family = AF_INET; sin->sin_addr.s_addr = ip_hdr(skb)->saddr; if (inet_cmsg_flags(inet_sk(sk))) ip_cmsg_recv(msg, skb); } put_cmsg(msg, SOL_IP, IP_RECVERR, sizeof(errhdr), &errhdr); /* Now we could try to dump offended packet options */ msg->msg_flags |= MSG_ERRQUEUE; err = copied; consume_skb(skb); out: return err; } void __ip_sock_set_tos(struct sock *sk, int val) { u8 old_tos = inet_sk(sk)->tos; if (sk->sk_type == SOCK_STREAM) { val &= ~INET_ECN_MASK; val |= old_tos & INET_ECN_MASK; } if (old_tos != val) { WRITE_ONCE(inet_sk(sk)->tos, val); WRITE_ONCE(sk->sk_priority, rt_tos2priority(val)); sk_dst_reset(sk); } } void ip_sock_set_tos(struct sock *sk, int val) { sockopt_lock_sock(sk); __ip_sock_set_tos(sk, val); sockopt_release_sock(sk); } EXPORT_SYMBOL(ip_sock_set_tos); void ip_sock_set_freebind(struct sock *sk) { inet_set_bit(FREEBIND, sk); } EXPORT_SYMBOL(ip_sock_set_freebind); void ip_sock_set_recverr(struct sock *sk) { inet_set_bit(RECVERR, sk); } EXPORT_SYMBOL(ip_sock_set_recverr); int ip_sock_set_mtu_discover(struct sock *sk, int val) { if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_OMIT) return -EINVAL; WRITE_ONCE(inet_sk(sk)->pmtudisc, val); return 0; } EXPORT_SYMBOL(ip_sock_set_mtu_discover); void ip_sock_set_pktinfo(struct sock *sk) { inet_set_bit(PKTINFO, sk); } EXPORT_SYMBOL(ip_sock_set_pktinfo); /* * Socket option code for IP. This is the end of the line after any * TCP,UDP etc options on an IP socket. */ static bool setsockopt_needs_rtnl(int optname) { switch (optname) { case IP_ADD_MEMBERSHIP: case IP_ADD_SOURCE_MEMBERSHIP: case IP_BLOCK_SOURCE: case IP_DROP_MEMBERSHIP: case IP_DROP_SOURCE_MEMBERSHIP: case IP_MSFILTER: case IP_UNBLOCK_SOURCE: case MCAST_BLOCK_SOURCE: case MCAST_MSFILTER: case MCAST_JOIN_GROUP: case MCAST_JOIN_SOURCE_GROUP: case MCAST_LEAVE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: case MCAST_UNBLOCK_SOURCE: return true; } return false; } static int set_mcast_msfilter(struct sock *sk, int ifindex, int numsrc, int fmode, struct sockaddr_storage *group, struct sockaddr_storage *list) { struct ip_msfilter *msf; struct sockaddr_in *psin; int err, i; msf = kmalloc(IP_MSFILTER_SIZE(numsrc), GFP_KERNEL); if (!msf) return -ENOBUFS; psin = (struct sockaddr_in *)group; if (psin->sin_family != AF_INET) goto Eaddrnotavail; msf->imsf_multiaddr = psin->sin_addr.s_addr; msf->imsf_interface = 0; msf->imsf_fmode = fmode; msf->imsf_numsrc = numsrc; for (i = 0; i < numsrc; ++i) { psin = (struct sockaddr_in *)&list[i]; if (psin->sin_family != AF_INET) goto Eaddrnotavail; msf->imsf_slist_flex[i] = psin->sin_addr.s_addr; } err = ip_mc_msfilter(sk, msf, ifindex); kfree(msf); return err; Eaddrnotavail: kfree(msf); return -EADDRNOTAVAIL; } static int copy_group_source_from_sockptr(struct group_source_req *greqs, sockptr_t optval, int optlen) { if (in_compat_syscall()) { struct compat_group_source_req gr32; if (optlen != sizeof(gr32)) return -EINVAL; if (copy_from_sockptr(&gr32, optval, sizeof(gr32))) return -EFAULT; greqs->gsr_interface = gr32.gsr_interface; greqs->gsr_group = gr32.gsr_group; greqs->gsr_source = gr32.gsr_source; } else { if (optlen != sizeof(*greqs)) return -EINVAL; if (copy_from_sockptr(greqs, optval, sizeof(*greqs))) return -EFAULT; } return 0; } static int do_mcast_group_source(struct sock *sk, int optname, sockptr_t optval, int optlen) { struct group_source_req greqs; struct ip_mreq_source mreqs; struct sockaddr_in *psin; int omode, add, err; err = copy_group_source_from_sockptr(&greqs, optval, optlen); if (err) return err; if (greqs.gsr_group.ss_family != AF_INET || greqs.gsr_source.ss_family != AF_INET) return -EADDRNOTAVAIL; psin = (struct sockaddr_in *)&greqs.gsr_group; mreqs.imr_multiaddr = psin->sin_addr.s_addr; psin = (struct sockaddr_in *)&greqs.gsr_source; mreqs.imr_sourceaddr = psin->sin_addr.s_addr; mreqs.imr_interface = 0; /* use index for mc_source */ if (optname == MCAST_BLOCK_SOURCE) { omode = MCAST_EXCLUDE; add = 1; } else if (optname == MCAST_UNBLOCK_SOURCE) { omode = MCAST_EXCLUDE; add = 0; } else if (optname == MCAST_JOIN_SOURCE_GROUP) { struct ip_mreqn mreq; psin = (struct sockaddr_in *)&greqs.gsr_group; mreq.imr_multiaddr = psin->sin_addr; mreq.imr_address.s_addr = 0; mreq.imr_ifindex = greqs.gsr_interface; err = ip_mc_join_group_ssm(sk, &mreq, MCAST_INCLUDE); if (err && err != -EADDRINUSE) return err; greqs.gsr_interface = mreq.imr_ifindex; omode = MCAST_INCLUDE; add = 1; } else /* MCAST_LEAVE_SOURCE_GROUP */ { omode = MCAST_INCLUDE; add = 0; } return ip_mc_source(add, omode, sk, &mreqs, greqs.gsr_interface); } static int ip_set_mcast_msfilter(struct sock *sk, sockptr_t optval, int optlen) { struct group_filter *gsf = NULL; int err; if (optlen < GROUP_FILTER_SIZE(0)) return -EINVAL; if (optlen > READ_ONCE(sock_net(sk)->core.sysctl_optmem_max)) return -ENOBUFS; gsf = memdup_sockptr(optval, optlen); if (IS_ERR(gsf)) return PTR_ERR(gsf); /* numsrc >= (4G-140)/128 overflow in 32 bits */ err = -ENOBUFS; if (gsf->gf_numsrc >= 0x1ffffff || gsf->gf_numsrc > READ_ONCE(sock_net(sk)->ipv4.sysctl_igmp_max_msf)) goto out_free_gsf; err = -EINVAL; if (GROUP_FILTER_SIZE(gsf->gf_numsrc) > optlen) goto out_free_gsf; err = set_mcast_msfilter(sk, gsf->gf_interface, gsf->gf_numsrc, gsf->gf_fmode, &gsf->gf_group, gsf->gf_slist_flex); out_free_gsf: kfree(gsf); return err; } static int compat_ip_set_mcast_msfilter(struct sock *sk, sockptr_t optval, int optlen) { const int size0 = offsetof(struct compat_group_filter, gf_slist_flex); struct compat_group_filter *gf32; unsigned int n; void *p; int err; if (optlen < size0) return -EINVAL; if (optlen > READ_ONCE(sock_net(sk)->core.sysctl_optmem_max) - 4) return -ENOBUFS; p = kmalloc(optlen + 4, GFP_KERNEL); if (!p) return -ENOMEM; gf32 = p + 4; /* we want ->gf_group and ->gf_slist_flex aligned */ err = -EFAULT; if (copy_from_sockptr(gf32, optval, optlen)) goto out_free_gsf; /* numsrc >= (4G-140)/128 overflow in 32 bits */ n = gf32->gf_numsrc; err = -ENOBUFS; if (n >= 0x1ffffff) goto out_free_gsf; err = -EINVAL; if (offsetof(struct compat_group_filter, gf_slist_flex[n]) > optlen) goto out_free_gsf; /* numsrc >= (4G-140)/128 overflow in 32 bits */ err = -ENOBUFS; if (n > READ_ONCE(sock_net(sk)->ipv4.sysctl_igmp_max_msf)) goto out_free_gsf; err = set_mcast_msfilter(sk, gf32->gf_interface, n, gf32->gf_fmode, &gf32->gf_group, gf32->gf_slist_flex); out_free_gsf: kfree(p); return err; } static int ip_mcast_join_leave(struct sock *sk, int optname, sockptr_t optval, int optlen) { struct ip_mreqn mreq = { }; struct sockaddr_in *psin; struct group_req greq; if (optlen < sizeof(struct group_req)) return -EINVAL; if (copy_from_sockptr(&greq, optval, sizeof(greq))) return -EFAULT; psin = (struct sockaddr_in *)&greq.gr_group; if (psin->sin_family != AF_INET) return -EINVAL; mreq.imr_multiaddr = psin->sin_addr; mreq.imr_ifindex = greq.gr_interface; if (optname == MCAST_JOIN_GROUP) return ip_mc_join_group(sk, &mreq); return ip_mc_leave_group(sk, &mreq); } static int compat_ip_mcast_join_leave(struct sock *sk, int optname, sockptr_t optval, int optlen) { struct compat_group_req greq; struct ip_mreqn mreq = { }; struct sockaddr_in *psin; if (optlen < sizeof(struct compat_group_req)) return -EINVAL; if (copy_from_sockptr(&greq, optval, sizeof(greq))) return -EFAULT; psin = (struct sockaddr_in *)&greq.gr_group; if (psin->sin_family != AF_INET) return -EINVAL; mreq.imr_multiaddr = psin->sin_addr; mreq.imr_ifindex = greq.gr_interface; if (optname == MCAST_JOIN_GROUP) return ip_mc_join_group(sk, &mreq); return ip_mc_leave_group(sk, &mreq); } DEFINE_STATIC_KEY_FALSE(ip4_min_ttl); int do_ip_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { struct inet_sock *inet = inet_sk(sk); struct net *net = sock_net(sk); int val = 0, err, retv; bool needs_rtnl = setsockopt_needs_rtnl(optname); switch (optname) { case IP_PKTINFO: case IP_RECVTTL: case IP_RECVOPTS: case IP_RECVTOS: case IP_RETOPTS: case IP_TOS: case IP_TTL: case IP_HDRINCL: case IP_MTU_DISCOVER: case IP_RECVERR: case IP_ROUTER_ALERT: case IP_FREEBIND: case IP_PASSSEC: case IP_TRANSPARENT: case IP_MINTTL: case IP_NODEFRAG: case IP_BIND_ADDRESS_NO_PORT: case IP_UNICAST_IF: case IP_MULTICAST_TTL: case IP_MULTICAST_ALL: case IP_MULTICAST_LOOP: case IP_RECVORIGDSTADDR: case IP_CHECKSUM: case IP_RECVFRAGSIZE: case IP_RECVERR_RFC4884: case IP_LOCAL_PORT_RANGE: if (optlen >= sizeof(int)) { if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; } else if (optlen >= sizeof(char)) { unsigned char ucval; if (copy_from_sockptr(&ucval, optval, sizeof(ucval))) return -EFAULT; val = (int) ucval; } } /* If optlen==0, it is equivalent to val == 0 */ if (optname == IP_ROUTER_ALERT) { retv = ip_ra_control(sk, val ? 1 : 0, NULL); if (retv == 0) inet_assign_bit(RTALERT, sk, val); return retv; } if (ip_mroute_opt(optname)) return ip_mroute_setsockopt(sk, optname, optval, optlen); /* Handle options that can be set without locking the socket. */ switch (optname) { case IP_PKTINFO: inet_assign_bit(PKTINFO, sk, val); return 0; case IP_RECVTTL: inet_assign_bit(TTL, sk, val); return 0; case IP_RECVTOS: inet_assign_bit(TOS, sk, val); return 0; case IP_RECVOPTS: inet_assign_bit(RECVOPTS, sk, val); return 0; case IP_RETOPTS: inet_assign_bit(RETOPTS, sk, val); return 0; case IP_PASSSEC: inet_assign_bit(PASSSEC, sk, val); return 0; case IP_RECVORIGDSTADDR: inet_assign_bit(ORIGDSTADDR, sk, val); return 0; case IP_RECVFRAGSIZE: if (sk->sk_type != SOCK_RAW && sk->sk_type != SOCK_DGRAM) return -EINVAL; inet_assign_bit(RECVFRAGSIZE, sk, val); return 0; case IP_RECVERR: inet_assign_bit(RECVERR, sk, val); if (!val) skb_errqueue_purge(&sk->sk_error_queue); return 0; case IP_RECVERR_RFC4884: if (val < 0 || val > 1) return -EINVAL; inet_assign_bit(RECVERR_RFC4884, sk, val); return 0; case IP_FREEBIND: if (optlen < 1) return -EINVAL; inet_assign_bit(FREEBIND, sk, val); return 0; case IP_HDRINCL: if (sk->sk_type != SOCK_RAW) return -ENOPROTOOPT; inet_assign_bit(HDRINCL, sk, val); return 0; case IP_MULTICAST_LOOP: if (optlen < 1) return -EINVAL; inet_assign_bit(MC_LOOP, sk, val); return 0; case IP_MULTICAST_ALL: if (optlen < 1) return -EINVAL; if (val != 0 && val != 1) return -EINVAL; inet_assign_bit(MC_ALL, sk, val); return 0; case IP_TRANSPARENT: if (!!val && !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) return -EPERM; if (optlen < 1) return -EINVAL; inet_assign_bit(TRANSPARENT, sk, val); return 0; case IP_NODEFRAG: if (sk->sk_type != SOCK_RAW) return -ENOPROTOOPT; inet_assign_bit(NODEFRAG, sk, val); return 0; case IP_BIND_ADDRESS_NO_PORT: inet_assign_bit(BIND_ADDRESS_NO_PORT, sk, val); return 0; case IP_TTL: if (optlen < 1) return -EINVAL; if (val != -1 && (val < 1 || val > 255)) return -EINVAL; WRITE_ONCE(inet->uc_ttl, val); return 0; case IP_MINTTL: if (optlen < 1) return -EINVAL; if (val < 0 || val > 255) return -EINVAL; if (val) static_branch_enable(&ip4_min_ttl); WRITE_ONCE(inet->min_ttl, val); return 0; case IP_MULTICAST_TTL: if (sk->sk_type == SOCK_STREAM) return -EINVAL; if (optlen < 1) return -EINVAL; if (val == -1) val = 1; if (val < 0 || val > 255) return -EINVAL; WRITE_ONCE(inet->mc_ttl, val); return 0; case IP_MTU_DISCOVER: return ip_sock_set_mtu_discover(sk, val); case IP_TOS: /* This sets both TOS and Precedence */ ip_sock_set_tos(sk, val); return 0; case IP_LOCAL_PORT_RANGE: { u16 lo = val; u16 hi = val >> 16; if (optlen != sizeof(u32)) return -EINVAL; if (lo != 0 && hi != 0 && lo > hi) return -EINVAL; WRITE_ONCE(inet->local_port_range, val); return 0; } } err = 0; if (needs_rtnl) rtnl_lock(); sockopt_lock_sock(sk); switch (optname) { case IP_OPTIONS: { struct ip_options_rcu *old, *opt = NULL; if (optlen > 40) goto e_inval; err = ip_options_get(sock_net(sk), &opt, optval, optlen); if (err) break; old = rcu_dereference_protected(inet->inet_opt, lockdep_sock_is_held(sk)); if (inet_test_bit(IS_ICSK, sk)) { struct inet_connection_sock *icsk = inet_csk(sk); #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == PF_INET || (!((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) && inet->inet_daddr != LOOPBACK4_IPV6)) { #endif if (old) icsk->icsk_ext_hdr_len -= old->opt.optlen; if (opt) icsk->icsk_ext_hdr_len += opt->opt.optlen; icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie); #if IS_ENABLED(CONFIG_IPV6) } #endif } rcu_assign_pointer(inet->inet_opt, opt); if (old) kfree_rcu(old, rcu); break; } case IP_CHECKSUM: if (val) { if (!(inet_test_bit(CHECKSUM, sk))) { inet_inc_convert_csum(sk); inet_set_bit(CHECKSUM, sk); } } else { if (inet_test_bit(CHECKSUM, sk)) { inet_dec_convert_csum(sk); inet_clear_bit(CHECKSUM, sk); } } break; case IP_UNICAST_IF: { struct net_device *dev = NULL; int ifindex; int midx; if (optlen != sizeof(int)) goto e_inval; ifindex = (__force int)ntohl((__force __be32)val); if (ifindex == 0) { WRITE_ONCE(inet->uc_index, 0); err = 0; break; } dev = dev_get_by_index(sock_net(sk), ifindex); err = -EADDRNOTAVAIL; if (!dev) break; midx = l3mdev_master_ifindex(dev); dev_put(dev); err = -EINVAL; if (sk->sk_bound_dev_if && midx != sk->sk_bound_dev_if) break; WRITE_ONCE(inet->uc_index, ifindex); err = 0; break; } case IP_MULTICAST_IF: { struct ip_mreqn mreq; struct net_device *dev = NULL; int midx; if (sk->sk_type == SOCK_STREAM) goto e_inval; /* * Check the arguments are allowable */ if (optlen < sizeof(struct in_addr)) goto e_inval; err = -EFAULT; if (optlen >= sizeof(struct ip_mreqn)) { if (copy_from_sockptr(&mreq, optval, sizeof(mreq))) break; } else { memset(&mreq, 0, sizeof(mreq)); if (optlen >= sizeof(struct ip_mreq)) { if (copy_from_sockptr(&mreq, optval, sizeof(struct ip_mreq))) break; } else if (optlen >= sizeof(struct in_addr)) { if (copy_from_sockptr(&mreq.imr_address, optval, sizeof(struct in_addr))) break; } } if (!mreq.imr_ifindex) { if (mreq.imr_address.s_addr == htonl(INADDR_ANY)) { WRITE_ONCE(inet->mc_index, 0); WRITE_ONCE(inet->mc_addr, 0); err = 0; break; } dev = ip_dev_find(sock_net(sk), mreq.imr_address.s_addr); if (dev) mreq.imr_ifindex = dev->ifindex; } else dev = dev_get_by_index(sock_net(sk), mreq.imr_ifindex); err = -EADDRNOTAVAIL; if (!dev) break; midx = l3mdev_master_ifindex(dev); dev_put(dev); err = -EINVAL; if (sk->sk_bound_dev_if && mreq.imr_ifindex != sk->sk_bound_dev_if && midx != sk->sk_bound_dev_if) break; WRITE_ONCE(inet->mc_index, mreq.imr_ifindex); WRITE_ONCE(inet->mc_addr, mreq.imr_address.s_addr); err = 0; break; } case IP_ADD_MEMBERSHIP: case IP_DROP_MEMBERSHIP: { struct ip_mreqn mreq; err = -EPROTO; if (inet_test_bit(IS_ICSK, sk)) break; if (optlen < sizeof(struct ip_mreq)) goto e_inval; err = -EFAULT; if (optlen >= sizeof(struct ip_mreqn)) { if (copy_from_sockptr(&mreq, optval, sizeof(mreq))) break; } else { memset(&mreq, 0, sizeof(mreq)); if (copy_from_sockptr(&mreq, optval, sizeof(struct ip_mreq))) break; } if (optname == IP_ADD_MEMBERSHIP) err = ip_mc_join_group(sk, &mreq); else err = ip_mc_leave_group(sk, &mreq); break; } case IP_MSFILTER: { struct ip_msfilter *msf; if (optlen < IP_MSFILTER_SIZE(0)) goto e_inval; if (optlen > READ_ONCE(net->core.sysctl_optmem_max)) { err = -ENOBUFS; break; } msf = memdup_sockptr(optval, optlen); if (IS_ERR(msf)) { err = PTR_ERR(msf); break; } /* numsrc >= (1G-4) overflow in 32 bits */ if (msf->imsf_numsrc >= 0x3ffffffcU || msf->imsf_numsrc > READ_ONCE(net->ipv4.sysctl_igmp_max_msf)) { kfree(msf); err = -ENOBUFS; break; } if (IP_MSFILTER_SIZE(msf->imsf_numsrc) > optlen) { kfree(msf); err = -EINVAL; break; } err = ip_mc_msfilter(sk, msf, 0); kfree(msf); break; } case IP_BLOCK_SOURCE: case IP_UNBLOCK_SOURCE: case IP_ADD_SOURCE_MEMBERSHIP: case IP_DROP_SOURCE_MEMBERSHIP: { struct ip_mreq_source mreqs; int omode, add; if (optlen != sizeof(struct ip_mreq_source)) goto e_inval; if (copy_from_sockptr(&mreqs, optval, sizeof(mreqs))) { err = -EFAULT; break; } if (optname == IP_BLOCK_SOURCE) { omode = MCAST_EXCLUDE; add = 1; } else if (optname == IP_UNBLOCK_SOURCE) { omode = MCAST_EXCLUDE; add = 0; } else if (optname == IP_ADD_SOURCE_MEMBERSHIP) { struct ip_mreqn mreq; mreq.imr_multiaddr.s_addr = mreqs.imr_multiaddr; mreq.imr_address.s_addr = mreqs.imr_interface; mreq.imr_ifindex = 0; err = ip_mc_join_group_ssm(sk, &mreq, MCAST_INCLUDE); if (err && err != -EADDRINUSE) break; omode = MCAST_INCLUDE; add = 1; } else /* IP_DROP_SOURCE_MEMBERSHIP */ { omode = MCAST_INCLUDE; add = 0; } err = ip_mc_source(add, omode, sk, &mreqs, 0); break; } case MCAST_JOIN_GROUP: case MCAST_LEAVE_GROUP: if (in_compat_syscall()) err = compat_ip_mcast_join_leave(sk, optname, optval, optlen); else err = ip_mcast_join_leave(sk, optname, optval, optlen); break; case MCAST_JOIN_SOURCE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: err = do_mcast_group_source(sk, optname, optval, optlen); break; case MCAST_MSFILTER: if (in_compat_syscall()) err = compat_ip_set_mcast_msfilter(sk, optval, optlen); else err = ip_set_mcast_msfilter(sk, optval, optlen); break; case IP_IPSEC_POLICY: case IP_XFRM_POLICY: err = -EPERM; if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) break; err = xfrm_user_policy(sk, optname, optval, optlen); break; default: err = -ENOPROTOOPT; break; } sockopt_release_sock(sk); if (needs_rtnl) rtnl_unlock(); return err; e_inval: sockopt_release_sock(sk); if (needs_rtnl) rtnl_unlock(); return -EINVAL; } /** * ipv4_pktinfo_prepare - transfer some info from rtable to skb * @sk: socket * @skb: buffer * @drop_dst: if true, drops skb dst * * To support IP_CMSG_PKTINFO option, we store rt_iif and specific * destination in skb->cb[] before dst drop. * This way, receiver doesn't make cache line misses to read rtable. */ void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb, bool drop_dst) { struct in_pktinfo *pktinfo = PKTINFO_SKB_CB(skb); bool prepare = inet_test_bit(PKTINFO, sk) || ipv6_sk_rxinfo(sk); if (prepare && skb_rtable(skb)) { /* skb->cb is overloaded: prior to this point it is IP{6}CB * which has interface index (iif) as the first member of the * underlying inet{6}_skb_parm struct. This code then overlays * PKTINFO_SKB_CB and in_pktinfo also has iif as the first * element so the iif is picked up from the prior IPCB. If iif * is the loopback interface, then return the sending interface * (e.g., process binds socket to eth0 for Tx which is * redirected to loopback in the rtable/dst). */ struct rtable *rt = skb_rtable(skb); bool l3slave = ipv4_l3mdev_skb(IPCB(skb)->flags); if (pktinfo->ipi_ifindex == LOOPBACK_IFINDEX) pktinfo->ipi_ifindex = inet_iif(skb); else if (l3slave && rt && rt->rt_iif) pktinfo->ipi_ifindex = rt->rt_iif; pktinfo->ipi_spec_dst.s_addr = fib_compute_spec_dst(skb); } else { pktinfo->ipi_ifindex = 0; pktinfo->ipi_spec_dst.s_addr = 0; } if (drop_dst) skb_dst_drop(skb); } int ip_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { int err; if (level != SOL_IP) return -ENOPROTOOPT; err = do_ip_setsockopt(sk, level, optname, optval, optlen); #ifdef CONFIG_NETFILTER /* we need to exclude all possible ENOPROTOOPTs except default case */ if (err == -ENOPROTOOPT && optname != IP_HDRINCL && optname != IP_IPSEC_POLICY && optname != IP_XFRM_POLICY && !ip_mroute_opt(optname)) err = nf_setsockopt(sk, PF_INET, optname, optval, optlen); #endif return err; } EXPORT_SYMBOL(ip_setsockopt); /* * Get the options. Note for future reference. The GET of IP options gets * the _received_ ones. The set sets the _sent_ ones. */ static bool getsockopt_needs_rtnl(int optname) { switch (optname) { case IP_MSFILTER: case MCAST_MSFILTER: return true; } return false; } static int ip_get_mcast_msfilter(struct sock *sk, sockptr_t optval, sockptr_t optlen, int len) { const int size0 = offsetof(struct group_filter, gf_slist_flex); struct group_filter gsf; int num, gsf_size; int err; if (len < size0) return -EINVAL; if (copy_from_sockptr(&gsf, optval, size0)) return -EFAULT; num = gsf.gf_numsrc; err = ip_mc_gsfget(sk, &gsf, optval, offsetof(struct group_filter, gf_slist_flex)); if (err) return err; if (gsf.gf_numsrc < num) num = gsf.gf_numsrc; gsf_size = GROUP_FILTER_SIZE(num); if (copy_to_sockptr(optlen, &gsf_size, sizeof(int)) || copy_to_sockptr(optval, &gsf, size0)) return -EFAULT; return 0; } static int compat_ip_get_mcast_msfilter(struct sock *sk, sockptr_t optval, sockptr_t optlen, int len) { const int size0 = offsetof(struct compat_group_filter, gf_slist_flex); struct compat_group_filter gf32; struct group_filter gf; int num; int err; if (len < size0) return -EINVAL; if (copy_from_sockptr(&gf32, optval, size0)) return -EFAULT; gf.gf_interface = gf32.gf_interface; gf.gf_fmode = gf32.gf_fmode; num = gf.gf_numsrc = gf32.gf_numsrc; gf.gf_group = gf32.gf_group; err = ip_mc_gsfget(sk, &gf, optval, offsetof(struct compat_group_filter, gf_slist_flex)); if (err) return err; if (gf.gf_numsrc < num) num = gf.gf_numsrc; len = GROUP_FILTER_SIZE(num) - (sizeof(gf) - sizeof(gf32)); if (copy_to_sockptr(optlen, &len, sizeof(int)) || copy_to_sockptr_offset(optval, offsetof(struct compat_group_filter, gf_fmode), &gf.gf_fmode, sizeof(gf.gf_fmode)) || copy_to_sockptr_offset(optval, offsetof(struct compat_group_filter, gf_numsrc), &gf.gf_numsrc, sizeof(gf.gf_numsrc))) return -EFAULT; return 0; } int do_ip_getsockopt(struct sock *sk, int level, int optname, sockptr_t optval, sockptr_t optlen) { struct inet_sock *inet = inet_sk(sk); bool needs_rtnl = getsockopt_needs_rtnl(optname); int val, err = 0; int len; if (level != SOL_IP) return -EOPNOTSUPP; if (ip_mroute_opt(optname)) return ip_mroute_getsockopt(sk, optname, optval, optlen); if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; if (len < 0) return -EINVAL; /* Handle options that can be read without locking the socket. */ switch (optname) { case IP_PKTINFO: val = inet_test_bit(PKTINFO, sk); goto copyval; case IP_RECVTTL: val = inet_test_bit(TTL, sk); goto copyval; case IP_RECVTOS: val = inet_test_bit(TOS, sk); goto copyval; case IP_RECVOPTS: val = inet_test_bit(RECVOPTS, sk); goto copyval; case IP_RETOPTS: val = inet_test_bit(RETOPTS, sk); goto copyval; case IP_PASSSEC: val = inet_test_bit(PASSSEC, sk); goto copyval; case IP_RECVORIGDSTADDR: val = inet_test_bit(ORIGDSTADDR, sk); goto copyval; case IP_CHECKSUM: val = inet_test_bit(CHECKSUM, sk); goto copyval; case IP_RECVFRAGSIZE: val = inet_test_bit(RECVFRAGSIZE, sk); goto copyval; case IP_RECVERR: val = inet_test_bit(RECVERR, sk); goto copyval; case IP_RECVERR_RFC4884: val = inet_test_bit(RECVERR_RFC4884, sk); goto copyval; case IP_FREEBIND: val = inet_test_bit(FREEBIND, sk); goto copyval; case IP_HDRINCL: val = inet_test_bit(HDRINCL, sk); goto copyval; case IP_MULTICAST_LOOP: val = inet_test_bit(MC_LOOP, sk); goto copyval; case IP_MULTICAST_ALL: val = inet_test_bit(MC_ALL, sk); goto copyval; case IP_TRANSPARENT: val = inet_test_bit(TRANSPARENT, sk); goto copyval; case IP_NODEFRAG: val = inet_test_bit(NODEFRAG, sk); goto copyval; case IP_BIND_ADDRESS_NO_PORT: val = inet_test_bit(BIND_ADDRESS_NO_PORT, sk); goto copyval; case IP_ROUTER_ALERT: val = inet_test_bit(RTALERT, sk); goto copyval; case IP_TTL: val = READ_ONCE(inet->uc_ttl); if (val < 0) val = READ_ONCE(sock_net(sk)->ipv4.sysctl_ip_default_ttl); goto copyval; case IP_MINTTL: val = READ_ONCE(inet->min_ttl); goto copyval; case IP_MULTICAST_TTL: val = READ_ONCE(inet->mc_ttl); goto copyval; case IP_MTU_DISCOVER: val = READ_ONCE(inet->pmtudisc); goto copyval; case IP_TOS: val = READ_ONCE(inet->tos); goto copyval; case IP_OPTIONS: { unsigned char optbuf[sizeof(struct ip_options)+40]; struct ip_options *opt = (struct ip_options *)optbuf; struct ip_options_rcu *inet_opt; rcu_read_lock(); inet_opt = rcu_dereference(inet->inet_opt); opt->optlen = 0; if (inet_opt) memcpy(optbuf, &inet_opt->opt, sizeof(struct ip_options) + inet_opt->opt.optlen); rcu_read_unlock(); if (opt->optlen == 0) { len = 0; return copy_to_sockptr(optlen, &len, sizeof(int)); } ip_options_undo(opt); len = min_t(unsigned int, len, opt->optlen); if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; if (copy_to_sockptr(optval, opt->__data, len)) return -EFAULT; return 0; } case IP_MTU: { struct dst_entry *dst; val = 0; dst = sk_dst_get(sk); if (dst) { val = dst_mtu(dst); dst_release(dst); } if (!val) return -ENOTCONN; goto copyval; } case IP_PKTOPTIONS: { struct msghdr msg; if (sk->sk_type != SOCK_STREAM) return -ENOPROTOOPT; if (optval.is_kernel) { msg.msg_control_is_user = false; msg.msg_control = optval.kernel; } else { msg.msg_control_is_user = true; msg.msg_control_user = optval.user; } msg.msg_controllen = len; msg.msg_flags = in_compat_syscall() ? MSG_CMSG_COMPAT : 0; if (inet_test_bit(PKTINFO, sk)) { struct in_pktinfo info; info.ipi_addr.s_addr = READ_ONCE(inet->inet_rcv_saddr); info.ipi_spec_dst.s_addr = READ_ONCE(inet->inet_rcv_saddr); info.ipi_ifindex = READ_ONCE(inet->mc_index); put_cmsg(&msg, SOL_IP, IP_PKTINFO, sizeof(info), &info); } if (inet_test_bit(TTL, sk)) { int hlim = READ_ONCE(inet->mc_ttl); put_cmsg(&msg, SOL_IP, IP_TTL, sizeof(hlim), &hlim); } if (inet_test_bit(TOS, sk)) { int tos = READ_ONCE(inet->rcv_tos); put_cmsg(&msg, SOL_IP, IP_TOS, sizeof(tos), &tos); } len -= msg.msg_controllen; return copy_to_sockptr(optlen, &len, sizeof(int)); } case IP_UNICAST_IF: val = (__force int)htonl((__u32) READ_ONCE(inet->uc_index)); goto copyval; case IP_MULTICAST_IF: { struct in_addr addr; len = min_t(unsigned int, len, sizeof(struct in_addr)); addr.s_addr = READ_ONCE(inet->mc_addr); if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; if (copy_to_sockptr(optval, &addr, len)) return -EFAULT; return 0; } case IP_LOCAL_PORT_RANGE: val = READ_ONCE(inet->local_port_range); goto copyval; } if (needs_rtnl) rtnl_lock(); sockopt_lock_sock(sk); switch (optname) { case IP_MSFILTER: { struct ip_msfilter msf; if (len < IP_MSFILTER_SIZE(0)) { err = -EINVAL; goto out; } if (copy_from_sockptr(&msf, optval, IP_MSFILTER_SIZE(0))) { err = -EFAULT; goto out; } err = ip_mc_msfget(sk, &msf, optval, optlen); goto out; } case MCAST_MSFILTER: if (in_compat_syscall()) err = compat_ip_get_mcast_msfilter(sk, optval, optlen, len); else err = ip_get_mcast_msfilter(sk, optval, optlen, len); goto out; case IP_PROTOCOL: val = inet_sk(sk)->inet_num; break; default: sockopt_release_sock(sk); return -ENOPROTOOPT; } sockopt_release_sock(sk); copyval: if (len < sizeof(int) && len > 0 && val >= 0 && val <= 255) { unsigned char ucval = (unsigned char)val; len = 1; if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; if (copy_to_sockptr(optval, &ucval, 1)) return -EFAULT; } else { len = min_t(unsigned int, sizeof(int), len); if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; if (copy_to_sockptr(optval, &val, len)) return -EFAULT; } return 0; out: sockopt_release_sock(sk); if (needs_rtnl) rtnl_unlock(); return err; } int ip_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { int err; err = do_ip_getsockopt(sk, level, optname, USER_SOCKPTR(optval), USER_SOCKPTR(optlen)); #ifdef CONFIG_NETFILTER /* we need to exclude all possible ENOPROTOOPTs except default case */ if (err == -ENOPROTOOPT && optname != IP_PKTOPTIONS && !ip_mroute_opt(optname)) { int len; if (get_user(len, optlen)) return -EFAULT; err = nf_getsockopt(sk, PF_INET, optname, optval, &len); if (err >= 0) err = put_user(len, optlen); return err; } #endif return err; } EXPORT_SYMBOL(ip_getsockopt);
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_CONTEXT_TRACKING_STATE_H #define _LINUX_CONTEXT_TRACKING_STATE_H #include <linux/percpu.h> #include <linux/static_key.h> #include <linux/context_tracking_irq.h> /* Offset to allow distinguishing irq vs. task-based idle entry/exit. */ #define CT_NESTING_IRQ_NONIDLE ((LONG_MAX / 2) + 1) enum ctx_state { CT_STATE_DISABLED = -1, /* returned by ct_state() if unknown */ CT_STATE_KERNEL = 0, CT_STATE_IDLE = 1, CT_STATE_USER = 2, CT_STATE_GUEST = 3, CT_STATE_MAX = 4, }; struct context_tracking { #ifdef CONFIG_CONTEXT_TRACKING_USER /* * When active is false, probes are unset in order * to minimize overhead: TIF flags are cleared * and calls to user_enter/exit are ignored. This * may be further optimized using static keys. */ bool active; int recursion; #endif #ifdef CONFIG_CONTEXT_TRACKING atomic_t state; #endif #ifdef CONFIG_CONTEXT_TRACKING_IDLE long nesting; /* Track process nesting level. */ long nmi_nesting; /* Track irq/NMI nesting level. */ #endif }; /* * We cram two different things within the same atomic variable: * * CT_RCU_WATCHING_START CT_STATE_START * | | * v v * MSB [ RCU watching counter ][ context_state ] LSB * ^ ^ * | | * CT_RCU_WATCHING_END CT_STATE_END * * Bits are used from the LSB upwards, so unused bits (if any) will always be in * upper bits of the variable. */ #ifdef CONFIG_CONTEXT_TRACKING #define CT_SIZE (sizeof(((struct context_tracking *)0)->state) * BITS_PER_BYTE) #define CT_STATE_WIDTH bits_per(CT_STATE_MAX - 1) #define CT_STATE_START 0 #define CT_STATE_END (CT_STATE_START + CT_STATE_WIDTH - 1) #define CT_RCU_WATCHING_MAX_WIDTH (CT_SIZE - CT_STATE_WIDTH) #define CT_RCU_WATCHING_WIDTH (IS_ENABLED(CONFIG_RCU_DYNTICKS_TORTURE) ? 2 : CT_RCU_WATCHING_MAX_WIDTH) #define CT_RCU_WATCHING_START (CT_STATE_END + 1) #define CT_RCU_WATCHING_END (CT_RCU_WATCHING_START + CT_RCU_WATCHING_WIDTH - 1) #define CT_RCU_WATCHING BIT(CT_RCU_WATCHING_START) #define CT_STATE_MASK GENMASK(CT_STATE_END, CT_STATE_START) #define CT_RCU_WATCHING_MASK GENMASK(CT_RCU_WATCHING_END, CT_RCU_WATCHING_START) #define CT_UNUSED_WIDTH (CT_RCU_WATCHING_MAX_WIDTH - CT_RCU_WATCHING_WIDTH) static_assert(CT_STATE_WIDTH + CT_RCU_WATCHING_WIDTH + CT_UNUSED_WIDTH == CT_SIZE); DECLARE_PER_CPU(struct context_tracking, context_tracking); #endif /* CONFIG_CONTEXT_TRACKING */ #ifdef CONFIG_CONTEXT_TRACKING_USER static __always_inline int __ct_state(void) { return raw_atomic_read(this_cpu_ptr(&context_tracking.state)) & CT_STATE_MASK; } #endif #ifdef CONFIG_CONTEXT_TRACKING_IDLE static __always_inline int ct_rcu_watching(void) { return atomic_read(this_cpu_ptr(&context_tracking.state)) & CT_RCU_WATCHING_MASK; } static __always_inline int ct_rcu_watching_cpu(int cpu) { struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu); return atomic_read(&ct->state) & CT_RCU_WATCHING_MASK; } static __always_inline int ct_rcu_watching_cpu_acquire(int cpu) { struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu); return atomic_read_acquire(&ct->state) & CT_RCU_WATCHING_MASK; } static __always_inline long ct_nesting(void) { return __this_cpu_read(context_tracking.nesting); } static __always_inline long ct_nesting_cpu(int cpu) { struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu); return ct->nesting; } static __always_inline long ct_nmi_nesting(void) { return __this_cpu_read(context_tracking.nmi_nesting); } static __always_inline long ct_nmi_nesting_cpu(int cpu) { struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu); return ct->nmi_nesting; } #endif /* #ifdef CONFIG_CONTEXT_TRACKING_IDLE */ #ifdef CONFIG_CONTEXT_TRACKING_USER extern struct static_key_false context_tracking_key; static __always_inline bool context_tracking_enabled(void) { return static_branch_unlikely(&context_tracking_key); } static __always_inline bool context_tracking_enabled_cpu(int cpu) { return context_tracking_enabled() && per_cpu(context_tracking.active, cpu); } static __always_inline bool context_tracking_enabled_this_cpu(void) { return context_tracking_enabled() && __this_cpu_read(context_tracking.active); } /** * ct_state() - return the current context tracking state if known * * Returns the current cpu's context tracking state if context tracking * is enabled. If context tracking is disabled, returns * CT_STATE_DISABLED. This should be used primarily for debugging. */ static __always_inline int ct_state(void) { int ret; if (!context_tracking_enabled()) return CT_STATE_DISABLED; preempt_disable(); ret = __ct_state(); preempt_enable(); return ret; } #else static __always_inline bool context_tracking_enabled(void) { return false; } static __always_inline bool context_tracking_enabled_cpu(int cpu) { return false; } static __always_inline bool context_tracking_enabled_this_cpu(void) { return false; } #endif /* CONFIG_CONTEXT_TRACKING_USER */ #endif
1629 1628 338 338 537 29 537 1816 1816 1432 1429 1433 1432 1423 1432 1431 1431 27 27 27 1431 2 806 807 807 806 1106 537 2 2 537 537 537 254 254 253 1104 298 537 1105 258 258 254 254 254 254 254 2 2 1332 534 535 534 312 268 1330 533 535 535 312 312 312 312 54 235 236 236 236 235 236 236 236 900 900 898 900 899 900 1 10 4 6 6 6 900 1236 764 904 493 784 783 18 785 37 782 782 784 693 1115 1123 1440 296 6 4 4 4 6 4 4 6 789 789 788 773 773 686 789 585 583 577 2 2 2 2 2 2 577 577 576 308 271 272 273 530 577 585 529 585 585 528 585 585 585 585 627 627 626 583 585 188 585 585 188 585 134 134 134 134 134 132 132 125 125 3 3 3 3 3 127 134 134 133 135 203 203 203 134 135 135 298 6647 6909 2912 880 604 298 298 2644 5036 4948 4353 2464 2465 2468 282 280 282 5036 70 70 70 2588 12 2730 341 341 341 341 341 339 340 297 298 35 341 341 340 156 341 341 341 341 37 2591 2592 2594 2228 2594 2565 2569 442 442 442 442 2594 2593 442 4835 4842 4842 4820 4755 246 871 2168 2117 2035 636 636 636 599 434 434 405 602 104 76 76 2006 2121 2117 434 975 219 219 997 998 238 999 999 998 996 884 2 2 997 86 6 80 80 80 80 976 976 976 709 976 974 976 976 976 672 820 975 874 975 975 709 709 709 10 10 10 5 4 4 10 3 7 704 652 703 287 287 287 704 652 702 704 701 703 704 573 973 973 973 12 973 973 943 999 999 234 1139 1004 1005 992 278 278 131 278 20 278 278 278 156 115 43 42 43 43 43 43 1 182 182 182 138 92 13 7 7 7 7 20 19 20 175 182 127 155 175 165 174 177 182 61 179 19 12 1 1 175 132 5 120 104 173 6 57 177 176 177 178 168 177 168 178 131 178 171 178 178 178 19 171 178 11 178 277 277 278 278 278 234 93 277 159 338 338 12 900 900 117 116 3 3 921 920 131 900 920 202 920 920 921 920 920 745 921 893 918 897 838 840 837 840 839 824 27 1 26 839 832 831 830 377 829 817 830 825 837 838 839 900 1291 1291 900 825 322 322 131 200 12 12 12 11 12 164 164 163 164 164 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 /* * mm/rmap.c - physical to virtual reverse mappings * * Copyright 2001, Rik van Riel <riel@conectiva.com.br> * Released under the General Public License (GPL). * * Simple, low overhead reverse mapping scheme. * Please try to keep this thing as modular as possible. * * Provides methods for unmapping each kind of mapped page: * the anon methods track anonymous pages, and * the file methods track pages belonging to an inode. * * Original design by Rik van Riel <riel@conectiva.com.br> 2001 * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004 * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004 * Contributions by Hugh Dickins 2003, 2004 */ /* * Lock ordering in mm: * * inode->i_rwsem (while writing or truncating, not reading or faulting) * mm->mmap_lock * mapping->invalidate_lock (in filemap_fault) * folio_lock * hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share, see hugetlbfs below) * vma_start_write * mapping->i_mmap_rwsem * anon_vma->rwsem * mm->page_table_lock or pte_lock * swap_lock (in swap_duplicate, swap_info_get) * mmlist_lock (in mmput, drain_mmlist and others) * mapping->private_lock (in block_dirty_folio) * i_pages lock (widely used) * lruvec->lru_lock (in folio_lruvec_lock_irq) * inode->i_lock (in set_page_dirty's __mark_inode_dirty) * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty) * sb_lock (within inode_lock in fs/fs-writeback.c) * i_pages lock (widely used, in set_page_dirty, * in arch-dependent flush_dcache_mmap_lock, * within bdi.wb->list_lock in __sync_single_inode) * * anon_vma->rwsem,mapping->i_mmap_rwsem (memory_failure, collect_procs_anon) * ->tasklist_lock * pte map lock * * hugetlbfs PageHuge() take locks in this order: * hugetlb_fault_mutex (hugetlbfs specific page fault mutex) * vma_lock (hugetlb specific lock for pmd_sharing) * mapping->i_mmap_rwsem (also used for hugetlb pmd sharing) * folio_lock */ #include <linux/mm.h> #include <linux/sched/mm.h> #include <linux/sched/task.h> #include <linux/pagemap.h> #include <linux/swap.h> #include <linux/leafops.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/ksm.h> #include <linux/rmap.h> #include <linux/rcupdate.h> #include <linux/export.h> #include <linux/memcontrol.h> #include <linux/mmu_notifier.h> #include <linux/migrate.h> #include <linux/hugetlb.h> #include <linux/huge_mm.h> #include <linux/backing-dev.h> #include <linux/page_idle.h> #include <linux/memremap.h> #include <linux/userfaultfd_k.h> #include <linux/mm_inline.h> #include <linux/oom.h> #include <asm/tlbflush.h> #define CREATE_TRACE_POINTS #include <trace/events/migrate.h> #include "internal.h" static struct kmem_cache *anon_vma_cachep; static struct kmem_cache *anon_vma_chain_cachep; static inline struct anon_vma *anon_vma_alloc(void) { struct anon_vma *anon_vma; anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL); if (anon_vma) { atomic_set(&anon_vma->refcount, 1); anon_vma->num_children = 0; anon_vma->num_active_vmas = 0; anon_vma->parent = anon_vma; /* * Initialise the anon_vma root to point to itself. If called * from fork, the root will be reset to the parents anon_vma. */ anon_vma->root = anon_vma; } return anon_vma; } static inline void anon_vma_free(struct anon_vma *anon_vma) { VM_BUG_ON(atomic_read(&anon_vma->refcount)); /* * Synchronize against folio_lock_anon_vma_read() such that * we can safely hold the lock without the anon_vma getting * freed. * * Relies on the full mb implied by the atomic_dec_and_test() from * put_anon_vma() against the acquire barrier implied by * down_read_trylock() from folio_lock_anon_vma_read(). This orders: * * folio_lock_anon_vma_read() VS put_anon_vma() * down_read_trylock() atomic_dec_and_test() * LOCK MB * atomic_read() rwsem_is_locked() * * LOCK should suffice since the actual taking of the lock must * happen _before_ what follows. */ might_sleep(); if (rwsem_is_locked(&anon_vma->root->rwsem)) { anon_vma_lock_write(anon_vma); anon_vma_unlock_write(anon_vma); } kmem_cache_free(anon_vma_cachep, anon_vma); } static inline struct anon_vma_chain *anon_vma_chain_alloc(gfp_t gfp) { return kmem_cache_alloc(anon_vma_chain_cachep, gfp); } static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain) { kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain); } static void anon_vma_chain_link(struct vm_area_struct *vma, struct anon_vma_chain *avc, struct anon_vma *anon_vma) { avc->vma = vma; avc->anon_vma = anon_vma; list_add(&avc->same_vma, &vma->anon_vma_chain); anon_vma_interval_tree_insert(avc, &anon_vma->rb_root); } /** * __anon_vma_prepare - attach an anon_vma to a memory region * @vma: the memory region in question * * This makes sure the memory mapping described by 'vma' has * an 'anon_vma' attached to it, so that we can associate the * anonymous pages mapped into it with that anon_vma. * * The common case will be that we already have one, which * is handled inline by anon_vma_prepare(). But if * not we either need to find an adjacent mapping that we * can re-use the anon_vma from (very common when the only * reason for splitting a vma has been mprotect()), or we * allocate a new one. * * Anon-vma allocations are very subtle, because we may have * optimistically looked up an anon_vma in folio_lock_anon_vma_read() * and that may actually touch the rwsem even in the newly * allocated vma (it depends on RCU to make sure that the * anon_vma isn't actually destroyed). * * As a result, we need to do proper anon_vma locking even * for the new allocation. At the same time, we do not want * to do any locking for the common case of already having * an anon_vma. */ int __anon_vma_prepare(struct vm_area_struct *vma) { struct mm_struct *mm = vma->vm_mm; struct anon_vma *anon_vma, *allocated; struct anon_vma_chain *avc; mmap_assert_locked(mm); might_sleep(); avc = anon_vma_chain_alloc(GFP_KERNEL); if (!avc) goto out_enomem; anon_vma = find_mergeable_anon_vma(vma); allocated = NULL; if (!anon_vma) { anon_vma = anon_vma_alloc(); if (unlikely(!anon_vma)) goto out_enomem_free_avc; anon_vma->num_children++; /* self-parent link for new root */ allocated = anon_vma; } anon_vma_lock_write(anon_vma); /* page_table_lock to protect against threads */ spin_lock(&mm->page_table_lock); if (likely(!vma->anon_vma)) { vma->anon_vma = anon_vma; anon_vma_chain_link(vma, avc, anon_vma); anon_vma->num_active_vmas++; allocated = NULL; avc = NULL; } spin_unlock(&mm->page_table_lock); anon_vma_unlock_write(anon_vma); if (unlikely(allocated)) put_anon_vma(allocated); if (unlikely(avc)) anon_vma_chain_free(avc); return 0; out_enomem_free_avc: anon_vma_chain_free(avc); out_enomem: return -ENOMEM; } /* * This is a useful helper function for locking the anon_vma root as * we traverse the vma->anon_vma_chain, looping over anon_vma's that * have the same vma. * * Such anon_vma's should have the same root, so you'd expect to see * just a single mutex_lock for the whole traversal. */ static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct anon_vma *anon_vma) { struct anon_vma *new_root = anon_vma->root; if (new_root != root) { if (WARN_ON_ONCE(root)) up_write(&root->rwsem); root = new_root; down_write(&root->rwsem); } return root; } static inline void unlock_anon_vma_root(struct anon_vma *root) { if (root) up_write(&root->rwsem); } /* * Attach the anon_vmas from src to dst. * Returns 0 on success, -ENOMEM on failure. * * anon_vma_clone() is called by vma_expand(), vma_merge(), __split_vma(), * copy_vma() and anon_vma_fork(). The first four want an exact copy of src, * while the last one, anon_vma_fork(), may try to reuse an existing anon_vma to * prevent endless growth of anon_vma. Since dst->anon_vma is set to NULL before * call, we can identify this case by checking (!dst->anon_vma && * src->anon_vma). * * If (!dst->anon_vma && src->anon_vma) is true, this function tries to find * and reuse existing anon_vma which has no vmas and only one child anon_vma. * This prevents degradation of anon_vma hierarchy to endless linear chain in * case of constantly forking task. On the other hand, an anon_vma with more * than one child isn't reused even if there was no alive vma, thus rmap * walker has a good chance of avoiding scanning the whole hierarchy when it * searches where page is mapped. */ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) { struct anon_vma_chain *avc, *pavc; struct anon_vma *root = NULL; list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) { struct anon_vma *anon_vma; avc = anon_vma_chain_alloc(GFP_NOWAIT); if (unlikely(!avc)) { unlock_anon_vma_root(root); root = NULL; avc = anon_vma_chain_alloc(GFP_KERNEL); if (!avc) goto enomem_failure; } anon_vma = pavc->anon_vma; root = lock_anon_vma_root(root, anon_vma); anon_vma_chain_link(dst, avc, anon_vma); /* * Reuse existing anon_vma if it has no vma and only one * anon_vma child. * * Root anon_vma is never reused: * it has self-parent reference and at least one child. */ if (!dst->anon_vma && src->anon_vma && anon_vma->num_children < 2 && anon_vma->num_active_vmas == 0) dst->anon_vma = anon_vma; } if (dst->anon_vma) dst->anon_vma->num_active_vmas++; unlock_anon_vma_root(root); return 0; enomem_failure: /* * dst->anon_vma is dropped here otherwise its num_active_vmas can * be incorrectly decremented in unlink_anon_vmas(). * We can safely do this because callers of anon_vma_clone() don't care * about dst->anon_vma if anon_vma_clone() failed. */ dst->anon_vma = NULL; unlink_anon_vmas(dst); return -ENOMEM; } /* * Attach vma to its own anon_vma, as well as to the anon_vmas that * the corresponding VMA in the parent process is attached to. * Returns 0 on success, non-zero on failure. */ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) { struct anon_vma_chain *avc; struct anon_vma *anon_vma; int error; /* Don't bother if the parent process has no anon_vma here. */ if (!pvma->anon_vma) return 0; /* Drop inherited anon_vma, we'll reuse existing or allocate new. */ vma->anon_vma = NULL; /* * First, attach the new VMA to the parent VMA's anon_vmas, * so rmap can find non-COWed pages in child processes. */ error = anon_vma_clone(vma, pvma); if (error) return error; /* An existing anon_vma has been reused, all done then. */ if (vma->anon_vma) return 0; /* Then add our own anon_vma. */ anon_vma = anon_vma_alloc(); if (!anon_vma) goto out_error; anon_vma->num_active_vmas++; avc = anon_vma_chain_alloc(GFP_KERNEL); if (!avc) goto out_error_free_anon_vma; /* * The root anon_vma's rwsem is the lock actually used when we * lock any of the anon_vmas in this anon_vma tree. */ anon_vma->root = pvma->anon_vma->root; anon_vma->parent = pvma->anon_vma; /* * With refcounts, an anon_vma can stay around longer than the * process it belongs to. The root anon_vma needs to be pinned until * this anon_vma is freed, because the lock lives in the root. */ get_anon_vma(anon_vma->root); /* Mark this anon_vma as the one where our new (COWed) pages go. */ vma->anon_vma = anon_vma; anon_vma_lock_write(anon_vma); anon_vma_chain_link(vma, avc, anon_vma); anon_vma->parent->num_children++; anon_vma_unlock_write(anon_vma); return 0; out_error_free_anon_vma: put_anon_vma(anon_vma); out_error: unlink_anon_vmas(vma); return -ENOMEM; } void unlink_anon_vmas(struct vm_area_struct *vma) { struct anon_vma_chain *avc, *next; struct anon_vma *root = NULL; /* * Unlink each anon_vma chained to the VMA. This list is ordered * from newest to oldest, ensuring the root anon_vma gets freed last. */ list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { struct anon_vma *anon_vma = avc->anon_vma; root = lock_anon_vma_root(root, anon_vma); anon_vma_interval_tree_remove(avc, &anon_vma->rb_root); /* * Leave empty anon_vmas on the list - we'll need * to free them outside the lock. */ if (RB_EMPTY_ROOT(&anon_vma->rb_root.rb_root)) { anon_vma->parent->num_children--; continue; } list_del(&avc->same_vma); anon_vma_chain_free(avc); } if (vma->anon_vma) { vma->anon_vma->num_active_vmas--; /* * vma would still be needed after unlink, and anon_vma will be prepared * when handle fault. */ vma->anon_vma = NULL; } unlock_anon_vma_root(root); /* * Iterate the list once more, it now only contains empty and unlinked * anon_vmas, destroy them. Could not do before due to __put_anon_vma() * needing to write-acquire the anon_vma->root->rwsem. */ list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { struct anon_vma *anon_vma = avc->anon_vma; VM_WARN_ON(anon_vma->num_children); VM_WARN_ON(anon_vma->num_active_vmas); put_anon_vma(anon_vma); list_del(&avc->same_vma); anon_vma_chain_free(avc); } } static void anon_vma_ctor(void *data) { struct anon_vma *anon_vma = data; init_rwsem(&anon_vma->rwsem); atomic_set(&anon_vma->refcount, 0); anon_vma->rb_root = RB_ROOT_CACHED; } void __init anon_vma_init(void) { anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma), 0, SLAB_TYPESAFE_BY_RCU|SLAB_PANIC|SLAB_ACCOUNT, anon_vma_ctor); anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain, SLAB_PANIC|SLAB_ACCOUNT); } /* * Getting a lock on a stable anon_vma from a page off the LRU is tricky! * * Since there is no serialization what so ever against folio_remove_rmap_*() * the best this function can do is return a refcount increased anon_vma * that might have been relevant to this page. * * The page might have been remapped to a different anon_vma or the anon_vma * returned may already be freed (and even reused). * * In case it was remapped to a different anon_vma, the new anon_vma will be a * child of the old anon_vma, and the anon_vma lifetime rules will therefore * ensure that any anon_vma obtained from the page will still be valid for as * long as we observe page_mapped() [ hence all those page_mapped() tests ]. * * All users of this function must be very careful when walking the anon_vma * chain and verify that the page in question is indeed mapped in it * [ something equivalent to page_mapped_in_vma() ]. * * Since anon_vma's slab is SLAB_TYPESAFE_BY_RCU and we know from * folio_remove_rmap_*() that the anon_vma pointer from page->mapping is valid * if there is a mapcount, we can dereference the anon_vma after observing * those. * * NOTE: the caller should hold folio lock when calling this. */ struct anon_vma *folio_get_anon_vma(const struct folio *folio) { struct anon_vma *anon_vma = NULL; unsigned long anon_mapping; VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio); rcu_read_lock(); anon_mapping = (unsigned long)READ_ONCE(folio->mapping); if ((anon_mapping & FOLIO_MAPPING_FLAGS) != FOLIO_MAPPING_ANON) goto out; if (!folio_mapped(folio)) goto out; anon_vma = (struct anon_vma *) (anon_mapping - FOLIO_MAPPING_ANON); if (!atomic_inc_not_zero(&anon_vma->refcount)) { anon_vma = NULL; goto out; } /* * If this folio is still mapped, then its anon_vma cannot have been * freed. But if it has been unmapped, we have no security against the * anon_vma structure being freed and reused (for another anon_vma: * SLAB_TYPESAFE_BY_RCU guarantees that - so the atomic_inc_not_zero() * above cannot corrupt). */ if (!folio_mapped(folio)) { rcu_read_unlock(); put_anon_vma(anon_vma); return NULL; } out: rcu_read_unlock(); return anon_vma; } /* * Similar to folio_get_anon_vma() except it locks the anon_vma. * * Its a little more complex as it tries to keep the fast path to a single * atomic op -- the trylock. If we fail the trylock, we fall back to getting a * reference like with folio_get_anon_vma() and then block on the mutex * on !rwc->try_lock case. */ struct anon_vma *folio_lock_anon_vma_read(const struct folio *folio, struct rmap_walk_control *rwc) { struct anon_vma *anon_vma = NULL; struct anon_vma *root_anon_vma; unsigned long anon_mapping; VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio); rcu_read_lock(); anon_mapping = (unsigned long)READ_ONCE(folio->mapping); if ((anon_mapping & FOLIO_MAPPING_FLAGS) != FOLIO_MAPPING_ANON) goto out; if (!folio_mapped(folio)) goto out; anon_vma = (struct anon_vma *) (anon_mapping - FOLIO_MAPPING_ANON); root_anon_vma = READ_ONCE(anon_vma->root); if (down_read_trylock(&root_anon_vma->rwsem)) { /* * If the folio is still mapped, then this anon_vma is still * its anon_vma, and holding the mutex ensures that it will * not go away, see anon_vma_free(). */ if (!folio_mapped(folio)) { up_read(&root_anon_vma->rwsem); anon_vma = NULL; } goto out; } if (rwc && rwc->try_lock) { anon_vma = NULL; rwc->contended = true; goto out; } /* trylock failed, we got to sleep */ if (!atomic_inc_not_zero(&anon_vma->refcount)) { anon_vma = NULL; goto out; } if (!folio_mapped(folio)) { rcu_read_unlock(); put_anon_vma(anon_vma); return NULL; } /* we pinned the anon_vma, its safe to sleep */ rcu_read_unlock(); anon_vma_lock_read(anon_vma); if (atomic_dec_and_test(&anon_vma->refcount)) { /* * Oops, we held the last refcount, release the lock * and bail -- can't simply use put_anon_vma() because * we'll deadlock on the anon_vma_lock_write() recursion. */ anon_vma_unlock_read(anon_vma); __put_anon_vma(anon_vma); anon_vma = NULL; } return anon_vma; out: rcu_read_unlock(); return anon_vma; } #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH /* * Flush TLB entries for recently unmapped pages from remote CPUs. It is * important if a PTE was dirty when it was unmapped that it's flushed * before any IO is initiated on the page to prevent lost writes. Similarly, * it must be flushed before freeing to prevent data leakage. */ void try_to_unmap_flush(void) { struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc; if (!tlb_ubc->flush_required) return; arch_tlbbatch_flush(&tlb_ubc->arch); tlb_ubc->flush_required = false; tlb_ubc->writable = false; } /* Flush iff there are potentially writable TLB entries that can race with IO */ void try_to_unmap_flush_dirty(void) { struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc; if (tlb_ubc->writable) try_to_unmap_flush(); } /* * Bits 0-14 of mm->tlb_flush_batched record pending generations. * Bits 16-30 of mm->tlb_flush_batched bit record flushed generations. */ #define TLB_FLUSH_BATCH_FLUSHED_SHIFT 16 #define TLB_FLUSH_BATCH_PENDING_MASK \ ((1 << (TLB_FLUSH_BATCH_FLUSHED_SHIFT - 1)) - 1) #define TLB_FLUSH_BATCH_PENDING_LARGE \ (TLB_FLUSH_BATCH_PENDING_MASK / 2) static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval, unsigned long start, unsigned long end) { struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc; int batch; bool writable = pte_dirty(pteval); if (!pte_accessible(mm, pteval)) return; arch_tlbbatch_add_pending(&tlb_ubc->arch, mm, start, end); tlb_ubc->flush_required = true; /* * Ensure compiler does not re-order the setting of tlb_flush_batched * before the PTE is cleared. */ barrier(); batch = atomic_read(&mm->tlb_flush_batched); retry: if ((batch & TLB_FLUSH_BATCH_PENDING_MASK) > TLB_FLUSH_BATCH_PENDING_LARGE) { /* * Prevent `pending' from catching up with `flushed' because of * overflow. Reset `pending' and `flushed' to be 1 and 0 if * `pending' becomes large. */ if (!atomic_try_cmpxchg(&mm->tlb_flush_batched, &batch, 1)) goto retry; } else { atomic_inc(&mm->tlb_flush_batched); } /* * If the PTE was dirty then it's best to assume it's writable. The * caller must use try_to_unmap_flush_dirty() or try_to_unmap_flush() * before the page is queued for IO. */ if (writable) tlb_ubc->writable = true; } /* * Returns true if the TLB flush should be deferred to the end of a batch of * unmap operations to reduce IPIs. */ static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags) { if (!(flags & TTU_BATCH_FLUSH)) return false; return arch_tlbbatch_should_defer(mm); } /* * Reclaim unmaps pages under the PTL but do not flush the TLB prior to * releasing the PTL if TLB flushes are batched. It's possible for a parallel * operation such as mprotect or munmap to race between reclaim unmapping * the page and flushing the page. If this race occurs, it potentially allows * access to data via a stale TLB entry. Tracking all mm's that have TLB * batching in flight would be expensive during reclaim so instead track * whether TLB batching occurred in the past and if so then do a flush here * if required. This will cost one additional flush per reclaim cycle paid * by the first operation at risk such as mprotect and mumap. * * This must be called under the PTL so that an access to tlb_flush_batched * that is potentially a "reclaim vs mprotect/munmap/etc" race will synchronise * via the PTL. */ void flush_tlb_batched_pending(struct mm_struct *mm) { int batch = atomic_read(&mm->tlb_flush_batched); int pending = batch & TLB_FLUSH_BATCH_PENDING_MASK; int flushed = batch >> TLB_FLUSH_BATCH_FLUSHED_SHIFT; if (pending != flushed) { flush_tlb_mm(mm); /* * If the new TLB flushing is pending during flushing, leave * mm->tlb_flush_batched as is, to avoid losing flushing. */ atomic_cmpxchg(&mm->tlb_flush_batched, batch, pending | (pending << TLB_FLUSH_BATCH_FLUSHED_SHIFT)); } } #else static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval, unsigned long start, unsigned long end) { } static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags) { return false; } #endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */ /** * page_address_in_vma - The virtual address of a page in this VMA. * @folio: The folio containing the page. * @page: The page within the folio. * @vma: The VMA we need to know the address in. * * Calculates the user virtual address of this page in the specified VMA. * It is the caller's responsibility to check the page is actually * within the VMA. There may not currently be a PTE pointing at this * page, but if a page fault occurs at this address, this is the page * which will be accessed. * * Context: Caller should hold a reference to the folio. Caller should * hold a lock (eg the i_mmap_lock or the mmap_lock) which keeps the * VMA from being altered. * * Return: The virtual address corresponding to this page in the VMA. */ unsigned long page_address_in_vma(const struct folio *folio, const struct page *page, const struct vm_area_struct *vma) { if (folio_test_anon(folio)) { struct anon_vma *anon_vma = folio_anon_vma(folio); /* * Note: swapoff's unuse_vma() is more efficient with this * check, and needs it to match anon_vma when KSM is active. */ if (!vma->anon_vma || !anon_vma || vma->anon_vma->root != anon_vma->root) return -EFAULT; } else if (!vma->vm_file) { return -EFAULT; } else if (vma->vm_file->f_mapping != folio->mapping) { return -EFAULT; } /* KSM folios don't reach here because of the !anon_vma check */ return vma_address(vma, page_pgoff(folio, page), 1); } /* * Returns the actual pmd_t* where we expect 'address' to be mapped from, or * NULL if it doesn't exist. No guarantees / checks on what the pmd_t* * represents. */ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address) { pgd_t *pgd; p4d_t *p4d; pud_t *pud; pmd_t *pmd = NULL; pgd = pgd_offset(mm, address); if (!pgd_present(*pgd)) goto out; p4d = p4d_offset(pgd, address); if (!p4d_present(*p4d)) goto out; pud = pud_offset(p4d, address); if (!pud_present(*pud)) goto out; pmd = pmd_offset(pud, address); out: return pmd; } struct folio_referenced_arg { int mapcount; int referenced; vm_flags_t vm_flags; struct mem_cgroup *memcg; }; /* * arg: folio_referenced_arg will be passed */ static bool folio_referenced_one(struct folio *folio, struct vm_area_struct *vma, unsigned long address, void *arg) { struct folio_referenced_arg *pra = arg; DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0); int ptes = 0, referenced = 0; while (page_vma_mapped_walk(&pvmw)) { address = pvmw.address; if (vma->vm_flags & VM_LOCKED) { ptes++; pra->mapcount--; /* Only mlock fully mapped pages */ if (pvmw.pte && ptes != pvmw.nr_pages) continue; /* * All PTEs must be protected by page table lock in * order to mlock the page. * * If page table boundary has been cross, current ptl * only protect part of ptes. */ if (pvmw.flags & PVMW_PGTABLE_CROSSED) continue; /* Restore the mlock which got missed */ mlock_vma_folio(folio, vma); page_vma_mapped_walk_done(&pvmw); pra->vm_flags |= VM_LOCKED; return false; /* To break the loop */ } /* * Skip the non-shared swapbacked folio mapped solely by * the exiting or OOM-reaped process. This avoids redundant * swap-out followed by an immediate unmap. */ if ((!atomic_read(&vma->vm_mm->mm_users) || check_stable_address_space(vma->vm_mm)) && folio_test_anon(folio) && folio_test_swapbacked(folio) && !folio_maybe_mapped_shared(folio)) { pra->referenced = -1; page_vma_mapped_walk_done(&pvmw); return false; } if (lru_gen_enabled() && pvmw.pte) { if (lru_gen_look_around(&pvmw)) referenced++; } else if (pvmw.pte) { if (ptep_clear_flush_young_notify(vma, address, pvmw.pte)) referenced++; } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { if (pmdp_clear_flush_young_notify(vma, address, pvmw.pmd)) referenced++; } else { /* unexpected pmd-mapped folio? */ WARN_ON_ONCE(1); } pra->mapcount--; } if (referenced) folio_clear_idle(folio); if (folio_test_clear_young(folio)) referenced++; if (referenced) { pra->referenced++; pra->vm_flags |= vma->vm_flags & ~VM_LOCKED; } if (!pra->mapcount) return false; /* To break the loop */ return true; } static bool invalid_folio_referenced_vma(struct vm_area_struct *vma, void *arg) { struct folio_referenced_arg *pra = arg; struct mem_cgroup *memcg = pra->memcg; /* * Ignore references from this mapping if it has no recency. If the * folio has been used in another mapping, we will catch it; if this * other mapping is already gone, the unmap path will have set the * referenced flag or activated the folio in zap_pte_range(). */ if (!vma_has_recency(vma)) return true; /* * If we are reclaiming on behalf of a cgroup, skip counting on behalf * of references from different cgroups. */ if (memcg && !mm_match_cgroup(vma->vm_mm, memcg)) return true; return false; } /** * folio_referenced() - Test if the folio was referenced. * @folio: The folio to test. * @is_locked: Caller holds lock on the folio. * @memcg: target memory cgroup * @vm_flags: A combination of all the vma->vm_flags which referenced the folio. * * Quick test_and_clear_referenced for all mappings of a folio, * * Return: The number of mappings which referenced the folio. Return -1 if * the function bailed out due to rmap lock contention. */ int folio_referenced(struct folio *folio, int is_locked, struct mem_cgroup *memcg, vm_flags_t *vm_flags) { bool we_locked = false; struct folio_referenced_arg pra = { .mapcount = folio_mapcount(folio), .memcg = memcg, }; struct rmap_walk_control rwc = { .rmap_one = folio_referenced_one, .arg = (void *)&pra, .anon_lock = folio_lock_anon_vma_read, .try_lock = true, .invalid_vma = invalid_folio_referenced_vma, }; *vm_flags = 0; if (!pra.mapcount) return 0; if (!folio_raw_mapping(folio)) return 0; if (!is_locked) { we_locked = folio_trylock(folio); if (!we_locked) return 1; } rmap_walk(folio, &rwc); *vm_flags = pra.vm_flags; if (we_locked) folio_unlock(folio); return rwc.contended ? -1 : pra.referenced; } static int page_vma_mkclean_one(struct page_vma_mapped_walk *pvmw) { int cleaned = 0; struct vm_area_struct *vma = pvmw->vma; struct mmu_notifier_range range; unsigned long address = pvmw->address; /* * We have to assume the worse case ie pmd for invalidation. Note that * the folio can not be freed from this function. */ mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, 0, vma->vm_mm, address, vma_address_end(pvmw)); mmu_notifier_invalidate_range_start(&range); while (page_vma_mapped_walk(pvmw)) { int ret = 0; address = pvmw->address; if (pvmw->pte) { pte_t *pte = pvmw->pte; pte_t entry = ptep_get(pte); /* * PFN swap PTEs, such as device-exclusive ones, that * actually map pages are clean and not writable from a * CPU perspective. The MMU notifier takes care of any * device aspects. */ if (!pte_present(entry)) continue; if (!pte_dirty(entry) && !pte_write(entry)) continue; flush_cache_page(vma, address, pte_pfn(entry)); entry = ptep_clear_flush(vma, address, pte); entry = pte_wrprotect(entry); entry = pte_mkclean(entry); set_pte_at(vma->vm_mm, address, pte, entry); ret = 1; } else { #ifdef CONFIG_TRANSPARENT_HUGEPAGE pmd_t *pmd = pvmw->pmd; pmd_t entry = pmdp_get(pmd); /* * Please see the comment above (!pte_present). * A non present PMD is not writable from a CPU * perspective. */ if (!pmd_present(entry)) continue; if (!pmd_dirty(entry) && !pmd_write(entry)) continue; flush_cache_range(vma, address, address + HPAGE_PMD_SIZE); entry = pmdp_invalidate(vma, address, pmd); entry = pmd_wrprotect(entry); entry = pmd_mkclean(entry); set_pmd_at(vma->vm_mm, address, pmd, entry); ret = 1; #else /* unexpected pmd-mapped folio? */ WARN_ON_ONCE(1); #endif } if (ret) cleaned++; } mmu_notifier_invalidate_range_end(&range); return cleaned; } static bool page_mkclean_one(struct folio *folio, struct vm_area_struct *vma, unsigned long address, void *arg) { DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, PVMW_SYNC); int *cleaned = arg; *cleaned += page_vma_mkclean_one(&pvmw); return true; } static bool invalid_mkclean_vma(struct vm_area_struct *vma, void *arg) { if (vma->vm_flags & VM_SHARED) return false; return true; } int folio_mkclean(struct folio *folio) { int cleaned = 0; struct address_space *mapping; struct rmap_walk_control rwc = { .arg = (void *)&cleaned, .rmap_one = page_mkclean_one, .invalid_vma = invalid_mkclean_vma, }; BUG_ON(!folio_test_locked(folio)); if (!folio_mapped(folio)) return 0; mapping = folio_mapping(folio); if (!mapping) return 0; rmap_walk(folio, &rwc); return cleaned; } EXPORT_SYMBOL_GPL(folio_mkclean); struct wrprotect_file_state { int cleaned; pgoff_t pgoff; unsigned long pfn; unsigned long nr_pages; }; static bool mapping_wrprotect_range_one(struct folio *folio, struct vm_area_struct *vma, unsigned long address, void *arg) { struct wrprotect_file_state *state = (struct wrprotect_file_state *)arg; struct page_vma_mapped_walk pvmw = { .pfn = state->pfn, .nr_pages = state->nr_pages, .pgoff = state->pgoff, .vma = vma, .address = address, .flags = PVMW_SYNC, }; state->cleaned += page_vma_mkclean_one(&pvmw); return true; } static void __rmap_walk_file(struct folio *folio, struct address_space *mapping, pgoff_t pgoff_start, unsigned long nr_pages, struct rmap_walk_control *rwc, bool locked); /** * mapping_wrprotect_range() - Write-protect all mappings in a specified range. * * @mapping: The mapping whose reverse mapping should be traversed. * @pgoff: The page offset at which @pfn is mapped within @mapping. * @pfn: The PFN of the page mapped in @mapping at @pgoff. * @nr_pages: The number of physically contiguous base pages spanned. * * Traverses the reverse mapping, finding all VMAs which contain a shared * mapping of the pages in the specified range in @mapping, and write-protects * them (that is, updates the page tables to mark the mappings read-only such * that a write protection fault arises when the mappings are written to). * * The @pfn value need not refer to a folio, but rather can reference a kernel * allocation which is mapped into userland. We therefore do not require that * the page maps to a folio with a valid mapping or index field, rather the * caller specifies these in @mapping and @pgoff. * * Return: the number of write-protected PTEs, or an error. */ int mapping_wrprotect_range(struct address_space *mapping, pgoff_t pgoff, unsigned long pfn, unsigned long nr_pages) { struct wrprotect_file_state state = { .cleaned = 0, .pgoff = pgoff, .pfn = pfn, .nr_pages = nr_pages, }; struct rmap_walk_control rwc = { .arg = (void *)&state, .rmap_one = mapping_wrprotect_range_one, .invalid_vma = invalid_mkclean_vma, }; if (!mapping) return 0; __rmap_walk_file(/* folio = */NULL, mapping, pgoff, nr_pages, &rwc, /* locked = */false); return state.cleaned; } EXPORT_SYMBOL_GPL(mapping_wrprotect_range); /** * pfn_mkclean_range - Cleans the PTEs (including PMDs) mapped with range of * [@pfn, @pfn + @nr_pages) at the specific offset (@pgoff) * within the @vma of shared mappings. And since clean PTEs * should also be readonly, write protects them too. * @pfn: start pfn. * @nr_pages: number of physically contiguous pages srarting with @pfn. * @pgoff: page offset that the @pfn mapped with. * @vma: vma that @pfn mapped within. * * Returns the number of cleaned PTEs (including PMDs). */ int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff, struct vm_area_struct *vma) { struct page_vma_mapped_walk pvmw = { .pfn = pfn, .nr_pages = nr_pages, .pgoff = pgoff, .vma = vma, .flags = PVMW_SYNC, }; if (invalid_mkclean_vma(vma, NULL)) return 0; pvmw.address = vma_address(vma, pgoff, nr_pages); VM_BUG_ON_VMA(pvmw.address == -EFAULT, vma); return page_vma_mkclean_one(&pvmw); } static void __folio_mod_stat(struct folio *folio, int nr, int nr_pmdmapped) { int idx; if (nr) { idx = folio_test_anon(folio) ? NR_ANON_MAPPED : NR_FILE_MAPPED; lruvec_stat_mod_folio(folio, idx, nr); } if (nr_pmdmapped) { if (folio_test_anon(folio)) { idx = NR_ANON_THPS; lruvec_stat_mod_folio(folio, idx, nr_pmdmapped); } else { /* NR_*_PMDMAPPED are not maintained per-memcg */ idx = folio_test_swapbacked(folio) ? NR_SHMEM_PMDMAPPED : NR_FILE_PMDMAPPED; __mod_node_page_state(folio_pgdat(folio), idx, nr_pmdmapped); } } } static __always_inline void __folio_add_rmap(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *vma, enum pgtable_level level) { atomic_t *mapped = &folio->_nr_pages_mapped; const int orig_nr_pages = nr_pages; int first = 0, nr = 0, nr_pmdmapped = 0; __folio_rmap_sanity_checks(folio, page, nr_pages, level); switch (level) { case PGTABLE_LEVEL_PTE: if (!folio_test_large(folio)) { nr = atomic_inc_and_test(&folio->_mapcount); break; } if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) { nr = folio_add_return_large_mapcount(folio, orig_nr_pages, vma); if (nr == orig_nr_pages) /* Was completely unmapped. */ nr = folio_large_nr_pages(folio); else nr = 0; break; } do { first += atomic_inc_and_test(&page->_mapcount); } while (page++, --nr_pages > 0); if (first && atomic_add_return_relaxed(first, mapped) < ENTIRELY_MAPPED) nr = first; folio_add_large_mapcount(folio, orig_nr_pages, vma); break; case PGTABLE_LEVEL_PMD: case PGTABLE_LEVEL_PUD: first = atomic_inc_and_test(&folio->_entire_mapcount); if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) { if (level == PGTABLE_LEVEL_PMD && first) nr_pmdmapped = folio_large_nr_pages(folio); nr = folio_inc_return_large_mapcount(folio, vma); if (nr == 1) /* Was completely unmapped. */ nr = folio_large_nr_pages(folio); else nr = 0; break; } if (first) { nr = atomic_add_return_relaxed(ENTIRELY_MAPPED, mapped); if (likely(nr < ENTIRELY_MAPPED + ENTIRELY_MAPPED)) { nr_pages = folio_large_nr_pages(folio); /* * We only track PMD mappings of PMD-sized * folios separately. */ if (level == PGTABLE_LEVEL_PMD) nr_pmdmapped = nr_pages; nr = nr_pages - (nr & FOLIO_PAGES_MAPPED); /* Raced ahead of a remove and another add? */ if (unlikely(nr < 0)) nr = 0; } else { /* Raced ahead of a remove of ENTIRELY_MAPPED */ nr = 0; } } folio_inc_large_mapcount(folio, vma); break; default: BUILD_BUG(); } __folio_mod_stat(folio, nr, nr_pmdmapped); } /** * folio_move_anon_rmap - move a folio to our anon_vma * @folio: The folio to move to our anon_vma * @vma: The vma the folio belongs to * * When a folio belongs exclusively to one process after a COW event, * that folio can be moved into the anon_vma that belongs to just that * process, so the rmap code will not search the parent or sibling processes. */ void folio_move_anon_rmap(struct folio *folio, struct vm_area_struct *vma) { void *anon_vma = vma->anon_vma; VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_VMA(!anon_vma, vma); anon_vma += FOLIO_MAPPING_ANON; /* * Ensure that anon_vma and the FOLIO_MAPPING_ANON bit are written * simultaneously, so a concurrent reader (eg folio_referenced()'s * folio_test_anon()) will not see one without the other. */ WRITE_ONCE(folio->mapping, anon_vma); } /** * __folio_set_anon - set up a new anonymous rmap for a folio * @folio: The folio to set up the new anonymous rmap for. * @vma: VM area to add the folio to. * @address: User virtual address of the mapping * @exclusive: Whether the folio is exclusive to the process. */ static void __folio_set_anon(struct folio *folio, struct vm_area_struct *vma, unsigned long address, bool exclusive) { struct anon_vma *anon_vma = vma->anon_vma; BUG_ON(!anon_vma); /* * If the folio isn't exclusive to this vma, we must use the _oldest_ * possible anon_vma for the folio mapping! */ if (!exclusive) anon_vma = anon_vma->root; /* * page_idle does a lockless/optimistic rmap scan on folio->mapping. * Make sure the compiler doesn't split the stores of anon_vma and * the FOLIO_MAPPING_ANON type identifier, otherwise the rmap code * could mistake the mapping for a struct address_space and crash. */ anon_vma = (void *) anon_vma + FOLIO_MAPPING_ANON; WRITE_ONCE(folio->mapping, (struct address_space *) anon_vma); folio->index = linear_page_index(vma, address); } /** * __page_check_anon_rmap - sanity check anonymous rmap addition * @folio: The folio containing @page. * @page: the page to check the mapping of * @vma: the vm area in which the mapping is added * @address: the user virtual address mapped */ static void __page_check_anon_rmap(const struct folio *folio, const struct page *page, struct vm_area_struct *vma, unsigned long address) { /* * The page's anon-rmap details (mapping and index) are guaranteed to * be set up correctly at this point. * * We have exclusion against folio_add_anon_rmap_*() because the caller * always holds the page locked. * * We have exclusion against folio_add_new_anon_rmap because those pages * are initially only visible via the pagetables, and the pte is locked * over the call to folio_add_new_anon_rmap. */ VM_BUG_ON_FOLIO(folio_anon_vma(folio)->root != vma->anon_vma->root, folio); VM_BUG_ON_PAGE(page_pgoff(folio, page) != linear_page_index(vma, address), page); } static __always_inline void __folio_add_anon_rmap(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *vma, unsigned long address, rmap_t flags, enum pgtable_level level) { int i; VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio); __folio_add_rmap(folio, page, nr_pages, vma, level); if (likely(!folio_test_ksm(folio))) __page_check_anon_rmap(folio, page, vma, address); if (flags & RMAP_EXCLUSIVE) { switch (level) { case PGTABLE_LEVEL_PTE: for (i = 0; i < nr_pages; i++) SetPageAnonExclusive(page + i); break; case PGTABLE_LEVEL_PMD: SetPageAnonExclusive(page); break; case PGTABLE_LEVEL_PUD: /* * Keep the compiler happy, we don't support anonymous * PUD mappings. */ WARN_ON_ONCE(1); break; default: BUILD_BUG(); } } VM_WARN_ON_FOLIO(!folio_test_large(folio) && PageAnonExclusive(page) && atomic_read(&folio->_mapcount) > 0, folio); for (i = 0; i < nr_pages; i++) { struct page *cur_page = page + i; VM_WARN_ON_FOLIO(folio_test_large(folio) && folio_entire_mapcount(folio) > 1 && PageAnonExclusive(cur_page), folio); if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) continue; /* * While PTE-mapping a THP we have a PMD and a PTE * mapping. */ VM_WARN_ON_FOLIO(atomic_read(&cur_page->_mapcount) > 0 && PageAnonExclusive(cur_page), folio); } /* * Only mlock it if the folio is fully mapped to the VMA. * * Partially mapped folios can be split on reclaim and part outside * of mlocked VMA can be evicted or freed. */ if (folio_nr_pages(folio) == nr_pages) mlock_vma_folio(folio, vma); } /** * folio_add_anon_rmap_ptes - add PTE mappings to a page range of an anon folio * @folio: The folio to add the mappings to * @page: The first page to add * @nr_pages: The number of pages which will be mapped * @vma: The vm area in which the mappings are added * @address: The user virtual address of the first page to map * @flags: The rmap flags * * The page range of folio is defined by [first_page, first_page + nr_pages) * * The caller needs to hold the page table lock, and the page must be locked in * the anon_vma case: to serialize mapping,index checking after setting, * and to ensure that an anon folio is not being upgraded racily to a KSM folio * (but KSM folios are never downgraded). */ void folio_add_anon_rmap_ptes(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *vma, unsigned long address, rmap_t flags) { __folio_add_anon_rmap(folio, page, nr_pages, vma, address, flags, PGTABLE_LEVEL_PTE); } /** * folio_add_anon_rmap_pmd - add a PMD mapping to a page range of an anon folio * @folio: The folio to add the mapping to * @page: The first page to add * @vma: The vm area in which the mapping is added * @address: The user virtual address of the first page to map * @flags: The rmap flags * * The page range of folio is defined by [first_page, first_page + HPAGE_PMD_NR) * * The caller needs to hold the page table lock, and the page must be locked in * the anon_vma case: to serialize mapping,index checking after setting. */ void folio_add_anon_rmap_pmd(struct folio *folio, struct page *page, struct vm_area_struct *vma, unsigned long address, rmap_t flags) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE __folio_add_anon_rmap(folio, page, HPAGE_PMD_NR, vma, address, flags, PGTABLE_LEVEL_PMD); #else WARN_ON_ONCE(true); #endif } /** * folio_add_new_anon_rmap - Add mapping to a new anonymous folio. * @folio: The folio to add the mapping to. * @vma: the vm area in which the mapping is added * @address: the user virtual address mapped * @flags: The rmap flags * * Like folio_add_anon_rmap_*() but must only be called on *new* folios. * This means the inc-and-test can be bypassed. * The folio doesn't necessarily need to be locked while it's exclusive * unless two threads map it concurrently. However, the folio must be * locked if it's shared. * * If the folio is pmd-mappable, it is accounted as a THP. */ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma, unsigned long address, rmap_t flags) { const bool exclusive = flags & RMAP_EXCLUSIVE; int nr = 1, nr_pmdmapped = 0; VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio); VM_WARN_ON_FOLIO(!exclusive && !folio_test_locked(folio), folio); /* * VM_DROPPABLE mappings don't swap; instead they're just dropped when * under memory pressure. */ if (!folio_test_swapbacked(folio) && !(vma->vm_flags & VM_DROPPABLE)) __folio_set_swapbacked(folio); __folio_set_anon(folio, vma, address, exclusive); if (likely(!folio_test_large(folio))) { /* increment count (starts at -1) */ atomic_set(&folio->_mapcount, 0); if (exclusive) SetPageAnonExclusive(&folio->page); } else if (!folio_test_pmd_mappable(folio)) { int i; nr = folio_large_nr_pages(folio); for (i = 0; i < nr; i++) { struct page *page = folio_page(folio, i); if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT)) /* increment count (starts at -1) */ atomic_set(&page->_mapcount, 0); if (exclusive) SetPageAnonExclusive(page); } folio_set_large_mapcount(folio, nr, vma); if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT)) atomic_set(&folio->_nr_pages_mapped, nr); } else { nr = folio_large_nr_pages(folio); /* increment count (starts at -1) */ atomic_set(&folio->_entire_mapcount, 0); folio_set_large_mapcount(folio, 1, vma); if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT)) atomic_set(&folio->_nr_pages_mapped, ENTIRELY_MAPPED); if (exclusive) SetPageAnonExclusive(&folio->page); nr_pmdmapped = nr; } VM_WARN_ON_ONCE(address < vma->vm_start || address + (nr << PAGE_SHIFT) > vma->vm_end); __folio_mod_stat(folio, nr, nr_pmdmapped); mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, 1); } static __always_inline void __folio_add_file_rmap(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *vma, enum pgtable_level level) { VM_WARN_ON_FOLIO(folio_test_anon(folio), folio); __folio_add_rmap(folio, page, nr_pages, vma, level); /* * Only mlock it if the folio is fully mapped to the VMA. * * Partially mapped folios can be split on reclaim and part outside * of mlocked VMA can be evicted or freed. */ if (folio_nr_pages(folio) == nr_pages) mlock_vma_folio(folio, vma); } /** * folio_add_file_rmap_ptes - add PTE mappings to a page range of a folio * @folio: The folio to add the mappings to * @page: The first page to add * @nr_pages: The number of pages that will be mapped using PTEs * @vma: The vm area in which the mappings are added * * The page range of the folio is defined by [page, page + nr_pages) * * The caller needs to hold the page table lock. */ void folio_add_file_rmap_ptes(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *vma) { __folio_add_file_rmap(folio, page, nr_pages, vma, PGTABLE_LEVEL_PTE); } /** * folio_add_file_rmap_pmd - add a PMD mapping to a page range of a folio * @folio: The folio to add the mapping to * @page: The first page to add * @vma: The vm area in which the mapping is added * * The page range of the folio is defined by [page, page + HPAGE_PMD_NR) * * The caller needs to hold the page table lock. */ void folio_add_file_rmap_pmd(struct folio *folio, struct page *page, struct vm_area_struct *vma) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE __folio_add_file_rmap(folio, page, HPAGE_PMD_NR, vma, PGTABLE_LEVEL_PMD); #else WARN_ON_ONCE(true); #endif } /** * folio_add_file_rmap_pud - add a PUD mapping to a page range of a folio * @folio: The folio to add the mapping to * @page: The first page to add * @vma: The vm area in which the mapping is added * * The page range of the folio is defined by [page, page + HPAGE_PUD_NR) * * The caller needs to hold the page table lock. */ void folio_add_file_rmap_pud(struct folio *folio, struct page *page, struct vm_area_struct *vma) { #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) __folio_add_file_rmap(folio, page, HPAGE_PUD_NR, vma, PGTABLE_LEVEL_PUD); #else WARN_ON_ONCE(true); #endif } static __always_inline void __folio_remove_rmap(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *vma, enum pgtable_level level) { atomic_t *mapped = &folio->_nr_pages_mapped; int last = 0, nr = 0, nr_pmdmapped = 0; bool partially_mapped = false; __folio_rmap_sanity_checks(folio, page, nr_pages, level); switch (level) { case PGTABLE_LEVEL_PTE: if (!folio_test_large(folio)) { nr = atomic_add_negative(-1, &folio->_mapcount); break; } if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) { nr = folio_sub_return_large_mapcount(folio, nr_pages, vma); if (!nr) { /* Now completely unmapped. */ nr = folio_large_nr_pages(folio); } else { partially_mapped = nr < folio_large_nr_pages(folio) && !folio_entire_mapcount(folio); nr = 0; } break; } folio_sub_large_mapcount(folio, nr_pages, vma); do { last += atomic_add_negative(-1, &page->_mapcount); } while (page++, --nr_pages > 0); if (last && atomic_sub_return_relaxed(last, mapped) < ENTIRELY_MAPPED) nr = last; partially_mapped = nr && atomic_read(mapped); break; case PGTABLE_LEVEL_PMD: case PGTABLE_LEVEL_PUD: if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) { last = atomic_add_negative(-1, &folio->_entire_mapcount); if (level == PGTABLE_LEVEL_PMD && last) nr_pmdmapped = folio_large_nr_pages(folio); nr = folio_dec_return_large_mapcount(folio, vma); if (!nr) { /* Now completely unmapped. */ nr = folio_large_nr_pages(folio); } else { partially_mapped = last && nr < folio_large_nr_pages(folio); nr = 0; } break; } folio_dec_large_mapcount(folio, vma); last = atomic_add_negative(-1, &folio->_entire_mapcount); if (last) { nr = atomic_sub_return_relaxed(ENTIRELY_MAPPED, mapped); if (likely(nr < ENTIRELY_MAPPED)) { nr_pages = folio_large_nr_pages(folio); if (level == PGTABLE_LEVEL_PMD) nr_pmdmapped = nr_pages; nr = nr_pages - nr; /* Raced ahead of another remove and an add? */ if (unlikely(nr < 0)) nr = 0; } else { /* An add of ENTIRELY_MAPPED raced ahead */ nr = 0; } } partially_mapped = nr && nr < nr_pmdmapped; break; default: BUILD_BUG(); } /* * Queue anon large folio for deferred split if at least one page of * the folio is unmapped and at least one page is still mapped. * * Check partially_mapped first to ensure it is a large folio. * * Device private folios do not support deferred splitting and * shrinker based scanning of the folios to free. */ if (partially_mapped && folio_test_anon(folio) && !folio_test_partially_mapped(folio) && !folio_is_device_private(folio)) deferred_split_folio(folio, true); __folio_mod_stat(folio, -nr, -nr_pmdmapped); /* * It would be tidy to reset folio_test_anon mapping when fully * unmapped, but that might overwrite a racing folio_add_anon_rmap_*() * which increments mapcount after us but sets mapping before us: * so leave the reset to free_pages_prepare, and remember that * it's only reliable while mapped. */ munlock_vma_folio(folio, vma); } /** * folio_remove_rmap_ptes - remove PTE mappings from a page range of a folio * @folio: The folio to remove the mappings from * @page: The first page to remove * @nr_pages: The number of pages that will be removed from the mapping * @vma: The vm area from which the mappings are removed * * The page range of the folio is defined by [page, page + nr_pages) * * The caller needs to hold the page table lock. */ void folio_remove_rmap_ptes(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *vma) { __folio_remove_rmap(folio, page, nr_pages, vma, PGTABLE_LEVEL_PTE); } /** * folio_remove_rmap_pmd - remove a PMD mapping from a page range of a folio * @folio: The folio to remove the mapping from * @page: The first page to remove * @vma: The vm area from which the mapping is removed * * The page range of the folio is defined by [page, page + HPAGE_PMD_NR) * * The caller needs to hold the page table lock. */ void folio_remove_rmap_pmd(struct folio *folio, struct page *page, struct vm_area_struct *vma) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE __folio_remove_rmap(folio, page, HPAGE_PMD_NR, vma, PGTABLE_LEVEL_PMD); #else WARN_ON_ONCE(true); #endif } /** * folio_remove_rmap_pud - remove a PUD mapping from a page range of a folio * @folio: The folio to remove the mapping from * @page: The first page to remove * @vma: The vm area from which the mapping is removed * * The page range of the folio is defined by [page, page + HPAGE_PUD_NR) * * The caller needs to hold the page table lock. */ void folio_remove_rmap_pud(struct folio *folio, struct page *page, struct vm_area_struct *vma) { #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) __folio_remove_rmap(folio, page, HPAGE_PUD_NR, vma, PGTABLE_LEVEL_PUD); #else WARN_ON_ONCE(true); #endif } static inline unsigned int folio_unmap_pte_batch(struct folio *folio, struct page_vma_mapped_walk *pvmw, enum ttu_flags flags, pte_t pte) { unsigned long end_addr, addr = pvmw->address; struct vm_area_struct *vma = pvmw->vma; unsigned int max_nr; if (flags & TTU_HWPOISON) return 1; if (!folio_test_large(folio)) return 1; /* We may only batch within a single VMA and a single page table. */ end_addr = pmd_addr_end(addr, vma->vm_end); max_nr = (end_addr - addr) >> PAGE_SHIFT; /* We only support lazyfree batching for now ... */ if (!folio_test_anon(folio) || folio_test_swapbacked(folio)) return 1; if (pte_unused(pte)) return 1; return folio_pte_batch(folio, pvmw->pte, pte, max_nr); } /* * @arg: enum ttu_flags will be passed to this argument */ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, unsigned long address, void *arg) { struct mm_struct *mm = vma->vm_mm; DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0); bool anon_exclusive, ret = true; pte_t pteval; struct page *subpage; struct mmu_notifier_range range; enum ttu_flags flags = (enum ttu_flags)(long)arg; unsigned long nr_pages = 1, end_addr; unsigned long pfn; unsigned long hsz = 0; int ptes = 0; /* * When racing against e.g. zap_pte_range() on another cpu, * in between its ptep_get_and_clear_full() and folio_remove_rmap_*(), * try_to_unmap() may return before page_mapped() has become false, * if page table locking is skipped: use TTU_SYNC to wait for that. */ if (flags & TTU_SYNC) pvmw.flags = PVMW_SYNC; /* * For THP, we have to assume the worse case ie pmd for invalidation. * For hugetlb, it could be much worse if we need to do pud * invalidation in the case of pmd sharing. * * Note that the folio can not be freed in this function as call of * try_to_unmap() must hold a reference on the folio. */ range.end = vma_address_end(&pvmw); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, address, range.end); if (folio_test_hugetlb(folio)) { /* * If sharing is possible, start and end will be adjusted * accordingly. */ adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); /* We need the huge page size for set_huge_pte_at() */ hsz = huge_page_size(hstate_vma(vma)); } mmu_notifier_invalidate_range_start(&range); while (page_vma_mapped_walk(&pvmw)) { /* * If the folio is in an mlock()d vma, we must not swap it out. */ if (!(flags & TTU_IGNORE_MLOCK) && (vma->vm_flags & VM_LOCKED)) { ptes++; /* * Set 'ret' to indicate the page cannot be unmapped. * * Do not jump to walk_abort immediately as additional * iteration might be required to detect fully mapped * folio an mlock it. */ ret = false; /* Only mlock fully mapped pages */ if (pvmw.pte && ptes != pvmw.nr_pages) continue; /* * All PTEs must be protected by page table lock in * order to mlock the page. * * If page table boundary has been cross, current ptl * only protect part of ptes. */ if (pvmw.flags & PVMW_PGTABLE_CROSSED) goto walk_done; /* Restore the mlock which got missed */ mlock_vma_folio(folio, vma); goto walk_done; } if (!pvmw.pte) { if (folio_test_anon(folio) && !folio_test_swapbacked(folio)) { if (unmap_huge_pmd_locked(vma, pvmw.address, pvmw.pmd, folio)) goto walk_done; /* * unmap_huge_pmd_locked has either already marked * the folio as swap-backed or decided to retain it * due to GUP or speculative references. */ goto walk_abort; } if (flags & TTU_SPLIT_HUGE_PMD) { /* * We temporarily have to drop the PTL and * restart so we can process the PTE-mapped THP. */ split_huge_pmd_locked(vma, pvmw.address, pvmw.pmd, false); flags &= ~TTU_SPLIT_HUGE_PMD; page_vma_mapped_walk_restart(&pvmw); continue; } } /* Unexpected PMD-mapped THP? */ VM_BUG_ON_FOLIO(!pvmw.pte, folio); /* * Handle PFN swap PTEs, such as device-exclusive ones, that * actually map pages. */ pteval = ptep_get(pvmw.pte); if (likely(pte_present(pteval))) { pfn = pte_pfn(pteval); } else { const softleaf_t entry = softleaf_from_pte(pteval); pfn = softleaf_to_pfn(entry); VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio); } subpage = folio_page(folio, pfn - folio_pfn(folio)); address = pvmw.address; anon_exclusive = folio_test_anon(folio) && PageAnonExclusive(subpage); if (folio_test_hugetlb(folio)) { bool anon = folio_test_anon(folio); /* * The try_to_unmap() is only passed a hugetlb page * in the case where the hugetlb page is poisoned. */ VM_BUG_ON_PAGE(!PageHWPoison(subpage), subpage); /* * huge_pmd_unshare may unmap an entire PMD page. * There is no way of knowing exactly which PMDs may * be cached for this mm, so we must flush them all. * start/end were already adjusted above to cover this * range. */ flush_cache_range(vma, range.start, range.end); /* * To call huge_pmd_unshare, i_mmap_rwsem must be * held in write mode. Caller needs to explicitly * do this outside rmap routines. * * We also must hold hugetlb vma_lock in write mode. * Lock order dictates acquiring vma_lock BEFORE * i_mmap_rwsem. We can only try lock here and fail * if unsuccessful. */ if (!anon) { VM_BUG_ON(!(flags & TTU_RMAP_LOCKED)); if (!hugetlb_vma_trylock_write(vma)) goto walk_abort; if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) { hugetlb_vma_unlock_write(vma); flush_tlb_range(vma, range.start, range.end); /* * The ref count of the PMD page was * dropped which is part of the way map * counting is done for shared PMDs. * Return 'true' here. When there is * no other sharing, huge_pmd_unshare * returns false and we will unmap the * actual page and drop map count * to zero. */ goto walk_done; } hugetlb_vma_unlock_write(vma); } pteval = huge_ptep_clear_flush(vma, address, pvmw.pte); if (pte_dirty(pteval)) folio_mark_dirty(folio); } else if (likely(pte_present(pteval))) { nr_pages = folio_unmap_pte_batch(folio, &pvmw, flags, pteval); end_addr = address + nr_pages * PAGE_SIZE; flush_cache_range(vma, address, end_addr); /* Nuke the page table entry. */ pteval = get_and_clear_ptes(mm, address, pvmw.pte, nr_pages); /* * We clear the PTE but do not flush so potentially * a remote CPU could still be writing to the folio. * If the entry was previously clean then the * architecture must guarantee that a clear->dirty * transition on a cached TLB entry is written through * and traps if the PTE is unmapped. */ if (should_defer_flush(mm, flags)) set_tlb_ubc_flush_pending(mm, pteval, address, end_addr); else flush_tlb_range(vma, address, end_addr); if (pte_dirty(pteval)) folio_mark_dirty(folio); } else { pte_clear(mm, address, pvmw.pte); } /* * Now the pte is cleared. If this pte was uffd-wp armed, * we may want to replace a none pte with a marker pte if * it's file-backed, so we don't lose the tracking info. */ pte_install_uffd_wp_if_needed(vma, address, pvmw.pte, pteval); /* Update high watermark before we lower rss */ update_hiwater_rss(mm); if (PageHWPoison(subpage) && (flags & TTU_HWPOISON)) { pteval = swp_entry_to_pte(make_hwpoison_entry(subpage)); if (folio_test_hugetlb(folio)) { hugetlb_count_sub(folio_nr_pages(folio), mm); set_huge_pte_at(mm, address, pvmw.pte, pteval, hsz); } else { dec_mm_counter(mm, mm_counter(folio)); set_pte_at(mm, address, pvmw.pte, pteval); } } else if (likely(pte_present(pteval)) && pte_unused(pteval) && !userfaultfd_armed(vma)) { /* * The guest indicated that the page content is of no * interest anymore. Simply discard the pte, vmscan * will take care of the rest. * A future reference will then fault in a new zero * page. When userfaultfd is active, we must not drop * this page though, as its main user (postcopy * migration) will not expect userfaults on already * copied pages. */ dec_mm_counter(mm, mm_counter(folio)); } else if (folio_test_anon(folio)) { swp_entry_t entry = page_swap_entry(subpage); pte_t swp_pte; /* * Store the swap location in the pte. * See handle_pte_fault() ... */ if (unlikely(folio_test_swapbacked(folio) != folio_test_swapcache(folio))) { WARN_ON_ONCE(1); goto walk_abort; } /* MADV_FREE page check */ if (!folio_test_swapbacked(folio)) { int ref_count, map_count; /* * Synchronize with gup_pte_range(): * - clear PTE; barrier; read refcount * - inc refcount; barrier; read PTE */ smp_mb(); ref_count = folio_ref_count(folio); map_count = folio_mapcount(folio); /* * Order reads for page refcount and dirty flag * (see comments in __remove_mapping()). */ smp_rmb(); if (folio_test_dirty(folio) && !(vma->vm_flags & VM_DROPPABLE)) { /* * redirtied either using the page table or a previously * obtained GUP reference. */ set_ptes(mm, address, pvmw.pte, pteval, nr_pages); folio_set_swapbacked(folio); goto walk_abort; } else if (ref_count != 1 + map_count) { /* * Additional reference. Could be a GUP reference or any * speculative reference. GUP users must mark the folio * dirty if there was a modification. This folio cannot be * reclaimed right now either way, so act just like nothing * happened. * We'll come back here later and detect if the folio was * dirtied when the additional reference is gone. */ set_ptes(mm, address, pvmw.pte, pteval, nr_pages); goto walk_abort; } add_mm_counter(mm, MM_ANONPAGES, -nr_pages); goto discard; } if (swap_duplicate(entry) < 0) { set_pte_at(mm, address, pvmw.pte, pteval); goto walk_abort; } /* * arch_unmap_one() is expected to be a NOP on * architectures where we could have PFN swap PTEs, * so we'll not check/care. */ if (arch_unmap_one(mm, vma, address, pteval) < 0) { swap_free(entry); set_pte_at(mm, address, pvmw.pte, pteval); goto walk_abort; } /* See folio_try_share_anon_rmap(): clear PTE first. */ if (anon_exclusive && folio_try_share_anon_rmap_pte(folio, subpage)) { swap_free(entry); set_pte_at(mm, address, pvmw.pte, pteval); goto walk_abort; } if (list_empty(&mm->mmlist)) { spin_lock(&mmlist_lock); if (list_empty(&mm->mmlist)) list_add(&mm->mmlist, &init_mm.mmlist); spin_unlock(&mmlist_lock); } dec_mm_counter(mm, MM_ANONPAGES); inc_mm_counter(mm, MM_SWAPENTS); swp_pte = swp_entry_to_pte(entry); if (anon_exclusive) swp_pte = pte_swp_mkexclusive(swp_pte); if (likely(pte_present(pteval))) { if (pte_soft_dirty(pteval)) swp_pte = pte_swp_mksoft_dirty(swp_pte); if (pte_uffd_wp(pteval)) swp_pte = pte_swp_mkuffd_wp(swp_pte); } else { if (pte_swp_soft_dirty(pteval)) swp_pte = pte_swp_mksoft_dirty(swp_pte); if (pte_swp_uffd_wp(pteval)) swp_pte = pte_swp_mkuffd_wp(swp_pte); } set_pte_at(mm, address, pvmw.pte, swp_pte); } else { /* * This is a locked file-backed folio, * so it cannot be removed from the page * cache and replaced by a new folio before * mmu_notifier_invalidate_range_end, so no * concurrent thread might update its page table * to point at a new folio while a device is * still using this folio. * * See Documentation/mm/mmu_notifier.rst */ dec_mm_counter(mm, mm_counter_file(folio)); } discard: if (unlikely(folio_test_hugetlb(folio))) { hugetlb_remove_rmap(folio); } else { folio_remove_rmap_ptes(folio, subpage, nr_pages, vma); } if (vma->vm_flags & VM_LOCKED) mlock_drain_local(); folio_put_refs(folio, nr_pages); /* * If we are sure that we batched the entire folio and cleared * all PTEs, we can just optimize and stop right here. */ if (nr_pages == folio_nr_pages(folio)) goto walk_done; continue; walk_abort: ret = false; walk_done: page_vma_mapped_walk_done(&pvmw); break; } mmu_notifier_invalidate_range_end(&range); return ret; } static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg) { return vma_is_temporary_stack(vma); } static int folio_not_mapped(struct folio *folio) { return !folio_mapped(folio); } /** * try_to_unmap - Try to remove all page table mappings to a folio. * @folio: The folio to unmap. * @flags: action and flags * * Tries to remove all the page table entries which are mapping this * folio. It is the caller's responsibility to check if the folio is * still mapped if needed (use TTU_SYNC to prevent accounting races). * * Context: Caller must hold the folio lock. */ void try_to_unmap(struct folio *folio, enum ttu_flags flags) { struct rmap_walk_control rwc = { .rmap_one = try_to_unmap_one, .arg = (void *)flags, .done = folio_not_mapped, .anon_lock = folio_lock_anon_vma_read, }; if (flags & TTU_RMAP_LOCKED) rmap_walk_locked(folio, &rwc); else rmap_walk(folio, &rwc); } /* * @arg: enum ttu_flags will be passed to this argument. * * If TTU_SPLIT_HUGE_PMD is specified any PMD mappings will be split into PTEs * containing migration entries. */ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, unsigned long address, void *arg) { struct mm_struct *mm = vma->vm_mm; DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0); bool anon_exclusive, writable, ret = true; pte_t pteval; struct page *subpage; struct mmu_notifier_range range; enum ttu_flags flags = (enum ttu_flags)(long)arg; unsigned long pfn; unsigned long hsz = 0; /* * When racing against e.g. zap_pte_range() on another cpu, * in between its ptep_get_and_clear_full() and folio_remove_rmap_*(), * try_to_migrate() may return before page_mapped() has become false, * if page table locking is skipped: use TTU_SYNC to wait for that. */ if (flags & TTU_SYNC) pvmw.flags = PVMW_SYNC; /* * For THP, we have to assume the worse case ie pmd for invalidation. * For hugetlb, it could be much worse if we need to do pud * invalidation in the case of pmd sharing. * * Note that the page can not be free in this function as call of * try_to_unmap() must hold a reference on the page. */ range.end = vma_address_end(&pvmw); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, address, range.end); if (folio_test_hugetlb(folio)) { /* * If sharing is possible, start and end will be adjusted * accordingly. */ adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); /* We need the huge page size for set_huge_pte_at() */ hsz = huge_page_size(hstate_vma(vma)); } mmu_notifier_invalidate_range_start(&range); while (page_vma_mapped_walk(&pvmw)) { /* PMD-mapped THP migration entry */ if (!pvmw.pte) { __maybe_unused unsigned long pfn; __maybe_unused pmd_t pmdval; if (flags & TTU_SPLIT_HUGE_PMD) { split_huge_pmd_locked(vma, pvmw.address, pvmw.pmd, true); ret = false; page_vma_mapped_walk_done(&pvmw); break; } #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION pmdval = pmdp_get(pvmw.pmd); if (likely(pmd_present(pmdval))) pfn = pmd_pfn(pmdval); else pfn = softleaf_to_pfn(softleaf_from_pmd(pmdval)); subpage = folio_page(folio, pfn - folio_pfn(folio)); VM_BUG_ON_FOLIO(folio_test_hugetlb(folio) || !folio_test_pmd_mappable(folio), folio); if (set_pmd_migration_entry(&pvmw, subpage)) { ret = false; page_vma_mapped_walk_done(&pvmw); break; } continue; #endif } /* Unexpected PMD-mapped THP? */ VM_BUG_ON_FOLIO(!pvmw.pte, folio); /* * Handle PFN swap PTEs, such as device-exclusive ones, that * actually map pages. */ pteval = ptep_get(pvmw.pte); if (likely(pte_present(pteval))) { pfn = pte_pfn(pteval); } else { const softleaf_t entry = softleaf_from_pte(pteval); pfn = softleaf_to_pfn(entry); VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio); } subpage = folio_page(folio, pfn - folio_pfn(folio)); address = pvmw.address; anon_exclusive = folio_test_anon(folio) && PageAnonExclusive(subpage); if (folio_test_hugetlb(folio)) { bool anon = folio_test_anon(folio); /* * huge_pmd_unshare may unmap an entire PMD page. * There is no way of knowing exactly which PMDs may * be cached for this mm, so we must flush them all. * start/end were already adjusted above to cover this * range. */ flush_cache_range(vma, range.start, range.end); /* * To call huge_pmd_unshare, i_mmap_rwsem must be * held in write mode. Caller needs to explicitly * do this outside rmap routines. * * We also must hold hugetlb vma_lock in write mode. * Lock order dictates acquiring vma_lock BEFORE * i_mmap_rwsem. We can only try lock here and * fail if unsuccessful. */ if (!anon) { VM_BUG_ON(!(flags & TTU_RMAP_LOCKED)); if (!hugetlb_vma_trylock_write(vma)) { page_vma_mapped_walk_done(&pvmw); ret = false; break; } if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) { hugetlb_vma_unlock_write(vma); flush_tlb_range(vma, range.start, range.end); /* * The ref count of the PMD page was * dropped which is part of the way map * counting is done for shared PMDs. * Return 'true' here. When there is * no other sharing, huge_pmd_unshare * returns false and we will unmap the * actual page and drop map count * to zero. */ page_vma_mapped_walk_done(&pvmw); break; } hugetlb_vma_unlock_write(vma); } /* Nuke the hugetlb page table entry */ pteval = huge_ptep_clear_flush(vma, address, pvmw.pte); if (pte_dirty(pteval)) folio_mark_dirty(folio); writable = pte_write(pteval); } else if (likely(pte_present(pteval))) { flush_cache_page(vma, address, pfn); /* Nuke the page table entry. */ if (should_defer_flush(mm, flags)) { /* * We clear the PTE but do not flush so potentially * a remote CPU could still be writing to the folio. * If the entry was previously clean then the * architecture must guarantee that a clear->dirty * transition on a cached TLB entry is written through * and traps if the PTE is unmapped. */ pteval = ptep_get_and_clear(mm, address, pvmw.pte); set_tlb_ubc_flush_pending(mm, pteval, address, address + PAGE_SIZE); } else { pteval = ptep_clear_flush(vma, address, pvmw.pte); } if (pte_dirty(pteval)) folio_mark_dirty(folio); writable = pte_write(pteval); } else { const softleaf_t entry = softleaf_from_pte(pteval); pte_clear(mm, address, pvmw.pte); writable = softleaf_is_device_private_write(entry); } VM_WARN_ON_FOLIO(writable && folio_test_anon(folio) && !anon_exclusive, folio); /* Update high watermark before we lower rss */ update_hiwater_rss(mm); if (PageHWPoison(subpage)) { VM_WARN_ON_FOLIO(folio_is_device_private(folio), folio); pteval = swp_entry_to_pte(make_hwpoison_entry(subpage)); if (folio_test_hugetlb(folio)) { hugetlb_count_sub(folio_nr_pages(folio), mm); set_huge_pte_at(mm, address, pvmw.pte, pteval, hsz); } else { dec_mm_counter(mm, mm_counter(folio)); set_pte_at(mm, address, pvmw.pte, pteval); } } else if (likely(pte_present(pteval)) && pte_unused(pteval) && !userfaultfd_armed(vma)) { /* * The guest indicated that the page content is of no * interest anymore. Simply discard the pte, vmscan * will take care of the rest. * A future reference will then fault in a new zero * page. When userfaultfd is active, we must not drop * this page though, as its main user (postcopy * migration) will not expect userfaults on already * copied pages. */ dec_mm_counter(mm, mm_counter(folio)); } else { swp_entry_t entry; pte_t swp_pte; /* * arch_unmap_one() is expected to be a NOP on * architectures where we could have PFN swap PTEs, * so we'll not check/care. */ if (arch_unmap_one(mm, vma, address, pteval) < 0) { if (folio_test_hugetlb(folio)) set_huge_pte_at(mm, address, pvmw.pte, pteval, hsz); else set_pte_at(mm, address, pvmw.pte, pteval); ret = false; page_vma_mapped_walk_done(&pvmw); break; } /* See folio_try_share_anon_rmap_pte(): clear PTE first. */ if (folio_test_hugetlb(folio)) { if (anon_exclusive && hugetlb_try_share_anon_rmap(folio)) { set_huge_pte_at(mm, address, pvmw.pte, pteval, hsz); ret = false; page_vma_mapped_walk_done(&pvmw); break; } } else if (anon_exclusive && folio_try_share_anon_rmap_pte(folio, subpage)) { set_pte_at(mm, address, pvmw.pte, pteval); ret = false; page_vma_mapped_walk_done(&pvmw); break; } /* * Store the pfn of the page in a special migration * pte. do_swap_page() will wait until the migration * pte is removed and then restart fault handling. */ if (writable) entry = make_writable_migration_entry( page_to_pfn(subpage)); else if (anon_exclusive) entry = make_readable_exclusive_migration_entry( page_to_pfn(subpage)); else entry = make_readable_migration_entry( page_to_pfn(subpage)); if (likely(pte_present(pteval))) { if (pte_young(pteval)) entry = make_migration_entry_young(entry); if (pte_dirty(pteval)) entry = make_migration_entry_dirty(entry); swp_pte = swp_entry_to_pte(entry); if (pte_soft_dirty(pteval)) swp_pte = pte_swp_mksoft_dirty(swp_pte); if (pte_uffd_wp(pteval)) swp_pte = pte_swp_mkuffd_wp(swp_pte); } else { swp_pte = swp_entry_to_pte(entry); if (pte_swp_soft_dirty(pteval)) swp_pte = pte_swp_mksoft_dirty(swp_pte); if (pte_swp_uffd_wp(pteval)) swp_pte = pte_swp_mkuffd_wp(swp_pte); } if (folio_test_hugetlb(folio)) set_huge_pte_at(mm, address, pvmw.pte, swp_pte, hsz); else set_pte_at(mm, address, pvmw.pte, swp_pte); trace_set_migration_pte(address, pte_val(swp_pte), folio_order(folio)); /* * No need to invalidate here it will synchronize on * against the special swap migration pte. */ } if (unlikely(folio_test_hugetlb(folio))) hugetlb_remove_rmap(folio); else folio_remove_rmap_pte(folio, subpage, vma); if (vma->vm_flags & VM_LOCKED) mlock_drain_local(); folio_put(folio); } mmu_notifier_invalidate_range_end(&range); return ret; } /** * try_to_migrate - try to replace all page table mappings with swap entries * @folio: the folio to replace page table entries for * @flags: action and flags * * Tries to remove all the page table entries which are mapping this folio and * replace them with special swap entries. Caller must hold the folio lock. */ void try_to_migrate(struct folio *folio, enum ttu_flags flags) { struct rmap_walk_control rwc = { .rmap_one = try_to_migrate_one, .arg = (void *)flags, .done = folio_not_mapped, .anon_lock = folio_lock_anon_vma_read, }; /* * Migration always ignores mlock and only supports TTU_RMAP_LOCKED and * TTU_SPLIT_HUGE_PMD, TTU_SYNC, and TTU_BATCH_FLUSH flags. */ if (WARN_ON_ONCE(flags & ~(TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD | TTU_SYNC | TTU_BATCH_FLUSH))) return; if (folio_is_zone_device(folio) && (!folio_is_device_private(folio) && !folio_is_device_coherent(folio))) return; /* * During exec, a temporary VMA is setup and later moved. * The VMA is moved under the anon_vma lock but not the * page tables leading to a race where migration cannot * find the migration ptes. Rather than increasing the * locking requirements of exec(), migration skips * temporary VMAs until after exec() completes. */ if (!folio_test_ksm(folio) && folio_test_anon(folio)) rwc.invalid_vma = invalid_migration_vma; if (flags & TTU_RMAP_LOCKED) rmap_walk_locked(folio, &rwc); else rmap_walk(folio, &rwc); } #ifdef CONFIG_DEVICE_PRIVATE /** * make_device_exclusive() - Mark a page for exclusive use by a device * @mm: mm_struct of associated target process * @addr: the virtual address to mark for exclusive device access * @owner: passed to MMU_NOTIFY_EXCLUSIVE range notifier to allow filtering * @foliop: folio pointer will be stored here on success. * * This function looks up the page mapped at the given address, grabs a * folio reference, locks the folio and replaces the PTE with special * device-exclusive PFN swap entry, preventing access through the process * page tables. The function will return with the folio locked and referenced. * * On fault, the device-exclusive entries are replaced with the original PTE * under folio lock, after calling MMU notifiers. * * Only anonymous non-hugetlb folios are supported and the VMA must have * write permissions such that we can fault in the anonymous page writable * in order to mark it exclusive. The caller must hold the mmap_lock in read * mode. * * A driver using this to program access from a device must use a mmu notifier * critical section to hold a device specific lock during programming. Once * programming is complete it should drop the folio lock and reference after * which point CPU access to the page will revoke the exclusive access. * * Notes: * #. This function always operates on individual PTEs mapping individual * pages. PMD-sized THPs are first remapped to be mapped by PTEs before * the conversion happens on a single PTE corresponding to @addr. * #. While concurrent access through the process page tables is prevented, * concurrent access through other page references (e.g., earlier GUP * invocation) is not handled and not supported. * #. device-exclusive entries are considered "clean" and "old" by core-mm. * Device drivers must update the folio state when informed by MMU * notifiers. * * Returns: pointer to mapped page on success, otherwise a negative error. */ struct page *make_device_exclusive(struct mm_struct *mm, unsigned long addr, void *owner, struct folio **foliop) { struct mmu_notifier_range range; struct folio *folio, *fw_folio; struct vm_area_struct *vma; struct folio_walk fw; struct page *page; swp_entry_t entry; pte_t swp_pte; int ret; mmap_assert_locked(mm); addr = PAGE_ALIGN_DOWN(addr); /* * Fault in the page writable and try to lock it; note that if the * address would already be marked for exclusive use by a device, * the GUP call would undo that first by triggering a fault. * * If any other device would already map this page exclusively, the * fault will trigger a conversion to an ordinary * (non-device-exclusive) PTE and issue a MMU_NOTIFY_EXCLUSIVE. */ retry: page = get_user_page_vma_remote(mm, addr, FOLL_GET | FOLL_WRITE | FOLL_SPLIT_PMD, &vma); if (IS_ERR(page)) return page; folio = page_folio(page); if (!folio_test_anon(folio) || folio_test_hugetlb(folio)) { folio_put(folio); return ERR_PTR(-EOPNOTSUPP); } ret = folio_lock_killable(folio); if (ret) { folio_put(folio); return ERR_PTR(ret); } /* * Inform secondary MMUs that we are going to convert this PTE to * device-exclusive, such that they unmap it now. Note that the * caller must filter this event out to prevent livelocks. */ mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0, mm, addr, addr + PAGE_SIZE, owner); mmu_notifier_invalidate_range_start(&range); /* * Let's do a second walk and make sure we still find the same page * mapped writable. Note that any page of an anonymous folio can * only be mapped writable using exactly one PTE ("exclusive"), so * there cannot be other mappings. */ fw_folio = folio_walk_start(&fw, vma, addr, 0); if (fw_folio != folio || fw.page != page || fw.level != FW_LEVEL_PTE || !pte_write(fw.pte)) { if (fw_folio) folio_walk_end(&fw, vma); mmu_notifier_invalidate_range_end(&range); folio_unlock(folio); folio_put(folio); goto retry; } /* Nuke the page table entry so we get the uptodate dirty bit. */ flush_cache_page(vma, addr, page_to_pfn(page)); fw.pte = ptep_clear_flush(vma, addr, fw.ptep); /* Set the dirty flag on the folio now the PTE is gone. */ if (pte_dirty(fw.pte)) folio_mark_dirty(folio); /* * Store the pfn of the page in a special device-exclusive PFN swap PTE. * do_swap_page() will trigger the conversion back while holding the * folio lock. */ entry = make_device_exclusive_entry(page_to_pfn(page)); swp_pte = swp_entry_to_pte(entry); if (pte_soft_dirty(fw.pte)) swp_pte = pte_swp_mksoft_dirty(swp_pte); /* The pte is writable, uffd-wp does not apply. */ set_pte_at(mm, addr, fw.ptep, swp_pte); folio_walk_end(&fw, vma); mmu_notifier_invalidate_range_end(&range); *foliop = folio; return page; } EXPORT_SYMBOL_GPL(make_device_exclusive); #endif void __put_anon_vma(struct anon_vma *anon_vma) { struct anon_vma *root = anon_vma->root; anon_vma_free(anon_vma); if (root != anon_vma && atomic_dec_and_test(&root->refcount)) anon_vma_free(root); } static struct anon_vma *rmap_walk_anon_lock(const struct folio *folio, struct rmap_walk_control *rwc) { struct anon_vma *anon_vma; if (rwc->anon_lock) return rwc->anon_lock(folio, rwc); /* * Note: remove_migration_ptes() cannot use folio_lock_anon_vma_read() * because that depends on page_mapped(); but not all its usages * are holding mmap_lock. Users without mmap_lock are required to * take a reference count to prevent the anon_vma disappearing */ anon_vma = folio_anon_vma(folio); if (!anon_vma) return NULL; if (anon_vma_trylock_read(anon_vma)) goto out; if (rwc->try_lock) { anon_vma = NULL; rwc->contended = true; goto out; } anon_vma_lock_read(anon_vma); out: return anon_vma; } /* * rmap_walk_anon - do something to anonymous page using the object-based * rmap method * @folio: the folio to be handled * @rwc: control variable according to each walk type * @locked: caller holds relevant rmap lock * * Find all the mappings of a folio using the mapping pointer and the vma * chains contained in the anon_vma struct it points to. */ static void rmap_walk_anon(struct folio *folio, struct rmap_walk_control *rwc, bool locked) { struct anon_vma *anon_vma; pgoff_t pgoff_start, pgoff_end; struct anon_vma_chain *avc; /* * The folio lock ensures that folio->mapping can't be changed under us * to an anon_vma with different root. */ VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio); if (locked) { anon_vma = folio_anon_vma(folio); /* anon_vma disappear under us? */ VM_BUG_ON_FOLIO(!anon_vma, folio); } else { anon_vma = rmap_walk_anon_lock(folio, rwc); } if (!anon_vma) return; pgoff_start = folio_pgoff(folio); pgoff_end = pgoff_start + folio_nr_pages(folio) - 1; anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff_start, pgoff_end) { struct vm_area_struct *vma = avc->vma; unsigned long address = vma_address(vma, pgoff_start, folio_nr_pages(folio)); VM_BUG_ON_VMA(address == -EFAULT, vma); cond_resched(); if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) continue; if (!rwc->rmap_one(folio, vma, address, rwc->arg)) break; if (rwc->done && rwc->done(folio)) break; } if (!locked) anon_vma_unlock_read(anon_vma); } /** * __rmap_walk_file() - Traverse the reverse mapping for a file-backed mapping * of a page mapped within a specified page cache object at a specified offset. * * @folio: Either the folio whose mappings to traverse, or if NULL, * the callbacks specified in @rwc will be configured such * as to be able to look up mappings correctly. * @mapping: The page cache object whose mapping VMAs we intend to * traverse. If @folio is non-NULL, this should be equal to * folio_mapping(folio). * @pgoff_start: The offset within @mapping of the page which we are * looking up. If @folio is non-NULL, this should be equal * to folio_pgoff(folio). * @nr_pages: The number of pages mapped by the mapping. If @folio is * non-NULL, this should be equal to folio_nr_pages(folio). * @rwc: The reverse mapping walk control object describing how * the traversal should proceed. * @locked: Is the @mapping already locked? If not, we acquire the * lock. */ static void __rmap_walk_file(struct folio *folio, struct address_space *mapping, pgoff_t pgoff_start, unsigned long nr_pages, struct rmap_walk_control *rwc, bool locked) { pgoff_t pgoff_end = pgoff_start + nr_pages - 1; struct vm_area_struct *vma; VM_WARN_ON_FOLIO(folio && mapping != folio_mapping(folio), folio); VM_WARN_ON_FOLIO(folio && pgoff_start != folio_pgoff(folio), folio); VM_WARN_ON_FOLIO(folio && nr_pages != folio_nr_pages(folio), folio); if (!locked) { if (i_mmap_trylock_read(mapping)) goto lookup; if (rwc->try_lock) { rwc->contended = true; return; } i_mmap_lock_read(mapping); } lookup: vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff_start, pgoff_end) { unsigned long address = vma_address(vma, pgoff_start, nr_pages); VM_BUG_ON_VMA(address == -EFAULT, vma); cond_resched(); if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) continue; if (!rwc->rmap_one(folio, vma, address, rwc->arg)) goto done; if (rwc->done && rwc->done(folio)) goto done; } done: if (!locked) i_mmap_unlock_read(mapping); } /* * rmap_walk_file - do something to file page using the object-based rmap method * @folio: the folio to be handled * @rwc: control variable according to each walk type * @locked: caller holds relevant rmap lock * * Find all the mappings of a folio using the mapping pointer and the vma chains * contained in the address_space struct it points to. */ static void rmap_walk_file(struct folio *folio, struct rmap_walk_control *rwc, bool locked) { /* * The folio lock not only makes sure that folio->mapping cannot * suddenly be NULLified by truncation, it makes sure that the structure * at mapping cannot be freed and reused yet, so we can safely take * mapping->i_mmap_rwsem. */ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); if (!folio->mapping) return; __rmap_walk_file(folio, folio->mapping, folio->index, folio_nr_pages(folio), rwc, locked); } void rmap_walk(struct folio *folio, struct rmap_walk_control *rwc) { if (unlikely(folio_test_ksm(folio))) rmap_walk_ksm(folio, rwc); else if (folio_test_anon(folio)) rmap_walk_anon(folio, rwc, false); else rmap_walk_file(folio, rwc, false); } /* Like rmap_walk, but caller holds relevant rmap lock */ void rmap_walk_locked(struct folio *folio, struct rmap_walk_control *rwc) { /* no ksm support for now */ VM_BUG_ON_FOLIO(folio_test_ksm(folio), folio); if (folio_test_anon(folio)) rmap_walk_anon(folio, rwc, true); else rmap_walk_file(folio, rwc, true); } #ifdef CONFIG_HUGETLB_PAGE /* * The following two functions are for anonymous (private mapped) hugepages. * Unlike common anonymous pages, anonymous hugepages have no accounting code * and no lru code, because we handle hugepages differently from common pages. */ void hugetlb_add_anon_rmap(struct folio *folio, struct vm_area_struct *vma, unsigned long address, rmap_t flags) { VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio); VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio); atomic_inc(&folio->_entire_mapcount); atomic_inc(&folio->_large_mapcount); if (flags & RMAP_EXCLUSIVE) SetPageAnonExclusive(&folio->page); VM_WARN_ON_FOLIO(folio_entire_mapcount(folio) > 1 && PageAnonExclusive(&folio->page), folio); } void hugetlb_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma, unsigned long address) { VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio); BUG_ON(address < vma->vm_start || address >= vma->vm_end); /* increment count (starts at -1) */ atomic_set(&folio->_entire_mapcount, 0); atomic_set(&folio->_large_mapcount, 0); folio_clear_hugetlb_restore_reserve(folio); __folio_set_anon(folio, vma, address, true); SetPageAnonExclusive(&folio->page); } #endif /* CONFIG_HUGETLB_PAGE */
3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 // SPDX-License-Identifier: GPL-2.0-only /* * vivid-vbi-cap.c - vbi capture support functions. * * Copyright 2014 Cisco Systems, Inc. and/or its affiliates. All rights reserved. */ #include <linux/errno.h> #include <linux/kernel.h> #include <linux/videodev2.h> #include <media/v4l2-common.h> #include "vivid-core.h" #include "vivid-kthread-cap.h" #include "vivid-vbi-cap.h" #include "vivid-vbi-gen.h" #include "vivid-vid-common.h" static void vivid_sliced_vbi_cap_fill(struct vivid_dev *dev, unsigned seqnr) { struct vivid_vbi_gen_data *vbi_gen = &dev->vbi_gen; bool is_60hz = dev->std_cap[dev->input] & V4L2_STD_525_60; vivid_vbi_gen_sliced(vbi_gen, is_60hz, seqnr); if (!is_60hz) { if (vivid_vid_can_loop(dev)) { if (dev->vbi_out_have_wss) { vbi_gen->data[12].data[0] = dev->vbi_out_wss[0]; vbi_gen->data[12].data[1] = dev->vbi_out_wss[1]; } else { vbi_gen->data[12].id = 0; } } else { switch (tpg_g_video_aspect(&dev->tpg)) { case TPG_VIDEO_ASPECT_14X9_CENTRE: vbi_gen->data[12].data[0] = 0x01; break; case TPG_VIDEO_ASPECT_16X9_CENTRE: vbi_gen->data[12].data[0] = 0x0b; break; case TPG_VIDEO_ASPECT_16X9_ANAMORPHIC: vbi_gen->data[12].data[0] = 0x07; break; case TPG_VIDEO_ASPECT_4X3: default: vbi_gen->data[12].data[0] = 0x08; break; } } } else if (vivid_vid_can_loop(dev) && is_60hz) { if (dev->vbi_out_have_cc[0]) { vbi_gen->data[0].data[0] = dev->vbi_out_cc[0][0]; vbi_gen->data[0].data[1] = dev->vbi_out_cc[0][1]; } else { vbi_gen->data[0].id = 0; } if (dev->vbi_out_have_cc[1]) { vbi_gen->data[1].data[0] = dev->vbi_out_cc[1][0]; vbi_gen->data[1].data[1] = dev->vbi_out_cc[1][1]; } else { vbi_gen->data[1].id = 0; } } } static void vivid_g_fmt_vbi_cap(struct vivid_dev *dev, struct v4l2_vbi_format *vbi) { bool is_60hz = dev->std_cap[dev->input] & V4L2_STD_525_60; vbi->sampling_rate = 27000000; vbi->offset = 24; vbi->samples_per_line = 1440; vbi->sample_format = V4L2_PIX_FMT_GREY; vbi->start[0] = is_60hz ? V4L2_VBI_ITU_525_F1_START + 9 : V4L2_VBI_ITU_625_F1_START + 5; vbi->start[1] = is_60hz ? V4L2_VBI_ITU_525_F2_START + 9 : V4L2_VBI_ITU_625_F2_START + 5; vbi->count[0] = vbi->count[1] = is_60hz ? 12 : 18; vbi->flags = dev->vbi_cap_interlaced ? V4L2_VBI_INTERLACED : 0; vbi->reserved[0] = 0; vbi->reserved[1] = 0; } void vivid_raw_vbi_cap_process(struct vivid_dev *dev, struct vivid_buffer *buf) { struct v4l2_vbi_format vbi; u8 *vbuf = vb2_plane_vaddr(&buf->vb.vb2_buf, 0); vivid_g_fmt_vbi_cap(dev, &vbi); buf->vb.sequence = dev->vbi_cap_seq_count; if (dev->field_cap == V4L2_FIELD_ALTERNATE) buf->vb.sequence /= 2; vivid_sliced_vbi_cap_fill(dev, buf->vb.sequence); memset(vbuf, 0x10, vb2_plane_size(&buf->vb.vb2_buf, 0)); if (!VIVID_INVALID_SIGNAL(dev->std_signal_mode[dev->input])) vivid_vbi_gen_raw(&dev->vbi_gen, &vbi, vbuf); } void vivid_sliced_vbi_cap_process(struct vivid_dev *dev, struct vivid_buffer *buf) { struct v4l2_sliced_vbi_data *vbuf = vb2_plane_vaddr(&buf->vb.vb2_buf, 0); buf->vb.sequence = dev->vbi_cap_seq_count; if (dev->field_cap == V4L2_FIELD_ALTERNATE) buf->vb.sequence /= 2; vivid_sliced_vbi_cap_fill(dev, buf->vb.sequence); memset(vbuf, 0, vb2_plane_size(&buf->vb.vb2_buf, 0)); if (!VIVID_INVALID_SIGNAL(dev->std_signal_mode[dev->input])) { unsigned i; for (i = 0; i < 25; i++) vbuf[i] = dev->vbi_gen.data[i]; } } static int vbi_cap_queue_setup(struct vb2_queue *vq, unsigned *nbuffers, unsigned *nplanes, unsigned sizes[], struct device *alloc_devs[]) { struct vivid_dev *dev = vb2_get_drv_priv(vq); bool is_60hz = dev->std_cap[dev->input] & V4L2_STD_525_60; unsigned size = vq->type == V4L2_BUF_TYPE_SLICED_VBI_CAPTURE ? 36 * sizeof(struct v4l2_sliced_vbi_data) : 1440 * 2 * (is_60hz ? 12 : 18); if (!vivid_is_sdtv_cap(dev)) return -EINVAL; if (*nplanes) return sizes[0] < size ? -EINVAL : 0; sizes[0] = size; *nplanes = 1; return 0; } static int vbi_cap_buf_prepare(struct vb2_buffer *vb) { struct vivid_dev *dev = vb2_get_drv_priv(vb->vb2_queue); bool is_60hz = dev->std_cap[dev->input] & V4L2_STD_525_60; unsigned size = vb->vb2_queue->type == V4L2_BUF_TYPE_SLICED_VBI_CAPTURE ? 36 * sizeof(struct v4l2_sliced_vbi_data) : 1440 * 2 * (is_60hz ? 12 : 18); dprintk(dev, 1, "%s\n", __func__); if (dev->buf_prepare_error) { /* * Error injection: test what happens if buf_prepare() returns * an error. */ dev->buf_prepare_error = false; return -EINVAL; } if (vb2_plane_size(vb, 0) < size) { dprintk(dev, 1, "%s data will not fit into plane (%lu < %u)\n", __func__, vb2_plane_size(vb, 0), size); return -EINVAL; } vb2_set_plane_payload(vb, 0, size); return 0; } static void vbi_cap_buf_queue(struct vb2_buffer *vb) { struct vb2_v4l2_buffer *vbuf = to_vb2_v4l2_buffer(vb); struct vivid_dev *dev = vb2_get_drv_priv(vb->vb2_queue); struct vivid_buffer *buf = container_of(vbuf, struct vivid_buffer, vb); dprintk(dev, 1, "%s\n", __func__); spin_lock(&dev->slock); list_add_tail(&buf->list, &dev->vbi_cap_active); spin_unlock(&dev->slock); } static int vbi_cap_start_streaming(struct vb2_queue *vq, unsigned count) { struct vivid_dev *dev = vb2_get_drv_priv(vq); int err; dprintk(dev, 1, "%s\n", __func__); dev->vbi_cap_seq_count = 0; if (dev->start_streaming_error) { dev->start_streaming_error = false; err = -EINVAL; } else { err = vivid_start_generating_vid_cap(dev, &dev->vbi_cap_streaming); } if (err) { struct vivid_buffer *buf, *tmp; list_for_each_entry_safe(buf, tmp, &dev->vbi_cap_active, list) { list_del(&buf->list); vb2_buffer_done(&buf->vb.vb2_buf, VB2_BUF_STATE_QUEUED); } } return err; } /* abort streaming and wait for last buffer */ static void vbi_cap_stop_streaming(struct vb2_queue *vq) { struct vivid_dev *dev = vb2_get_drv_priv(vq); dprintk(dev, 1, "%s\n", __func__); vivid_stop_generating_vid_cap(dev, &dev->vbi_cap_streaming); } static void vbi_cap_buf_request_complete(struct vb2_buffer *vb) { struct vivid_dev *dev = vb2_get_drv_priv(vb->vb2_queue); v4l2_ctrl_request_complete(vb->req_obj.req, &dev->ctrl_hdl_vbi_cap); } const struct vb2_ops vivid_vbi_cap_qops = { .queue_setup = vbi_cap_queue_setup, .buf_prepare = vbi_cap_buf_prepare, .buf_queue = vbi_cap_buf_queue, .start_streaming = vbi_cap_start_streaming, .stop_streaming = vbi_cap_stop_streaming, .buf_request_complete = vbi_cap_buf_request_complete, }; int vidioc_g_fmt_vbi_cap(struct file *file, void *priv, struct v4l2_format *f) { struct vivid_dev *dev = video_drvdata(file); struct v4l2_vbi_format *vbi = &f->fmt.vbi; if (!vivid_is_sdtv_cap(dev) || !dev->has_raw_vbi_cap) return -EINVAL; vivid_g_fmt_vbi_cap(dev, vbi); return 0; } int vidioc_s_fmt_vbi_cap(struct file *file, void *priv, struct v4l2_format *f) { struct vivid_dev *dev = video_drvdata(file); int ret = vidioc_g_fmt_vbi_cap(file, priv, f); if (ret) return ret; if (f->type != V4L2_BUF_TYPE_VBI_CAPTURE && vb2_is_busy(&dev->vb_vbi_cap_q)) return -EBUSY; return 0; } void vivid_fill_service_lines(struct v4l2_sliced_vbi_format *vbi, u32 service_set) { vbi->io_size = sizeof(struct v4l2_sliced_vbi_data) * 36; vbi->service_set = service_set; memset(vbi->service_lines, 0, sizeof(vbi->service_lines)); memset(vbi->reserved, 0, sizeof(vbi->reserved)); if (vbi->service_set == 0) return; if (vbi->service_set & V4L2_SLICED_CAPTION_525) { vbi->service_lines[0][21] = V4L2_SLICED_CAPTION_525; vbi->service_lines[1][21] = V4L2_SLICED_CAPTION_525; } if (vbi->service_set & V4L2_SLICED_WSS_625) { unsigned i; for (i = 7; i <= 18; i++) vbi->service_lines[0][i] = vbi->service_lines[1][i] = V4L2_SLICED_TELETEXT_B; vbi->service_lines[0][23] = V4L2_SLICED_WSS_625; } } int vidioc_g_fmt_sliced_vbi_cap(struct file *file, void *priv, struct v4l2_format *fmt) { struct vivid_dev *dev = video_drvdata(file); struct v4l2_sliced_vbi_format *vbi = &fmt->fmt.sliced; if (!vivid_is_sdtv_cap(dev) || !dev->has_sliced_vbi_cap) return -EINVAL; vivid_fill_service_lines(vbi, dev->service_set_cap); return 0; } int vidioc_try_fmt_sliced_vbi_cap(struct file *file, void *priv, struct v4l2_format *fmt) { struct vivid_dev *dev = video_drvdata(file); struct v4l2_sliced_vbi_format *vbi = &fmt->fmt.sliced; bool is_60hz = dev->std_cap[dev->input] & V4L2_STD_525_60; u32 service_set = vbi->service_set; if (!vivid_is_sdtv_cap(dev) || !dev->has_sliced_vbi_cap) return -EINVAL; service_set &= is_60hz ? V4L2_SLICED_CAPTION_525 : V4L2_SLICED_WSS_625 | V4L2_SLICED_TELETEXT_B; vivid_fill_service_lines(vbi, service_set); return 0; } int vidioc_s_fmt_sliced_vbi_cap(struct file *file, void *priv, struct v4l2_format *fmt) { struct vivid_dev *dev = video_drvdata(file); struct v4l2_sliced_vbi_format *vbi = &fmt->fmt.sliced; int ret = vidioc_try_fmt_sliced_vbi_cap(file, priv, fmt); if (ret) return ret; if (fmt->type != V4L2_BUF_TYPE_SLICED_VBI_CAPTURE && vb2_is_busy(&dev->vb_vbi_cap_q)) return -EBUSY; dev->service_set_cap = vbi->service_set; return 0; } int vidioc_g_sliced_vbi_cap(struct file *file, void *priv, struct v4l2_sliced_vbi_cap *cap) { struct vivid_dev *dev = video_drvdata(file); struct video_device *vdev = video_devdata(file); bool is_60hz; if (vdev->vfl_dir == VFL_DIR_RX) { is_60hz = dev->std_cap[dev->input] & V4L2_STD_525_60; if (!vivid_is_sdtv_cap(dev) || !dev->has_sliced_vbi_cap || cap->type != V4L2_BUF_TYPE_SLICED_VBI_CAPTURE) return -EINVAL; } else { is_60hz = dev->std_out & V4L2_STD_525_60; if (!vivid_is_svid_out(dev) || !dev->has_sliced_vbi_out || cap->type != V4L2_BUF_TYPE_SLICED_VBI_OUTPUT) return -EINVAL; } cap->service_set = is_60hz ? V4L2_SLICED_CAPTION_525 : V4L2_SLICED_WSS_625 | V4L2_SLICED_TELETEXT_B; if (is_60hz) { cap->service_lines[0][21] = V4L2_SLICED_CAPTION_525; cap->service_lines[1][21] = V4L2_SLICED_CAPTION_525; } else { unsigned i; for (i = 7; i <= 18; i++) cap->service_lines[0][i] = cap->service_lines[1][i] = V4L2_SLICED_TELETEXT_B; cap->service_lines[0][23] = V4L2_SLICED_WSS_625; } return 0; }
55 3 55 55 55 1 55 55 55 26 26 8 6 2 1 5 3 1 1 3 5 23 5 4 5 4 2 4 1 3 2 23 23 23 7 12 2 2 1 1 1 1 1 1 10 4 4 4 60 48 60 12 60 40 32 3 1 22 37 37 37 35 7 6 5 5 5 5 5 5 4 2 2 2 2 3 15 15 13 13 8 1 1 3 2 2 2 3 7 2 1 1 1 4 4 4 3 2 2 9 3 4 2 1 2 2 3 3 20 60 19 2 2 2 2 17 17 1 12 1 16 12 2 14 17 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 // SPDX-License-Identifier: GPL-2.0 /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * The options processing module for ip.c * * Authors: A.N.Kuznetsov * */ #define pr_fmt(fmt) "IPv4: " fmt #include <linux/capability.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/uaccess.h> #include <linux/unaligned.h> #include <linux/skbuff.h> #include <linux/ip.h> #include <linux/icmp.h> #include <linux/netdevice.h> #include <linux/rtnetlink.h> #include <net/sock.h> #include <net/ip.h> #include <net/icmp.h> #include <net/route.h> #include <net/cipso_ipv4.h> #include <net/ip_fib.h> /* * Write options to IP header, record destination address to * source route option, address of outgoing interface * (we should already know it, so that this function is allowed be * called only after routing decision) and timestamp, * if we originate this datagram. * * daddr is real destination address, next hop is recorded in IP header. * saddr is address of outgoing interface. */ void ip_options_build(struct sk_buff *skb, struct ip_options *opt, __be32 daddr, struct rtable *rt) { unsigned char *iph = skb_network_header(skb); memcpy(&(IPCB(skb)->opt), opt, sizeof(struct ip_options)); memcpy(iph + sizeof(struct iphdr), opt->__data, opt->optlen); opt = &(IPCB(skb)->opt); if (opt->srr) memcpy(iph + opt->srr + iph[opt->srr + 1] - 4, &daddr, 4); if (opt->rr_needaddr) ip_rt_get_source(iph + opt->rr + iph[opt->rr + 2] - 5, skb, rt); if (opt->ts_needaddr) ip_rt_get_source(iph + opt->ts + iph[opt->ts + 2] - 9, skb, rt); if (opt->ts_needtime) { __be32 midtime; midtime = inet_current_timestamp(); memcpy(iph + opt->ts + iph[opt->ts + 2] - 5, &midtime, 4); } } /* * Provided (sopt, skb) points to received options, * build in dopt compiled option set appropriate for answering. * i.e. invert SRR option, copy anothers, * and grab room in RR/TS options. * * NOTE: dopt cannot point to skb. */ int __ip_options_echo(struct net *net, struct ip_options *dopt, struct sk_buff *skb, const struct ip_options *sopt) { unsigned char *sptr, *dptr; int soffset, doffset; int optlen; memset(dopt, 0, sizeof(struct ip_options)); if (sopt->optlen == 0) return 0; sptr = skb_network_header(skb); dptr = dopt->__data; if (sopt->rr) { optlen = sptr[sopt->rr+1]; soffset = sptr[sopt->rr+2]; dopt->rr = dopt->optlen + sizeof(struct iphdr); memcpy(dptr, sptr+sopt->rr, optlen); if (sopt->rr_needaddr && soffset <= optlen) { if (soffset + 3 > optlen) return -EINVAL; dptr[2] = soffset + 4; dopt->rr_needaddr = 1; } dptr += optlen; dopt->optlen += optlen; } if (sopt->ts) { optlen = sptr[sopt->ts+1]; soffset = sptr[sopt->ts+2]; dopt->ts = dopt->optlen + sizeof(struct iphdr); memcpy(dptr, sptr+sopt->ts, optlen); if (soffset <= optlen) { if (sopt->ts_needaddr) { if (soffset + 3 > optlen) return -EINVAL; dopt->ts_needaddr = 1; soffset += 4; } if (sopt->ts_needtime) { if (soffset + 3 > optlen) return -EINVAL; if ((dptr[3]&0xF) != IPOPT_TS_PRESPEC) { dopt->ts_needtime = 1; soffset += 4; } else { dopt->ts_needtime = 0; if (soffset + 7 <= optlen) { __be32 addr; memcpy(&addr, dptr+soffset-1, 4); if (inet_addr_type(net, addr) != RTN_UNICAST) { dopt->ts_needtime = 1; soffset += 8; } } } } dptr[2] = soffset; } dptr += optlen; dopt->optlen += optlen; } if (sopt->srr) { unsigned char *start = sptr+sopt->srr; __be32 faddr; optlen = start[1]; soffset = start[2]; doffset = 0; if (soffset > optlen) soffset = optlen + 1; soffset -= 4; if (soffset > 3) { memcpy(&faddr, &start[soffset-1], 4); for (soffset -= 4, doffset = 4; soffset > 3; soffset -= 4, doffset += 4) memcpy(&dptr[doffset-1], &start[soffset-1], 4); /* * RFC1812 requires to fix illegal source routes. */ if (memcmp(&ip_hdr(skb)->saddr, &start[soffset + 3], 4) == 0) doffset -= 4; } if (doffset > 3) { dopt->faddr = faddr; dptr[0] = start[0]; dptr[1] = doffset+3; dptr[2] = 4; dptr += doffset+3; dopt->srr = dopt->optlen + sizeof(struct iphdr); dopt->optlen += doffset+3; dopt->is_strictroute = sopt->is_strictroute; } } if (sopt->cipso) { optlen = sptr[sopt->cipso+1]; dopt->cipso = dopt->optlen+sizeof(struct iphdr); memcpy(dptr, sptr+sopt->cipso, optlen); dptr += optlen; dopt->optlen += optlen; } while (dopt->optlen & 3) { *dptr++ = IPOPT_END; dopt->optlen++; } return 0; } /* * Options "fragmenting", just fill options not * allowed in fragments with NOOPs. * Simple and stupid 8), but the most efficient way. */ void ip_options_fragment(struct sk_buff *skb) { unsigned char *optptr = skb_network_header(skb) + sizeof(struct iphdr); struct ip_options *opt = &(IPCB(skb)->opt); int l = opt->optlen; int optlen; while (l > 0) { switch (*optptr) { case IPOPT_END: return; case IPOPT_NOOP: l--; optptr++; continue; } optlen = optptr[1]; if (optlen < 2 || optlen > l) return; if (!IPOPT_COPIED(*optptr)) memset(optptr, IPOPT_NOOP, optlen); l -= optlen; optptr += optlen; } opt->ts = 0; opt->rr = 0; opt->rr_needaddr = 0; opt->ts_needaddr = 0; opt->ts_needtime = 0; } /* helper used by ip_options_compile() to call fib_compute_spec_dst() * at most one time. */ static void spec_dst_fill(__be32 *spec_dst, struct sk_buff *skb) { if (*spec_dst == htonl(INADDR_ANY)) *spec_dst = fib_compute_spec_dst(skb); } /* * Verify options and fill pointers in struct options. * Caller should clear *opt, and set opt->data. * If opt == NULL, then skb->data should point to IP header. */ int __ip_options_compile(struct net *net, struct ip_options *opt, struct sk_buff *skb, __be32 *info) { __be32 spec_dst = htonl(INADDR_ANY); unsigned char *pp_ptr = NULL; struct rtable *rt = NULL; unsigned char *optptr; unsigned char *iph; int optlen, l; if (skb) { rt = skb_rtable(skb); optptr = (unsigned char *)&(ip_hdr(skb)[1]); } else optptr = opt->__data; iph = optptr - sizeof(struct iphdr); for (l = opt->optlen; l > 0; ) { switch (*optptr) { case IPOPT_END: for (optptr++, l--; l > 0; optptr++, l--) { if (*optptr != IPOPT_END) { *optptr = IPOPT_END; opt->is_changed = 1; } } goto eol; case IPOPT_NOOP: l--; optptr++; continue; } if (unlikely(l < 2)) { pp_ptr = optptr; goto error; } optlen = optptr[1]; if (optlen < 2 || optlen > l) { pp_ptr = optptr; goto error; } switch (*optptr) { case IPOPT_SSRR: case IPOPT_LSRR: if (optlen < 3) { pp_ptr = optptr + 1; goto error; } if (optptr[2] < 4) { pp_ptr = optptr + 2; goto error; } /* NB: cf RFC-1812 5.2.4.1 */ if (opt->srr) { pp_ptr = optptr; goto error; } if (!skb) { if (optptr[2] != 4 || optlen < 7 || ((optlen-3) & 3)) { pp_ptr = optptr + 1; goto error; } memcpy(&opt->faddr, &optptr[3], 4); if (optlen > 7) memmove(&optptr[3], &optptr[7], optlen-7); } opt->is_strictroute = (optptr[0] == IPOPT_SSRR); opt->srr = optptr - iph; break; case IPOPT_RR: if (opt->rr) { pp_ptr = optptr; goto error; } if (optlen < 3) { pp_ptr = optptr + 1; goto error; } if (optptr[2] < 4) { pp_ptr = optptr + 2; goto error; } if (optptr[2] <= optlen) { if (optptr[2]+3 > optlen) { pp_ptr = optptr + 2; goto error; } if (rt) { spec_dst_fill(&spec_dst, skb); memcpy(&optptr[optptr[2]-1], &spec_dst, 4); opt->is_changed = 1; } optptr[2] += 4; opt->rr_needaddr = 1; } opt->rr = optptr - iph; break; case IPOPT_TIMESTAMP: if (opt->ts) { pp_ptr = optptr; goto error; } if (optlen < 4) { pp_ptr = optptr + 1; goto error; } if (optptr[2] < 5) { pp_ptr = optptr + 2; goto error; } if (optptr[2] <= optlen) { unsigned char *timeptr = NULL; if (optptr[2]+3 > optlen) { pp_ptr = optptr + 2; goto error; } switch (optptr[3]&0xF) { case IPOPT_TS_TSONLY: if (skb) timeptr = &optptr[optptr[2]-1]; opt->ts_needtime = 1; optptr[2] += 4; break; case IPOPT_TS_TSANDADDR: if (optptr[2]+7 > optlen) { pp_ptr = optptr + 2; goto error; } if (rt) { spec_dst_fill(&spec_dst, skb); memcpy(&optptr[optptr[2]-1], &spec_dst, 4); timeptr = &optptr[optptr[2]+3]; } opt->ts_needaddr = 1; opt->ts_needtime = 1; optptr[2] += 8; break; case IPOPT_TS_PRESPEC: if (optptr[2]+7 > optlen) { pp_ptr = optptr + 2; goto error; } { __be32 addr; memcpy(&addr, &optptr[optptr[2]-1], 4); if (inet_addr_type(net, addr) == RTN_UNICAST) break; if (skb) timeptr = &optptr[optptr[2]+3]; } opt->ts_needtime = 1; optptr[2] += 8; break; default: if (!skb && !ns_capable(net->user_ns, CAP_NET_RAW)) { pp_ptr = optptr + 3; goto error; } break; } if (timeptr) { __be32 midtime; midtime = inet_current_timestamp(); memcpy(timeptr, &midtime, 4); opt->is_changed = 1; } } else if ((optptr[3]&0xF) != IPOPT_TS_PRESPEC) { unsigned int overflow = optptr[3]>>4; if (overflow == 15) { pp_ptr = optptr + 3; goto error; } if (skb) { optptr[3] = (optptr[3]&0xF)|((overflow+1)<<4); opt->is_changed = 1; } } opt->ts = optptr - iph; break; case IPOPT_RA: if (optlen < 4) { pp_ptr = optptr + 1; goto error; } if (optptr[2] == 0 && optptr[3] == 0) opt->router_alert = optptr - iph; break; case IPOPT_CIPSO: if ((!skb && !ns_capable(net->user_ns, CAP_NET_RAW)) || opt->cipso) { pp_ptr = optptr; goto error; } opt->cipso = optptr - iph; if (cipso_v4_validate(skb, &optptr)) { pp_ptr = optptr; goto error; } break; case IPOPT_SEC: case IPOPT_SID: default: if (!skb && !ns_capable(net->user_ns, CAP_NET_RAW)) { pp_ptr = optptr; goto error; } break; } l -= optlen; optptr += optlen; } eol: if (!pp_ptr) return 0; error: if (info) *info = htonl((pp_ptr-iph)<<24); return -EINVAL; } EXPORT_SYMBOL(__ip_options_compile); int ip_options_compile(struct net *net, struct ip_options *opt, struct sk_buff *skb) { int ret; __be32 info; ret = __ip_options_compile(net, opt, skb, &info); if (ret != 0 && skb) icmp_send(skb, ICMP_PARAMETERPROB, 0, info); return ret; } EXPORT_SYMBOL(ip_options_compile); /* * Undo all the changes done by ip_options_compile(). */ void ip_options_undo(struct ip_options *opt) { if (opt->srr) { unsigned char *optptr = opt->__data + opt->srr - sizeof(struct iphdr); memmove(optptr + 7, optptr + 3, optptr[1] - 7); memcpy(optptr + 3, &opt->faddr, 4); } if (opt->rr_needaddr) { unsigned char *optptr = opt->__data + opt->rr - sizeof(struct iphdr); optptr[2] -= 4; memset(&optptr[optptr[2] - 1], 0, 4); } if (opt->ts) { unsigned char *optptr = opt->__data + opt->ts - sizeof(struct iphdr); if (opt->ts_needtime) { optptr[2] -= 4; memset(&optptr[optptr[2] - 1], 0, 4); if ((optptr[3] & 0xF) == IPOPT_TS_PRESPEC) optptr[2] -= 4; } if (opt->ts_needaddr) { optptr[2] -= 4; memset(&optptr[optptr[2] - 1], 0, 4); } } } int ip_options_get(struct net *net, struct ip_options_rcu **optp, sockptr_t data, int optlen) { struct ip_options_rcu *opt; opt = kzalloc(sizeof(struct ip_options_rcu) + ((optlen + 3) & ~3), GFP_KERNEL); if (!opt) return -ENOMEM; if (optlen && copy_from_sockptr(opt->opt.__data, data, optlen)) { kfree(opt); return -EFAULT; } while (optlen & 3) opt->opt.__data[optlen++] = IPOPT_END; opt->opt.optlen = optlen; if (optlen && ip_options_compile(net, &opt->opt, NULL)) { kfree(opt); return -EINVAL; } kfree(*optp); *optp = opt; return 0; } void ip_forward_options(struct sk_buff *skb) { struct ip_options *opt = &(IPCB(skb)->opt); unsigned char *optptr; struct rtable *rt = skb_rtable(skb); unsigned char *raw = skb_network_header(skb); if (opt->rr_needaddr) { optptr = (unsigned char *)raw + opt->rr; ip_rt_get_source(&optptr[optptr[2]-5], skb, rt); opt->is_changed = 1; } if (opt->srr_is_hit) { int srrptr, srrspace; optptr = raw + opt->srr; for ( srrptr = optptr[2], srrspace = optptr[1]; srrptr <= srrspace; srrptr += 4 ) { if (srrptr + 3 > srrspace) break; if (memcmp(&opt->nexthop, &optptr[srrptr-1], 4) == 0) break; } if (srrptr + 3 <= srrspace) { opt->is_changed = 1; ip_hdr(skb)->daddr = opt->nexthop; ip_rt_get_source(&optptr[srrptr-1], skb, rt); optptr[2] = srrptr+4; } else { net_crit_ratelimited("%s(): Argh! Destination lost!\n", __func__); } if (opt->ts_needaddr) { optptr = raw + opt->ts; ip_rt_get_source(&optptr[optptr[2]-9], skb, rt); opt->is_changed = 1; } } if (opt->is_changed) { opt->is_changed = 0; ip_send_check(ip_hdr(skb)); } } int ip_options_rcv_srr(struct sk_buff *skb, struct net_device *dev) { struct ip_options *opt = &(IPCB(skb)->opt); int srrspace, srrptr; __be32 nexthop; struct iphdr *iph = ip_hdr(skb); unsigned char *optptr = skb_network_header(skb) + opt->srr; struct rtable *rt = skb_rtable(skb); struct rtable *rt2; unsigned long orefdst; int err; if (!rt) return 0; if (skb->pkt_type != PACKET_HOST) return -EINVAL; if (rt->rt_type == RTN_UNICAST) { if (!opt->is_strictroute) return 0; icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl(16<<24)); return -EINVAL; } if (rt->rt_type != RTN_LOCAL) return -EINVAL; for (srrptr = optptr[2], srrspace = optptr[1]; srrptr <= srrspace; srrptr += 4) { if (srrptr + 3 > srrspace) { icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl((opt->srr+2)<<24)); return -EINVAL; } memcpy(&nexthop, &optptr[srrptr-1], 4); orefdst = skb_dstref_steal(skb); err = ip_route_input(skb, nexthop, iph->saddr, ip4h_dscp(iph), dev) ? -EINVAL : 0; rt2 = skb_rtable(skb); if (err || (rt2->rt_type != RTN_UNICAST && rt2->rt_type != RTN_LOCAL)) { skb_dst_drop(skb); skb_dstref_restore(skb, orefdst); return -EINVAL; } refdst_drop(orefdst); if (rt2->rt_type != RTN_LOCAL) break; /* Superfast 8) loopback forward */ iph->daddr = nexthop; opt->is_changed = 1; } if (srrptr <= srrspace) { opt->srr_is_hit = 1; opt->nexthop = nexthop; opt->is_changed = 1; } return 0; } EXPORT_SYMBOL(ip_options_rcv_srr);
5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 // SPDX-License-Identifier: GPL-2.0-only /* * IEEE 802.1Q GARP VLAN Registration Protocol (GVRP) * * Copyright (c) 2008 Patrick McHardy <kaber@trash.net> */ #include <linux/types.h> #include <linux/if_vlan.h> #include <net/garp.h> #include "vlan.h" #define GARP_GVRP_ADDRESS { 0x01, 0x80, 0xc2, 0x00, 0x00, 0x21 } enum gvrp_attributes { GVRP_ATTR_INVALID, GVRP_ATTR_VID, __GVRP_ATTR_MAX }; #define GVRP_ATTR_MAX (__GVRP_ATTR_MAX - 1) static struct garp_application vlan_gvrp_app __read_mostly = { .proto.group_address = GARP_GVRP_ADDRESS, .maxattr = GVRP_ATTR_MAX, .type = GARP_APPLICATION_GVRP, }; int vlan_gvrp_request_join(const struct net_device *dev) { const struct vlan_dev_priv *vlan = vlan_dev_priv(dev); __be16 vlan_id = htons(vlan->vlan_id); if (vlan->vlan_proto != htons(ETH_P_8021Q)) return 0; return garp_request_join(vlan->real_dev, &vlan_gvrp_app, &vlan_id, sizeof(vlan_id), GVRP_ATTR_VID); } void vlan_gvrp_request_leave(const struct net_device *dev) { const struct vlan_dev_priv *vlan = vlan_dev_priv(dev); __be16 vlan_id = htons(vlan->vlan_id); if (vlan->vlan_proto != htons(ETH_P_8021Q)) return; garp_request_leave(vlan->real_dev, &vlan_gvrp_app, &vlan_id, sizeof(vlan_id), GVRP_ATTR_VID); } int vlan_gvrp_init_applicant(struct net_device *dev) { return garp_init_applicant(dev, &vlan_gvrp_app); } void vlan_gvrp_uninit_applicant(struct net_device *dev) { garp_uninit_applicant(dev, &vlan_gvrp_app); } int __init vlan_gvrp_init(void) { return garp_register_application(&vlan_gvrp_app); } void vlan_gvrp_uninit(void) { garp_unregister_application(&vlan_gvrp_app); }
15 3 3 3 3 3 3 3 3 3 3 3 766 768 2 2 2 1 1 1 3 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 2 2 2 2 3 3 2 1 1 1 1 1 2 4 3 3 2 3 1 1 4 2 1 2 1 1 1 6 6 6 6 6 5 1 5 5 5 5 6 6 4 2 2 5 4 2 9 8 7 6 6 5 5 4 9 4 4 4 4 4 4 4 5 2 2 1 1 4 3 3 5 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 2 1 6 2 6 4 4 2 3 1 2 3 6 2 3 3 3 2 1 1 1 8 7 6 5 5 4 5 4 4 1 3 3 3 4 5 5 8 1 1 5 2 2 982 658 658 658 155 2 983 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 // SPDX-License-Identifier: GPL-2.0-only #include <linux/netdevice.h> #include <linux/notifier.h> #include <linux/rtnetlink.h> #include <net/busy_poll.h> #include <net/net_namespace.h> #include <net/netdev_queues.h> #include <net/netdev_rx_queue.h> #include <net/sock.h> #include <net/xdp.h> #include <net/xdp_sock.h> #include <net/page_pool/memory_provider.h> #include "dev.h" #include "devmem.h" #include "netdev-genl-gen.h" struct netdev_nl_dump_ctx { unsigned long ifindex; unsigned int rxq_idx; unsigned int txq_idx; unsigned int napi_id; }; static struct netdev_nl_dump_ctx *netdev_dump_ctx(struct netlink_callback *cb) { NL_ASSERT_CTX_FITS(struct netdev_nl_dump_ctx); return (struct netdev_nl_dump_ctx *)cb->ctx; } static int netdev_nl_dev_fill(struct net_device *netdev, struct sk_buff *rsp, const struct genl_info *info) { u64 xsk_features = 0; u64 xdp_rx_meta = 0; void *hdr; netdev_assert_locked(netdev); /* note: rtnl_lock may not be held! */ hdr = genlmsg_iput(rsp, info); if (!hdr) return -EMSGSIZE; #define XDP_METADATA_KFUNC(_, flag, __, xmo) \ if (netdev->xdp_metadata_ops && netdev->xdp_metadata_ops->xmo) \ xdp_rx_meta |= flag; XDP_METADATA_KFUNC_xxx #undef XDP_METADATA_KFUNC if (netdev->xsk_tx_metadata_ops) { if (netdev->xsk_tx_metadata_ops->tmo_fill_timestamp) xsk_features |= NETDEV_XSK_FLAGS_TX_TIMESTAMP; if (netdev->xsk_tx_metadata_ops->tmo_request_checksum) xsk_features |= NETDEV_XSK_FLAGS_TX_CHECKSUM; if (netdev->xsk_tx_metadata_ops->tmo_request_launch_time) xsk_features |= NETDEV_XSK_FLAGS_TX_LAUNCH_TIME_FIFO; } if (nla_put_u32(rsp, NETDEV_A_DEV_IFINDEX, netdev->ifindex) || nla_put_u64_64bit(rsp, NETDEV_A_DEV_XDP_FEATURES, netdev->xdp_features, NETDEV_A_DEV_PAD) || nla_put_u64_64bit(rsp, NETDEV_A_DEV_XDP_RX_METADATA_FEATURES, xdp_rx_meta, NETDEV_A_DEV_PAD) || nla_put_u64_64bit(rsp, NETDEV_A_DEV_XSK_FEATURES, xsk_features, NETDEV_A_DEV_PAD)) goto err_cancel_msg; if (netdev->xdp_features & NETDEV_XDP_ACT_XSK_ZEROCOPY) { if (nla_put_u32(rsp, NETDEV_A_DEV_XDP_ZC_MAX_SEGS, netdev->xdp_zc_max_segs)) goto err_cancel_msg; } genlmsg_end(rsp, hdr); return 0; err_cancel_msg: genlmsg_cancel(rsp, hdr); return -EMSGSIZE; } static void netdev_genl_dev_notify(struct net_device *netdev, int cmd) { struct genl_info info; struct sk_buff *ntf; if (!genl_has_listeners(&netdev_nl_family, dev_net(netdev), NETDEV_NLGRP_MGMT)) return; genl_info_init_ntf(&info, &netdev_nl_family, cmd); ntf = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!ntf) return; if (netdev_nl_dev_fill(netdev, ntf, &info)) { nlmsg_free(ntf); return; } genlmsg_multicast_netns(&netdev_nl_family, dev_net(netdev), ntf, 0, NETDEV_NLGRP_MGMT, GFP_KERNEL); } int netdev_nl_dev_get_doit(struct sk_buff *skb, struct genl_info *info) { struct net_device *netdev; struct sk_buff *rsp; u32 ifindex; int err; if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_DEV_IFINDEX)) return -EINVAL; ifindex = nla_get_u32(info->attrs[NETDEV_A_DEV_IFINDEX]); rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!rsp) return -ENOMEM; netdev = netdev_get_by_index_lock(genl_info_net(info), ifindex); if (!netdev) { err = -ENODEV; goto err_free_msg; } err = netdev_nl_dev_fill(netdev, rsp, info); netdev_unlock(netdev); if (err) goto err_free_msg; return genlmsg_reply(rsp, info); err_free_msg: nlmsg_free(rsp); return err; } int netdev_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { struct netdev_nl_dump_ctx *ctx = netdev_dump_ctx(cb); struct net *net = sock_net(skb->sk); int err; for_each_netdev_lock_scoped(net, netdev, ctx->ifindex) { err = netdev_nl_dev_fill(netdev, skb, genl_info_dump(cb)); if (err < 0) return err; } return 0; } static int netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi, const struct genl_info *info) { unsigned long irq_suspend_timeout; unsigned long gro_flush_timeout; u32 napi_defer_hard_irqs; void *hdr; pid_t pid; if (!napi->dev->up) return 0; hdr = genlmsg_iput(rsp, info); if (!hdr) return -EMSGSIZE; if (nla_put_u32(rsp, NETDEV_A_NAPI_ID, napi->napi_id)) goto nla_put_failure; if (nla_put_u32(rsp, NETDEV_A_NAPI_IFINDEX, napi->dev->ifindex)) goto nla_put_failure; if (napi->irq >= 0 && nla_put_u32(rsp, NETDEV_A_NAPI_IRQ, napi->irq)) goto nla_put_failure; if (nla_put_uint(rsp, NETDEV_A_NAPI_THREADED, napi_get_threaded(napi))) goto nla_put_failure; if (napi->thread) { pid = task_pid_nr(napi->thread); if (nla_put_u32(rsp, NETDEV_A_NAPI_PID, pid)) goto nla_put_failure; } napi_defer_hard_irqs = napi_get_defer_hard_irqs(napi); if (nla_put_s32(rsp, NETDEV_A_NAPI_DEFER_HARD_IRQS, napi_defer_hard_irqs)) goto nla_put_failure; irq_suspend_timeout = napi_get_irq_suspend_timeout(napi); if (nla_put_uint(rsp, NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT, irq_suspend_timeout)) goto nla_put_failure; gro_flush_timeout = napi_get_gro_flush_timeout(napi); if (nla_put_uint(rsp, NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT, gro_flush_timeout)) goto nla_put_failure; genlmsg_end(rsp, hdr); return 0; nla_put_failure: genlmsg_cancel(rsp, hdr); return -EMSGSIZE; } int netdev_nl_napi_get_doit(struct sk_buff *skb, struct genl_info *info) { struct napi_struct *napi; struct sk_buff *rsp; u32 napi_id; int err; if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_NAPI_ID)) return -EINVAL; napi_id = nla_get_u32(info->attrs[NETDEV_A_NAPI_ID]); rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!rsp) return -ENOMEM; napi = netdev_napi_by_id_lock(genl_info_net(info), napi_id); if (napi) { err = netdev_nl_napi_fill_one(rsp, napi, info); netdev_unlock(napi->dev); } else { NL_SET_BAD_ATTR(info->extack, info->attrs[NETDEV_A_NAPI_ID]); err = -ENOENT; } if (err) { goto err_free_msg; } else if (!rsp->len) { err = -ENOENT; goto err_free_msg; } return genlmsg_reply(rsp, info); err_free_msg: nlmsg_free(rsp); return err; } static int netdev_nl_napi_dump_one(struct net_device *netdev, struct sk_buff *rsp, const struct genl_info *info, struct netdev_nl_dump_ctx *ctx) { struct napi_struct *napi; unsigned int prev_id; int err = 0; if (!netdev->up) return err; prev_id = UINT_MAX; list_for_each_entry(napi, &netdev->napi_list, dev_list) { if (!napi_id_valid(napi->napi_id)) continue; /* Dump continuation below depends on the list being sorted */ WARN_ON_ONCE(napi->napi_id >= prev_id); prev_id = napi->napi_id; if (ctx->napi_id && napi->napi_id >= ctx->napi_id) continue; err = netdev_nl_napi_fill_one(rsp, napi, info); if (err) return err; ctx->napi_id = napi->napi_id; } return err; } int netdev_nl_napi_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { struct netdev_nl_dump_ctx *ctx = netdev_dump_ctx(cb); const struct genl_info *info = genl_info_dump(cb); struct net *net = sock_net(skb->sk); struct net_device *netdev; u32 ifindex = 0; int err = 0; if (info->attrs[NETDEV_A_NAPI_IFINDEX]) ifindex = nla_get_u32(info->attrs[NETDEV_A_NAPI_IFINDEX]); if (ifindex) { netdev = netdev_get_by_index_lock(net, ifindex); if (netdev) { err = netdev_nl_napi_dump_one(netdev, skb, info, ctx); netdev_unlock(netdev); } else { err = -ENODEV; } } else { for_each_netdev_lock_scoped(net, netdev, ctx->ifindex) { err = netdev_nl_napi_dump_one(netdev, skb, info, ctx); if (err < 0) break; ctx->napi_id = 0; } } return err; } static int netdev_nl_napi_set_config(struct napi_struct *napi, struct genl_info *info) { u64 irq_suspend_timeout = 0; u64 gro_flush_timeout = 0; u8 threaded = 0; u32 defer = 0; if (info->attrs[NETDEV_A_NAPI_THREADED]) { int ret; threaded = nla_get_uint(info->attrs[NETDEV_A_NAPI_THREADED]); ret = napi_set_threaded(napi, threaded); if (ret) return ret; } if (info->attrs[NETDEV_A_NAPI_DEFER_HARD_IRQS]) { defer = nla_get_u32(info->attrs[NETDEV_A_NAPI_DEFER_HARD_IRQS]); napi_set_defer_hard_irqs(napi, defer); } if (info->attrs[NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT]) { irq_suspend_timeout = nla_get_uint(info->attrs[NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT]); napi_set_irq_suspend_timeout(napi, irq_suspend_timeout); } if (info->attrs[NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT]) { gro_flush_timeout = nla_get_uint(info->attrs[NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT]); napi_set_gro_flush_timeout(napi, gro_flush_timeout); } return 0; } int netdev_nl_napi_set_doit(struct sk_buff *skb, struct genl_info *info) { struct napi_struct *napi; unsigned int napi_id; int err; if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_NAPI_ID)) return -EINVAL; napi_id = nla_get_u32(info->attrs[NETDEV_A_NAPI_ID]); napi = netdev_napi_by_id_lock(genl_info_net(info), napi_id); if (napi) { err = netdev_nl_napi_set_config(napi, info); netdev_unlock(napi->dev); } else { NL_SET_BAD_ATTR(info->extack, info->attrs[NETDEV_A_NAPI_ID]); err = -ENOENT; } return err; } static int nla_put_napi_id(struct sk_buff *skb, const struct napi_struct *napi) { if (napi && napi_id_valid(napi->napi_id)) return nla_put_u32(skb, NETDEV_A_QUEUE_NAPI_ID, napi->napi_id); return 0; } static int netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev, u32 q_idx, u32 q_type, const struct genl_info *info) { struct pp_memory_provider_params *params; struct netdev_rx_queue *rxq; struct netdev_queue *txq; void *hdr; hdr = genlmsg_iput(rsp, info); if (!hdr) return -EMSGSIZE; if (nla_put_u32(rsp, NETDEV_A_QUEUE_ID, q_idx) || nla_put_u32(rsp, NETDEV_A_QUEUE_TYPE, q_type) || nla_put_u32(rsp, NETDEV_A_QUEUE_IFINDEX, netdev->ifindex)) goto nla_put_failure; switch (q_type) { case NETDEV_QUEUE_TYPE_RX: rxq = __netif_get_rx_queue(netdev, q_idx); if (nla_put_napi_id(rsp, rxq->napi)) goto nla_put_failure; params = &rxq->mp_params; if (params->mp_ops && params->mp_ops->nl_fill(params->mp_priv, rsp, rxq)) goto nla_put_failure; #ifdef CONFIG_XDP_SOCKETS if (rxq->pool) if (nla_put_empty_nest(rsp, NETDEV_A_QUEUE_XSK)) goto nla_put_failure; #endif break; case NETDEV_QUEUE_TYPE_TX: txq = netdev_get_tx_queue(netdev, q_idx); if (nla_put_napi_id(rsp, txq->napi)) goto nla_put_failure; #ifdef CONFIG_XDP_SOCKETS if (txq->pool) if (nla_put_empty_nest(rsp, NETDEV_A_QUEUE_XSK)) goto nla_put_failure; #endif break; } genlmsg_end(rsp, hdr); return 0; nla_put_failure: genlmsg_cancel(rsp, hdr); return -EMSGSIZE; } static int netdev_nl_queue_validate(struct net_device *netdev, u32 q_id, u32 q_type) { switch (q_type) { case NETDEV_QUEUE_TYPE_RX: if (q_id >= netdev->real_num_rx_queues) return -EINVAL; return 0; case NETDEV_QUEUE_TYPE_TX: if (q_id >= netdev->real_num_tx_queues) return -EINVAL; } return 0; } static int netdev_nl_queue_fill(struct sk_buff *rsp, struct net_device *netdev, u32 q_idx, u32 q_type, const struct genl_info *info) { int err; if (!netdev->up) return -ENOENT; err = netdev_nl_queue_validate(netdev, q_idx, q_type); if (err) return err; return netdev_nl_queue_fill_one(rsp, netdev, q_idx, q_type, info); } int netdev_nl_queue_get_doit(struct sk_buff *skb, struct genl_info *info) { u32 q_id, q_type, ifindex; struct net_device *netdev; struct sk_buff *rsp; int err; if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_ID) || GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_TYPE) || GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_IFINDEX)) return -EINVAL; q_id = nla_get_u32(info->attrs[NETDEV_A_QUEUE_ID]); q_type = nla_get_u32(info->attrs[NETDEV_A_QUEUE_TYPE]); ifindex = nla_get_u32(info->attrs[NETDEV_A_QUEUE_IFINDEX]); rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!rsp) return -ENOMEM; netdev = netdev_get_by_index_lock_ops_compat(genl_info_net(info), ifindex); if (netdev) { err = netdev_nl_queue_fill(rsp, netdev, q_id, q_type, info); netdev_unlock_ops_compat(netdev); } else { err = -ENODEV; } if (err) goto err_free_msg; return genlmsg_reply(rsp, info); err_free_msg: nlmsg_free(rsp); return err; } static int netdev_nl_queue_dump_one(struct net_device *netdev, struct sk_buff *rsp, const struct genl_info *info, struct netdev_nl_dump_ctx *ctx) { int err = 0; if (!netdev->up) return err; for (; ctx->rxq_idx < netdev->real_num_rx_queues; ctx->rxq_idx++) { err = netdev_nl_queue_fill_one(rsp, netdev, ctx->rxq_idx, NETDEV_QUEUE_TYPE_RX, info); if (err) return err; } for (; ctx->txq_idx < netdev->real_num_tx_queues; ctx->txq_idx++) { err = netdev_nl_queue_fill_one(rsp, netdev, ctx->txq_idx, NETDEV_QUEUE_TYPE_TX, info); if (err) return err; } return err; } int netdev_nl_queue_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { struct netdev_nl_dump_ctx *ctx = netdev_dump_ctx(cb); const struct genl_info *info = genl_info_dump(cb); struct net *net = sock_net(skb->sk); struct net_device *netdev; u32 ifindex = 0; int err = 0; if (info->attrs[NETDEV_A_QUEUE_IFINDEX]) ifindex = nla_get_u32(info->attrs[NETDEV_A_QUEUE_IFINDEX]); if (ifindex) { netdev = netdev_get_by_index_lock_ops_compat(net, ifindex); if (netdev) { err = netdev_nl_queue_dump_one(netdev, skb, info, ctx); netdev_unlock_ops_compat(netdev); } else { err = -ENODEV; } } else { for_each_netdev_lock_ops_compat_scoped(net, netdev, ctx->ifindex) { err = netdev_nl_queue_dump_one(netdev, skb, info, ctx); if (err < 0) break; ctx->rxq_idx = 0; ctx->txq_idx = 0; } } return err; } #define NETDEV_STAT_NOT_SET (~0ULL) static void netdev_nl_stats_add(void *_sum, const void *_add, size_t size) { const u64 *add = _add; u64 *sum = _sum; while (size) { if (*add != NETDEV_STAT_NOT_SET && *sum != NETDEV_STAT_NOT_SET) *sum += *add; sum++; add++; size -= 8; } } static int netdev_stat_put(struct sk_buff *rsp, unsigned int attr_id, u64 value) { if (value == NETDEV_STAT_NOT_SET) return 0; return nla_put_uint(rsp, attr_id, value); } static int netdev_nl_stats_write_rx(struct sk_buff *rsp, struct netdev_queue_stats_rx *rx) { if (netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_PACKETS, rx->packets) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_BYTES, rx->bytes) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_ALLOC_FAIL, rx->alloc_fail) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_DROPS, rx->hw_drops) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_DROP_OVERRUNS, rx->hw_drop_overruns) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_COMPLETE, rx->csum_complete) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_UNNECESSARY, rx->csum_unnecessary) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_NONE, rx->csum_none) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_BAD, rx->csum_bad) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_GRO_PACKETS, rx->hw_gro_packets) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_GRO_BYTES, rx->hw_gro_bytes) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_GRO_WIRE_PACKETS, rx->hw_gro_wire_packets) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_GRO_WIRE_BYTES, rx->hw_gro_wire_bytes) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_DROP_RATELIMITS, rx->hw_drop_ratelimits)) return -EMSGSIZE; return 0; } static int netdev_nl_stats_write_tx(struct sk_buff *rsp, struct netdev_queue_stats_tx *tx) { if (netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_PACKETS, tx->packets) || netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_BYTES, tx->bytes) || netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_DROPS, tx->hw_drops) || netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_DROP_ERRORS, tx->hw_drop_errors) || netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_CSUM_NONE, tx->csum_none) || netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_NEEDS_CSUM, tx->needs_csum) || netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_PACKETS, tx->hw_gso_packets) || netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_BYTES, tx->hw_gso_bytes) || netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_WIRE_PACKETS, tx->hw_gso_wire_packets) || netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_WIRE_BYTES, tx->hw_gso_wire_bytes) || netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_DROP_RATELIMITS, tx->hw_drop_ratelimits) || netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_STOP, tx->stop) || netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_WAKE, tx->wake)) return -EMSGSIZE; return 0; } static int netdev_nl_stats_queue(struct net_device *netdev, struct sk_buff *rsp, u32 q_type, int i, const struct genl_info *info) { const struct netdev_stat_ops *ops = netdev->stat_ops; struct netdev_queue_stats_rx rx; struct netdev_queue_stats_tx tx; void *hdr; hdr = genlmsg_iput(rsp, info); if (!hdr) return -EMSGSIZE; if (nla_put_u32(rsp, NETDEV_A_QSTATS_IFINDEX, netdev->ifindex) || nla_put_u32(rsp, NETDEV_A_QSTATS_QUEUE_TYPE, q_type) || nla_put_u32(rsp, NETDEV_A_QSTATS_QUEUE_ID, i)) goto nla_put_failure; switch (q_type) { case NETDEV_QUEUE_TYPE_RX: memset(&rx, 0xff, sizeof(rx)); ops->get_queue_stats_rx(netdev, i, &rx); if (!memchr_inv(&rx, 0xff, sizeof(rx))) goto nla_cancel; if (netdev_nl_stats_write_rx(rsp, &rx)) goto nla_put_failure; break; case NETDEV_QUEUE_TYPE_TX: memset(&tx, 0xff, sizeof(tx)); ops->get_queue_stats_tx(netdev, i, &tx); if (!memchr_inv(&tx, 0xff, sizeof(tx))) goto nla_cancel; if (netdev_nl_stats_write_tx(rsp, &tx)) goto nla_put_failure; break; } genlmsg_end(rsp, hdr); return 0; nla_cancel: genlmsg_cancel(rsp, hdr); return 0; nla_put_failure: genlmsg_cancel(rsp, hdr); return -EMSGSIZE; } static int netdev_nl_stats_by_queue(struct net_device *netdev, struct sk_buff *rsp, const struct genl_info *info, struct netdev_nl_dump_ctx *ctx) { const struct netdev_stat_ops *ops = netdev->stat_ops; int i, err; if (!(netdev->flags & IFF_UP)) return 0; i = ctx->rxq_idx; while (ops->get_queue_stats_rx && i < netdev->real_num_rx_queues) { err = netdev_nl_stats_queue(netdev, rsp, NETDEV_QUEUE_TYPE_RX, i, info); if (err) return err; ctx->rxq_idx = ++i; } i = ctx->txq_idx; while (ops->get_queue_stats_tx && i < netdev->real_num_tx_queues) { err = netdev_nl_stats_queue(netdev, rsp, NETDEV_QUEUE_TYPE_TX, i, info); if (err) return err; ctx->txq_idx = ++i; } ctx->rxq_idx = 0; ctx->txq_idx = 0; return 0; } /** * netdev_stat_queue_sum() - add up queue stats from range of queues * @netdev: net_device * @rx_start: index of the first Rx queue to query * @rx_end: index after the last Rx queue (first *not* to query) * @rx_sum: output Rx stats, should be already initialized * @tx_start: index of the first Tx queue to query * @tx_end: index after the last Tx queue (first *not* to query) * @tx_sum: output Tx stats, should be already initialized * * Add stats from [start, end) range of queue IDs to *x_sum structs. * The sum structs must be already initialized. Usually this * helper is invoked from the .get_base_stats callbacks of drivers * to account for stats of disabled queues. In that case the ranges * are usually [netdev->real_num_*x_queues, netdev->num_*x_queues). */ void netdev_stat_queue_sum(struct net_device *netdev, int rx_start, int rx_end, struct netdev_queue_stats_rx *rx_sum, int tx_start, int tx_end, struct netdev_queue_stats_tx *tx_sum) { const struct netdev_stat_ops *ops; struct netdev_queue_stats_rx rx; struct netdev_queue_stats_tx tx; int i; ops = netdev->stat_ops; for (i = rx_start; i < rx_end; i++) { memset(&rx, 0xff, sizeof(rx)); if (ops->get_queue_stats_rx) ops->get_queue_stats_rx(netdev, i, &rx); netdev_nl_stats_add(rx_sum, &rx, sizeof(rx)); } for (i = tx_start; i < tx_end; i++) { memset(&tx, 0xff, sizeof(tx)); if (ops->get_queue_stats_tx) ops->get_queue_stats_tx(netdev, i, &tx); netdev_nl_stats_add(tx_sum, &tx, sizeof(tx)); } } EXPORT_SYMBOL(netdev_stat_queue_sum); static int netdev_nl_stats_by_netdev(struct net_device *netdev, struct sk_buff *rsp, const struct genl_info *info) { struct netdev_queue_stats_rx rx_sum; struct netdev_queue_stats_tx tx_sum; void *hdr; /* Netdev can't guarantee any complete counters */ if (!netdev->stat_ops->get_base_stats) return 0; memset(&rx_sum, 0xff, sizeof(rx_sum)); memset(&tx_sum, 0xff, sizeof(tx_sum)); netdev->stat_ops->get_base_stats(netdev, &rx_sum, &tx_sum); /* The op was there, but nothing reported, don't bother */ if (!memchr_inv(&rx_sum, 0xff, sizeof(rx_sum)) && !memchr_inv(&tx_sum, 0xff, sizeof(tx_sum))) return 0; hdr = genlmsg_iput(rsp, info); if (!hdr) return -EMSGSIZE; if (nla_put_u32(rsp, NETDEV_A_QSTATS_IFINDEX, netdev->ifindex)) goto nla_put_failure; netdev_stat_queue_sum(netdev, 0, netdev->real_num_rx_queues, &rx_sum, 0, netdev->real_num_tx_queues, &tx_sum); if (netdev_nl_stats_write_rx(rsp, &rx_sum) || netdev_nl_stats_write_tx(rsp, &tx_sum)) goto nla_put_failure; genlmsg_end(rsp, hdr); return 0; nla_put_failure: genlmsg_cancel(rsp, hdr); return -EMSGSIZE; } static int netdev_nl_qstats_get_dump_one(struct net_device *netdev, unsigned int scope, struct sk_buff *skb, const struct genl_info *info, struct netdev_nl_dump_ctx *ctx) { if (!netdev->stat_ops) return 0; switch (scope) { case 0: return netdev_nl_stats_by_netdev(netdev, skb, info); case NETDEV_QSTATS_SCOPE_QUEUE: return netdev_nl_stats_by_queue(netdev, skb, info, ctx); } return -EINVAL; /* Should not happen, per netlink policy */ } int netdev_nl_qstats_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { struct netdev_nl_dump_ctx *ctx = netdev_dump_ctx(cb); const struct genl_info *info = genl_info_dump(cb); struct net *net = sock_net(skb->sk); struct net_device *netdev; unsigned int ifindex; unsigned int scope; int err = 0; scope = 0; if (info->attrs[NETDEV_A_QSTATS_SCOPE]) scope = nla_get_uint(info->attrs[NETDEV_A_QSTATS_SCOPE]); ifindex = 0; if (info->attrs[NETDEV_A_QSTATS_IFINDEX]) ifindex = nla_get_u32(info->attrs[NETDEV_A_QSTATS_IFINDEX]); if (ifindex) { netdev = netdev_get_by_index_lock_ops_compat(net, ifindex); if (!netdev) { NL_SET_BAD_ATTR(info->extack, info->attrs[NETDEV_A_QSTATS_IFINDEX]); return -ENODEV; } if (netdev->stat_ops) { err = netdev_nl_qstats_get_dump_one(netdev, scope, skb, info, ctx); } else { NL_SET_BAD_ATTR(info->extack, info->attrs[NETDEV_A_QSTATS_IFINDEX]); err = -EOPNOTSUPP; } netdev_unlock_ops_compat(netdev); return err; } for_each_netdev_lock_ops_compat_scoped(net, netdev, ctx->ifindex) { err = netdev_nl_qstats_get_dump_one(netdev, scope, skb, info, ctx); if (err < 0) break; } return err; } static int netdev_nl_read_rxq_bitmap(struct genl_info *info, u32 rxq_bitmap_len, unsigned long *rxq_bitmap) { const int maxtype = ARRAY_SIZE(netdev_queue_id_nl_policy) - 1; struct nlattr *tb[ARRAY_SIZE(netdev_queue_id_nl_policy)]; struct nlattr *attr; int rem, err = 0; u32 rxq_idx; nla_for_each_attr_type(attr, NETDEV_A_DMABUF_QUEUES, genlmsg_data(info->genlhdr), genlmsg_len(info->genlhdr), rem) { err = nla_parse_nested(tb, maxtype, attr, netdev_queue_id_nl_policy, info->extack); if (err < 0) return err; if (NL_REQ_ATTR_CHECK(info->extack, attr, tb, NETDEV_A_QUEUE_ID) || NL_REQ_ATTR_CHECK(info->extack, attr, tb, NETDEV_A_QUEUE_TYPE)) return -EINVAL; if (nla_get_u32(tb[NETDEV_A_QUEUE_TYPE]) != NETDEV_QUEUE_TYPE_RX) { NL_SET_BAD_ATTR(info->extack, tb[NETDEV_A_QUEUE_TYPE]); return -EINVAL; } rxq_idx = nla_get_u32(tb[NETDEV_A_QUEUE_ID]); if (rxq_idx >= rxq_bitmap_len) { NL_SET_BAD_ATTR(info->extack, tb[NETDEV_A_QUEUE_ID]); return -EINVAL; } bitmap_set(rxq_bitmap, rxq_idx, 1); } return 0; } static struct device * netdev_nl_get_dma_dev(struct net_device *netdev, unsigned long *rxq_bitmap, struct netlink_ext_ack *extack) { struct device *dma_dev = NULL; u32 rxq_idx, prev_rxq_idx; for_each_set_bit(rxq_idx, rxq_bitmap, netdev->real_num_rx_queues) { struct device *rxq_dma_dev; rxq_dma_dev = netdev_queue_get_dma_dev(netdev, rxq_idx); if (dma_dev && rxq_dma_dev != dma_dev) { NL_SET_ERR_MSG_FMT(extack, "DMA device mismatch between queue %u and %u (multi-PF device?)", rxq_idx, prev_rxq_idx); return ERR_PTR(-EOPNOTSUPP); } dma_dev = rxq_dma_dev; prev_rxq_idx = rxq_idx; } return dma_dev; } int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info) { struct net_devmem_dmabuf_binding *binding; u32 ifindex, dmabuf_fd, rxq_idx; struct netdev_nl_sock *priv; struct net_device *netdev; unsigned long *rxq_bitmap; struct device *dma_dev; struct sk_buff *rsp; int err = 0; void *hdr; if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_DEV_IFINDEX) || GENL_REQ_ATTR_CHECK(info, NETDEV_A_DMABUF_FD) || GENL_REQ_ATTR_CHECK(info, NETDEV_A_DMABUF_QUEUES)) return -EINVAL; ifindex = nla_get_u32(info->attrs[NETDEV_A_DEV_IFINDEX]); dmabuf_fd = nla_get_u32(info->attrs[NETDEV_A_DMABUF_FD]); priv = genl_sk_priv_get(&netdev_nl_family, NETLINK_CB(skb).sk); if (IS_ERR(priv)) return PTR_ERR(priv); rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!rsp) return -ENOMEM; hdr = genlmsg_iput(rsp, info); if (!hdr) { err = -EMSGSIZE; goto err_genlmsg_free; } mutex_lock(&priv->lock); err = 0; netdev = netdev_get_by_index_lock(genl_info_net(info), ifindex); if (!netdev) { err = -ENODEV; goto err_unlock_sock; } if (!netif_device_present(netdev)) err = -ENODEV; else if (!netdev_need_ops_lock(netdev)) err = -EOPNOTSUPP; if (err) { NL_SET_BAD_ATTR(info->extack, info->attrs[NETDEV_A_DEV_IFINDEX]); goto err_unlock; } rxq_bitmap = bitmap_zalloc(netdev->real_num_rx_queues, GFP_KERNEL); if (!rxq_bitmap) { err = -ENOMEM; goto err_unlock; } err = netdev_nl_read_rxq_bitmap(info, netdev->real_num_rx_queues, rxq_bitmap); if (err) goto err_rxq_bitmap; dma_dev = netdev_nl_get_dma_dev(netdev, rxq_bitmap, info->extack); if (IS_ERR(dma_dev)) { err = PTR_ERR(dma_dev); goto err_rxq_bitmap; } binding = net_devmem_bind_dmabuf(netdev, dma_dev, DMA_FROM_DEVICE, dmabuf_fd, priv, info->extack); if (IS_ERR(binding)) { err = PTR_ERR(binding); goto err_rxq_bitmap; } for_each_set_bit(rxq_idx, rxq_bitmap, netdev->real_num_rx_queues) { err = net_devmem_bind_dmabuf_to_queue(netdev, rxq_idx, binding, info->extack); if (err) goto err_unbind; } nla_put_u32(rsp, NETDEV_A_DMABUF_ID, binding->id); genlmsg_end(rsp, hdr); err = genlmsg_reply(rsp, info); if (err) goto err_unbind; bitmap_free(rxq_bitmap); netdev_unlock(netdev); mutex_unlock(&priv->lock); return 0; err_unbind: net_devmem_unbind_dmabuf(binding); err_rxq_bitmap: bitmap_free(rxq_bitmap); err_unlock: netdev_unlock(netdev); err_unlock_sock: mutex_unlock(&priv->lock); err_genlmsg_free: nlmsg_free(rsp); return err; } int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info) { struct net_devmem_dmabuf_binding *binding; struct netdev_nl_sock *priv; struct net_device *netdev; struct device *dma_dev; u32 ifindex, dmabuf_fd; struct sk_buff *rsp; int err = 0; void *hdr; if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_DEV_IFINDEX) || GENL_REQ_ATTR_CHECK(info, NETDEV_A_DMABUF_FD)) return -EINVAL; ifindex = nla_get_u32(info->attrs[NETDEV_A_DEV_IFINDEX]); dmabuf_fd = nla_get_u32(info->attrs[NETDEV_A_DMABUF_FD]); priv = genl_sk_priv_get(&netdev_nl_family, NETLINK_CB(skb).sk); if (IS_ERR(priv)) return PTR_ERR(priv); rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!rsp) return -ENOMEM; hdr = genlmsg_iput(rsp, info); if (!hdr) { err = -EMSGSIZE; goto err_genlmsg_free; } mutex_lock(&priv->lock); netdev = netdev_get_by_index_lock(genl_info_net(info), ifindex); if (!netdev) { err = -ENODEV; goto err_unlock_sock; } if (!netif_device_present(netdev)) { err = -ENODEV; goto err_unlock_netdev; } if (!netdev->netmem_tx) { err = -EOPNOTSUPP; NL_SET_ERR_MSG(info->extack, "Driver does not support netmem TX"); goto err_unlock_netdev; } dma_dev = netdev_queue_get_dma_dev(netdev, 0); binding = net_devmem_bind_dmabuf(netdev, dma_dev, DMA_TO_DEVICE, dmabuf_fd, priv, info->extack); if (IS_ERR(binding)) { err = PTR_ERR(binding); goto err_unlock_netdev; } nla_put_u32(rsp, NETDEV_A_DMABUF_ID, binding->id); genlmsg_end(rsp, hdr); netdev_unlock(netdev); mutex_unlock(&priv->lock); return genlmsg_reply(rsp, info); err_unlock_netdev: netdev_unlock(netdev); err_unlock_sock: mutex_unlock(&priv->lock); err_genlmsg_free: nlmsg_free(rsp); return err; } void netdev_nl_sock_priv_init(struct netdev_nl_sock *priv) { INIT_LIST_HEAD(&priv->bindings); mutex_init(&priv->lock); } void netdev_nl_sock_priv_destroy(struct netdev_nl_sock *priv) { struct net_devmem_dmabuf_binding *binding; struct net_devmem_dmabuf_binding *temp; netdevice_tracker dev_tracker; struct net_device *dev; mutex_lock(&priv->lock); list_for_each_entry_safe(binding, temp, &priv->bindings, list) { mutex_lock(&binding->lock); dev = binding->dev; if (!dev) { mutex_unlock(&binding->lock); net_devmem_unbind_dmabuf(binding); continue; } netdev_hold(dev, &dev_tracker, GFP_KERNEL); mutex_unlock(&binding->lock); netdev_lock(dev); net_devmem_unbind_dmabuf(binding); netdev_unlock(dev); netdev_put(dev, &dev_tracker); } mutex_unlock(&priv->lock); } static int netdev_genl_netdevice_event(struct notifier_block *nb, unsigned long event, void *ptr) { struct net_device *netdev = netdev_notifier_info_to_dev(ptr); switch (event) { case NETDEV_REGISTER: netdev_lock_ops_to_full(netdev); netdev_genl_dev_notify(netdev, NETDEV_CMD_DEV_ADD_NTF); netdev_unlock_full_to_ops(netdev); break; case NETDEV_UNREGISTER: netdev_lock(netdev); netdev_genl_dev_notify(netdev, NETDEV_CMD_DEV_DEL_NTF); netdev_unlock(netdev); break; case NETDEV_XDP_FEAT_CHANGE: netdev_genl_dev_notify(netdev, NETDEV_CMD_DEV_CHANGE_NTF); break; } return NOTIFY_OK; } static struct notifier_block netdev_genl_nb = { .notifier_call = netdev_genl_netdevice_event, }; static int __init netdev_genl_init(void) { int err; err = register_netdevice_notifier(&netdev_genl_nb); if (err) return err; err = genl_register_family(&netdev_nl_family); if (err) goto err_unreg_ntf; return 0; err_unreg_ntf: unregister_netdevice_notifier(&netdev_genl_nb); return err; } subsys_initcall(netdev_genl_init);
5 5 8 8 742 748 59 497 497 496 126 125 126 126 126 126 2 2 2 2 2 2 2 7601 7643 7589 7636 7592 926 924 925 542 540 460 32 31 32 32 459 459 460 16 53 16 391 442 433 19 21 21 21 21 21 67 67 67 67 67 67 24 31 51 3366 3354 94 3197 3194 6169 6184 6166 6185 6183 6173 3612 3621 8438 8508 8421 8429 2 2 7908 69 67 733 2598 703 573 484 731 330 732 2575 80 34 50 50 50 50 50 50 50 50 135 135 135 134 135 14 14 14 17 17 17 13 13 13 13 10 10 7 1 1 7 7 7 7 7 7 5 2 3 1 2 3 3 3 5 5 8 4 2 3 4 3 3 3 1 1 1 1 2 1 1 4 8 3 3 1 1 8 8 3664 3653 13 13 12 75 76 75 75 75 76 76 76 14 14 14 14 14 1861 1866 1867 1863 809 811 808 810 14 15 15 78 78 78 78 78 78 78 78 88 88 88 88 87 1366 1925 1928 1366 1361 1366 1826 712 1263 1828 1826 1829 1829 1830 1827 146 146 145 146 146 78 146 146 452 450 454 451 453 12 443 609 607 610 614 609 615 83 84 84 83 4 80 91 91 90 90 91 2 89 5500 5500 5500 5499 5497 5521 5105 1313 159 159 160 160 12 149 26 32 2554 2545 2536 2555 32 2571 918 1649 99 4367 4362 2 3 2 2 2 1 2 2 2 3 3552 29 5 3 7 7 7 5 5 5 2 2 2 2 3 3 3 1 1 1 3 3 3 2 2 3 4 4 4 4 4 4 3 2 4 3 3 3 3 3 3 2 1 3 62 62 62 62 62 62 62 62 62 62 62 62 62 62 939 486 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 // SPDX-License-Identifier: GPL-2.0-only /* * AppArmor security module * * This file contains AppArmor LSM hooks. * * Copyright (C) 1998-2008 Novell/SUSE * Copyright 2009-2010 Canonical Ltd. */ #include <linux/lsm_hooks.h> #include <linux/moduleparam.h> #include <linux/mm.h> #include <linux/mman.h> #include <linux/mount.h> #include <linux/namei.h> #include <linux/ptrace.h> #include <linux/ctype.h> #include <linux/sysctl.h> #include <linux/audit.h> #include <linux/user_namespace.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter_ipv6.h> #include <linux/zstd.h> #include <net/sock.h> #include <uapi/linux/mount.h> #include <uapi/linux/lsm.h> #include "include/af_unix.h" #include "include/apparmor.h" #include "include/apparmorfs.h" #include "include/audit.h" #include "include/capability.h" #include "include/cred.h" #include "include/crypto.h" #include "include/file.h" #include "include/ipc.h" #include "include/net.h" #include "include/path.h" #include "include/label.h" #include "include/policy.h" #include "include/policy_ns.h" #include "include/procattr.h" #include "include/mount.h" #include "include/secid.h" /* Flag indicating whether initialization completed */ int apparmor_initialized; union aa_buffer { struct list_head list; DECLARE_FLEX_ARRAY(char, buffer); }; struct aa_local_cache { unsigned int hold; unsigned int count; struct list_head head; }; #define RESERVE_COUNT 2 static int reserve_count = RESERVE_COUNT; static int buffer_count; static LIST_HEAD(aa_global_buffers); static DEFINE_SPINLOCK(aa_buffers_lock); static DEFINE_PER_CPU(struct aa_local_cache, aa_local_buffers); /* * LSM hook functions */ /* * put the associated labels */ static void apparmor_cred_free(struct cred *cred) { aa_put_label(cred_label(cred)); set_cred_label(cred, NULL); } /* * allocate the apparmor part of blank credentials */ static int apparmor_cred_alloc_blank(struct cred *cred, gfp_t gfp) { set_cred_label(cred, NULL); return 0; } /* * prepare new cred label for modification by prepare_cred block */ static int apparmor_cred_prepare(struct cred *new, const struct cred *old, gfp_t gfp) { set_cred_label(new, aa_get_newest_label(cred_label(old))); return 0; } /* * transfer the apparmor data to a blank set of creds */ static void apparmor_cred_transfer(struct cred *new, const struct cred *old) { set_cred_label(new, aa_get_newest_label(cred_label(old))); } static void apparmor_task_free(struct task_struct *task) { aa_free_task_ctx(task_ctx(task)); } static int apparmor_task_alloc(struct task_struct *task, u64 clone_flags) { struct aa_task_ctx *new = task_ctx(task); aa_dup_task_ctx(new, task_ctx(current)); return 0; } static int apparmor_ptrace_access_check(struct task_struct *child, unsigned int mode) { struct aa_label *tracer, *tracee; const struct cred *cred; int error; bool needput; cred = get_task_cred(child); tracee = cred_label(cred); /* ref count on cred */ tracer = __begin_current_label_crit_section(&needput); error = aa_may_ptrace(current_cred(), tracer, cred, tracee, (mode & PTRACE_MODE_READ) ? AA_PTRACE_READ : AA_PTRACE_TRACE); __end_current_label_crit_section(tracer, needput); put_cred(cred); return error; } static int apparmor_ptrace_traceme(struct task_struct *parent) { struct aa_label *tracer, *tracee; const struct cred *cred; int error; bool needput; tracee = __begin_current_label_crit_section(&needput); cred = get_task_cred(parent); tracer = cred_label(cred); /* ref count on cred */ error = aa_may_ptrace(cred, tracer, current_cred(), tracee, AA_PTRACE_TRACE); put_cred(cred); __end_current_label_crit_section(tracee, needput); return error; } /* Derived from security/commoncap.c:cap_capget */ static int apparmor_capget(const struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted) { struct aa_label *label; const struct cred *cred; rcu_read_lock(); cred = __task_cred(target); label = aa_get_newest_cred_label(cred); /* * cap_capget is stacked ahead of this and will * initialize effective and permitted. */ if (!unconfined(label)) { struct aa_profile *profile; struct label_it i; label_for_each_confined(i, label, profile) { kernel_cap_t allowed; allowed = aa_profile_capget(profile); *effective = cap_intersect(*effective, allowed); *permitted = cap_intersect(*permitted, allowed); } } rcu_read_unlock(); aa_put_label(label); return 0; } static int apparmor_capable(const struct cred *cred, struct user_namespace *ns, int cap, unsigned int opts) { struct aa_label *label; int error = 0; label = aa_get_newest_cred_label(cred); if (!unconfined(label)) error = aa_capable(cred, label, cap, opts); aa_put_label(label); return error; } /** * common_perm - basic common permission check wrapper fn for paths * @op: operation being checked * @path: path to check permission of (NOT NULL) * @mask: requested permissions mask * @cond: conditional info for the permission request (NOT NULL) * * Returns: %0 else error code if error or permission denied */ static int common_perm(const char *op, const struct path *path, u32 mask, struct path_cond *cond) { struct aa_label *label; int error = 0; bool needput; label = __begin_current_label_crit_section(&needput); if (!unconfined(label)) error = aa_path_perm(op, current_cred(), label, path, 0, mask, cond); __end_current_label_crit_section(label, needput); return error; } /** * common_perm_cond - common permission wrapper around inode cond * @op: operation being checked * @path: location to check (NOT NULL) * @mask: requested permissions mask * * Returns: %0 else error code if error or permission denied */ static int common_perm_cond(const char *op, const struct path *path, u32 mask) { vfsuid_t vfsuid = i_uid_into_vfsuid(mnt_idmap(path->mnt), d_backing_inode(path->dentry)); struct path_cond cond = { vfsuid_into_kuid(vfsuid), d_backing_inode(path->dentry)->i_mode }; if (!path_mediated_fs(path->dentry)) return 0; return common_perm(op, path, mask, &cond); } /** * common_perm_dir_dentry - common permission wrapper when path is dir, dentry * @op: operation being checked * @dir: directory of the dentry (NOT NULL) * @dentry: dentry to check (NOT NULL) * @mask: requested permissions mask * @cond: conditional info for the permission request (NOT NULL) * * Returns: %0 else error code if error or permission denied */ static int common_perm_dir_dentry(const char *op, const struct path *dir, struct dentry *dentry, u32 mask, struct path_cond *cond) { struct path path = { .mnt = dir->mnt, .dentry = dentry }; return common_perm(op, &path, mask, cond); } /** * common_perm_rm - common permission wrapper for operations doing rm * @op: operation being checked * @dir: directory that the dentry is in (NOT NULL) * @dentry: dentry being rm'd (NOT NULL) * @mask: requested permission mask * * Returns: %0 else error code if error or permission denied */ static int common_perm_rm(const char *op, const struct path *dir, struct dentry *dentry, u32 mask) { struct inode *inode = d_backing_inode(dentry); struct path_cond cond = { }; vfsuid_t vfsuid; if (!inode || !path_mediated_fs(dentry)) return 0; vfsuid = i_uid_into_vfsuid(mnt_idmap(dir->mnt), inode); cond.uid = vfsuid_into_kuid(vfsuid); cond.mode = inode->i_mode; return common_perm_dir_dentry(op, dir, dentry, mask, &cond); } /** * common_perm_create - common permission wrapper for operations doing create * @op: operation being checked * @dir: directory that dentry will be created in (NOT NULL) * @dentry: dentry to create (NOT NULL) * @mask: request permission mask * @mode: created file mode * * Returns: %0 else error code if error or permission denied */ static int common_perm_create(const char *op, const struct path *dir, struct dentry *dentry, u32 mask, umode_t mode) { struct path_cond cond = { current_fsuid(), mode }; if (!path_mediated_fs(dir->dentry)) return 0; return common_perm_dir_dentry(op, dir, dentry, mask, &cond); } static int apparmor_path_unlink(const struct path *dir, struct dentry *dentry) { return common_perm_rm(OP_UNLINK, dir, dentry, AA_MAY_DELETE); } static int apparmor_path_mkdir(const struct path *dir, struct dentry *dentry, umode_t mode) { return common_perm_create(OP_MKDIR, dir, dentry, AA_MAY_CREATE, S_IFDIR); } static int apparmor_path_rmdir(const struct path *dir, struct dentry *dentry) { return common_perm_rm(OP_RMDIR, dir, dentry, AA_MAY_DELETE); } static int apparmor_path_mknod(const struct path *dir, struct dentry *dentry, umode_t mode, unsigned int dev) { return common_perm_create(OP_MKNOD, dir, dentry, AA_MAY_CREATE, mode); } static int apparmor_path_truncate(const struct path *path) { return common_perm_cond(OP_TRUNC, path, MAY_WRITE | AA_MAY_SETATTR); } static int apparmor_file_truncate(struct file *file) { return apparmor_path_truncate(&file->f_path); } static int apparmor_path_symlink(const struct path *dir, struct dentry *dentry, const char *old_name) { return common_perm_create(OP_SYMLINK, dir, dentry, AA_MAY_CREATE, S_IFLNK); } static int apparmor_path_link(struct dentry *old_dentry, const struct path *new_dir, struct dentry *new_dentry) { struct aa_label *label; int error = 0; if (!path_mediated_fs(old_dentry)) return 0; label = begin_current_label_crit_section(); if (!unconfined(label)) error = aa_path_link(current_cred(), label, old_dentry, new_dir, new_dentry); end_current_label_crit_section(label); return error; } static int apparmor_path_rename(const struct path *old_dir, struct dentry *old_dentry, const struct path *new_dir, struct dentry *new_dentry, const unsigned int flags) { struct aa_label *label; int error = 0; if (!path_mediated_fs(old_dentry)) return 0; if ((flags & RENAME_EXCHANGE) && !path_mediated_fs(new_dentry)) return 0; label = begin_current_label_crit_section(); if (!unconfined(label)) { struct mnt_idmap *idmap = mnt_idmap(old_dir->mnt); vfsuid_t vfsuid; struct path old_path = { .mnt = old_dir->mnt, .dentry = old_dentry }; struct path new_path = { .mnt = new_dir->mnt, .dentry = new_dentry }; struct path_cond cond = { .mode = d_backing_inode(old_dentry)->i_mode }; vfsuid = i_uid_into_vfsuid(idmap, d_backing_inode(old_dentry)); cond.uid = vfsuid_into_kuid(vfsuid); if (flags & RENAME_EXCHANGE) { struct path_cond cond_exchange = { .mode = d_backing_inode(new_dentry)->i_mode, }; vfsuid = i_uid_into_vfsuid(idmap, d_backing_inode(old_dentry)); cond_exchange.uid = vfsuid_into_kuid(vfsuid); error = aa_path_perm(OP_RENAME_SRC, current_cred(), label, &new_path, 0, MAY_READ | AA_MAY_GETATTR | MAY_WRITE | AA_MAY_SETATTR | AA_MAY_DELETE, &cond_exchange); if (!error) error = aa_path_perm(OP_RENAME_DEST, current_cred(), label, &old_path, 0, MAY_WRITE | AA_MAY_SETATTR | AA_MAY_CREATE, &cond_exchange); } if (!error) error = aa_path_perm(OP_RENAME_SRC, current_cred(), label, &old_path, 0, MAY_READ | AA_MAY_GETATTR | MAY_WRITE | AA_MAY_SETATTR | AA_MAY_DELETE, &cond); if (!error) error = aa_path_perm(OP_RENAME_DEST, current_cred(), label, &new_path, 0, MAY_WRITE | AA_MAY_SETATTR | AA_MAY_CREATE, &cond); } end_current_label_crit_section(label); return error; } static int apparmor_path_chmod(const struct path *path, umode_t mode) { return common_perm_cond(OP_CHMOD, path, AA_MAY_CHMOD); } static int apparmor_path_chown(const struct path *path, kuid_t uid, kgid_t gid) { return common_perm_cond(OP_CHOWN, path, AA_MAY_CHOWN); } static int apparmor_inode_getattr(const struct path *path) { return common_perm_cond(OP_GETATTR, path, AA_MAY_GETATTR); } static int apparmor_file_open(struct file *file) { struct aa_file_ctx *fctx = file_ctx(file); struct aa_label *label; int error = 0; bool needput; if (!path_mediated_fs(file->f_path.dentry)) return 0; /* If in exec, permission is handled by bprm hooks. * Cache permissions granted by the previous exec check, with * implicit read and executable mmap which are required to * actually execute the image. * * Illogically, FMODE_EXEC is in f_flags, not f_mode. */ if (file->f_flags & __FMODE_EXEC) { fctx->allow = MAY_EXEC | MAY_READ | AA_EXEC_MMAP; return 0; } label = aa_get_newest_cred_label_condref(file->f_cred, &needput); if (!unconfined(label)) { struct mnt_idmap *idmap = file_mnt_idmap(file); struct inode *inode = file_inode(file); vfsuid_t vfsuid; struct path_cond cond = { .mode = inode->i_mode, }; vfsuid = i_uid_into_vfsuid(idmap, inode); cond.uid = vfsuid_into_kuid(vfsuid); error = aa_path_perm(OP_OPEN, file->f_cred, label, &file->f_path, 0, aa_map_file_to_perms(file), &cond); /* todo cache full allowed permissions set and state */ fctx->allow = aa_map_file_to_perms(file); } aa_put_label_condref(label, needput); return error; } static int apparmor_file_alloc_security(struct file *file) { struct aa_file_ctx *ctx = file_ctx(file); struct aa_label *label = begin_current_label_crit_section(); spin_lock_init(&ctx->lock); rcu_assign_pointer(ctx->label, aa_get_label(label)); end_current_label_crit_section(label); return 0; } static void apparmor_file_free_security(struct file *file) { struct aa_file_ctx *ctx = file_ctx(file); if (ctx) aa_put_label(rcu_access_pointer(ctx->label)); } static int common_file_perm(const char *op, struct file *file, u32 mask, bool in_atomic) { struct aa_label *label; int error = 0; bool needput; /* don't reaudit files closed during inheritance */ if (unlikely(file->f_path.dentry == aa_null.dentry)) return -EACCES; label = __begin_current_label_crit_section(&needput); error = aa_file_perm(op, current_cred(), label, file, mask, in_atomic); __end_current_label_crit_section(label, needput); return error; } static int apparmor_file_receive(struct file *file) { return common_file_perm(OP_FRECEIVE, file, aa_map_file_to_perms(file), false); } static int apparmor_file_permission(struct file *file, int mask) { return common_file_perm(OP_FPERM, file, mask, false); } static int apparmor_file_lock(struct file *file, unsigned int cmd) { u32 mask = AA_MAY_LOCK; if (cmd == F_WRLCK) mask |= MAY_WRITE; return common_file_perm(OP_FLOCK, file, mask, false); } static int common_mmap(const char *op, struct file *file, unsigned long prot, unsigned long flags, bool in_atomic) { int mask = 0; if (!file || !file_ctx(file)) return 0; if (prot & PROT_READ) mask |= MAY_READ; /* * Private mappings don't require write perms since they don't * write back to the files */ if ((prot & PROT_WRITE) && !(flags & MAP_PRIVATE)) mask |= MAY_WRITE; if (prot & PROT_EXEC) mask |= AA_EXEC_MMAP; return common_file_perm(op, file, mask, in_atomic); } static int apparmor_mmap_file(struct file *file, unsigned long reqprot, unsigned long prot, unsigned long flags) { return common_mmap(OP_FMMAP, file, prot, flags, GFP_ATOMIC); } static int apparmor_file_mprotect(struct vm_area_struct *vma, unsigned long reqprot, unsigned long prot) { return common_mmap(OP_FMPROT, vma->vm_file, prot, !(vma->vm_flags & VM_SHARED) ? MAP_PRIVATE : 0, false); } #ifdef CONFIG_IO_URING static const char *audit_uring_mask(u32 mask) { if (mask & AA_MAY_CREATE_SQPOLL) return "sqpoll"; if (mask & AA_MAY_OVERRIDE_CRED) return "override_creds"; return ""; } static void audit_uring_cb(struct audit_buffer *ab, void *va) { struct apparmor_audit_data *ad = aad_of_va(va); if (ad->request & AA_URING_PERM_MASK) { audit_log_format(ab, " requested=\"%s\"", audit_uring_mask(ad->request)); if (ad->denied & AA_URING_PERM_MASK) { audit_log_format(ab, " denied=\"%s\"", audit_uring_mask(ad->denied)); } } if (ad->uring.target) { audit_log_format(ab, " tcontext="); aa_label_xaudit(ab, labels_ns(ad->subj_label), ad->uring.target, FLAGS_NONE, GFP_ATOMIC); } } static int profile_uring(struct aa_profile *profile, u32 request, struct aa_label *new, int cap, struct apparmor_audit_data *ad) { unsigned int state; struct aa_ruleset *rules; int error = 0; AA_BUG(!profile); rules = profile->label.rules[0]; state = RULE_MEDIATES(rules, AA_CLASS_IO_URING); if (state) { struct aa_perms perms = { }; if (new) { aa_label_match(profile, rules, new, state, false, request, &perms); } else { perms = *aa_lookup_perms(rules->policy, state); } aa_apply_modes_to_perms(profile, &perms); error = aa_check_perms(profile, &perms, request, ad, audit_uring_cb); } return error; } /** * apparmor_uring_override_creds - check the requested cred override * @new: the target creds * * Check to see if the current task is allowed to override it's credentials * to service an io_uring operation. */ static int apparmor_uring_override_creds(const struct cred *new) { struct aa_profile *profile; struct aa_label *label; int error; bool needput; DEFINE_AUDIT_DATA(ad, LSM_AUDIT_DATA_NONE, AA_CLASS_IO_URING, OP_URING_OVERRIDE); ad.uring.target = cred_label(new); label = __begin_current_label_crit_section(&needput); error = fn_for_each(label, profile, profile_uring(profile, AA_MAY_OVERRIDE_CRED, cred_label(new), CAP_SYS_ADMIN, &ad)); __end_current_label_crit_section(label, needput); return error; } /** * apparmor_uring_sqpoll - check if a io_uring polling thread can be created * * Check to see if the current task is allowed to create a new io_uring * kernel polling thread. */ static int apparmor_uring_sqpoll(void) { struct aa_profile *profile; struct aa_label *label; int error; bool needput; DEFINE_AUDIT_DATA(ad, LSM_AUDIT_DATA_NONE, AA_CLASS_IO_URING, OP_URING_SQPOLL); label = __begin_current_label_crit_section(&needput); error = fn_for_each(label, profile, profile_uring(profile, AA_MAY_CREATE_SQPOLL, NULL, CAP_SYS_ADMIN, &ad)); __end_current_label_crit_section(label, needput); return error; } #endif /* CONFIG_IO_URING */ static int apparmor_sb_mount(const char *dev_name, const struct path *path, const char *type, unsigned long flags, void *data) { struct aa_label *label; int error = 0; bool needput; /* Discard magic */ if ((flags & MS_MGC_MSK) == MS_MGC_VAL) flags &= ~MS_MGC_MSK; flags &= ~AA_MS_IGNORE_MASK; label = __begin_current_label_crit_section(&needput); if (!unconfined(label)) { if (flags & MS_REMOUNT) error = aa_remount(current_cred(), label, path, flags, data); else if (flags & MS_BIND) error = aa_bind_mount(current_cred(), label, path, dev_name, flags); else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) error = aa_mount_change_type(current_cred(), label, path, flags); else if (flags & MS_MOVE) error = aa_move_mount_old(current_cred(), label, path, dev_name); else error = aa_new_mount(current_cred(), label, dev_name, path, type, flags, data); } __end_current_label_crit_section(label, needput); return error; } static int apparmor_move_mount(const struct path *from_path, const struct path *to_path) { struct aa_label *label; int error = 0; bool needput; label = __begin_current_label_crit_section(&needput); if (!unconfined(label)) error = aa_move_mount(current_cred(), label, from_path, to_path); __end_current_label_crit_section(label, needput); return error; } static int apparmor_sb_umount(struct vfsmount *mnt, int flags) { struct aa_label *label; int error = 0; bool needput; label = __begin_current_label_crit_section(&needput); if (!unconfined(label)) error = aa_umount(current_cred(), label, mnt, flags); __end_current_label_crit_section(label, needput); return error; } static int apparmor_sb_pivotroot(const struct path *old_path, const struct path *new_path) { struct aa_label *label; int error = 0; label = aa_get_current_label(); if (!unconfined(label)) error = aa_pivotroot(current_cred(), label, old_path, new_path); aa_put_label(label); return error; } static int apparmor_getselfattr(unsigned int attr, struct lsm_ctx __user *lx, u32 *size, u32 flags) { int error = -ENOENT; struct aa_task_ctx *ctx = task_ctx(current); struct aa_label *label = NULL; char *value = NULL; switch (attr) { case LSM_ATTR_CURRENT: label = aa_get_newest_label(cred_label(current_cred())); break; case LSM_ATTR_PREV: if (ctx->previous) label = aa_get_newest_label(ctx->previous); break; case LSM_ATTR_EXEC: if (ctx->onexec) label = aa_get_newest_label(ctx->onexec); break; default: error = -EOPNOTSUPP; break; } if (label) { error = aa_getprocattr(label, &value, false); if (error > 0) error = lsm_fill_user_ctx(lx, size, value, error, LSM_ID_APPARMOR, 0); kfree(value); } aa_put_label(label); if (error < 0) return error; return 1; } static int apparmor_getprocattr(struct task_struct *task, const char *name, char **value) { int error = -ENOENT; /* released below */ const struct cred *cred = get_task_cred(task); struct aa_task_ctx *ctx = task_ctx(current); struct aa_label *label = NULL; if (strcmp(name, "current") == 0) label = aa_get_newest_label(cred_label(cred)); else if (strcmp(name, "prev") == 0 && ctx->previous) label = aa_get_newest_label(ctx->previous); else if (strcmp(name, "exec") == 0 && ctx->onexec) label = aa_get_newest_label(ctx->onexec); else error = -EINVAL; if (label) error = aa_getprocattr(label, value, true); aa_put_label(label); put_cred(cred); return error; } static int do_setattr(u64 attr, void *value, size_t size) { char *command, *largs = NULL, *args = value; size_t arg_size; int error; DEFINE_AUDIT_DATA(ad, LSM_AUDIT_DATA_NONE, AA_CLASS_NONE, OP_SETPROCATTR); if (size == 0) return -EINVAL; /* AppArmor requires that the buffer must be null terminated atm */ if (args[size - 1] != '\0') { /* null terminate */ largs = args = kmalloc(size + 1, GFP_KERNEL); if (!args) return -ENOMEM; memcpy(args, value, size); args[size] = '\0'; } error = -EINVAL; args = strim(args); command = strsep(&args, " "); if (!args) goto out; args = skip_spaces(args); if (!*args) goto out; arg_size = size - (args - (largs ? largs : (char *) value)); if (attr == LSM_ATTR_CURRENT) { if (strcmp(command, "changehat") == 0) { error = aa_setprocattr_changehat(args, arg_size, AA_CHANGE_NOFLAGS); } else if (strcmp(command, "permhat") == 0) { error = aa_setprocattr_changehat(args, arg_size, AA_CHANGE_TEST); } else if (strcmp(command, "changeprofile") == 0) { error = aa_change_profile(args, AA_CHANGE_NOFLAGS); } else if (strcmp(command, "permprofile") == 0) { error = aa_change_profile(args, AA_CHANGE_TEST); } else if (strcmp(command, "stack") == 0) { error = aa_change_profile(args, AA_CHANGE_STACK); } else goto fail; } else if (attr == LSM_ATTR_EXEC) { if (strcmp(command, "exec") == 0) error = aa_change_profile(args, AA_CHANGE_ONEXEC); else if (strcmp(command, "stack") == 0) error = aa_change_profile(args, (AA_CHANGE_ONEXEC | AA_CHANGE_STACK)); else goto fail; } else /* only support the "current" and "exec" process attributes */ goto fail; if (!error) error = size; out: kfree(largs); return error; fail: ad.subj_label = begin_current_label_crit_section(); if (attr == LSM_ATTR_CURRENT) ad.info = "current"; else if (attr == LSM_ATTR_EXEC) ad.info = "exec"; else ad.info = "invalid"; ad.error = error = -EINVAL; aa_audit_msg(AUDIT_APPARMOR_DENIED, &ad, NULL); end_current_label_crit_section(ad.subj_label); goto out; } static int apparmor_setselfattr(unsigned int attr, struct lsm_ctx *ctx, u32 size, u32 flags) { int rc; if (attr != LSM_ATTR_CURRENT && attr != LSM_ATTR_EXEC) return -EOPNOTSUPP; rc = do_setattr(attr, ctx->ctx, ctx->ctx_len); if (rc > 0) return 0; return rc; } static int apparmor_setprocattr(const char *name, void *value, size_t size) { int attr = lsm_name_to_attr(name); if (attr) return do_setattr(attr, value, size); return -EINVAL; } /** * apparmor_bprm_committing_creds - do task cleanup on committing new creds * @bprm: binprm for the exec (NOT NULL) */ static void apparmor_bprm_committing_creds(const struct linux_binprm *bprm) { struct aa_label *label = aa_current_raw_label(); struct aa_label *new_label = cred_label(bprm->cred); /* bail out if unconfined or not changing profile */ if ((new_label->proxy == label->proxy) || (unconfined(new_label))) return; aa_inherit_files(bprm->cred, current->files); current->pdeath_signal = 0; /* reset soft limits and set hard limits for the new label */ __aa_transition_rlimits(label, new_label); } /** * apparmor_bprm_committed_creds() - do cleanup after new creds committed * @bprm: binprm for the exec (NOT NULL) */ static void apparmor_bprm_committed_creds(const struct linux_binprm *bprm) { /* clear out temporary/transitional state from the context */ aa_clear_task_ctx_trans(task_ctx(current)); return; } static void apparmor_current_getlsmprop_subj(struct lsm_prop *prop) { struct aa_label *label; bool needput; label = __begin_current_label_crit_section(&needput); prop->apparmor.label = label; __end_current_label_crit_section(label, needput); } static void apparmor_task_getlsmprop_obj(struct task_struct *p, struct lsm_prop *prop) { struct aa_label *label = aa_get_task_label(p); prop->apparmor.label = label; aa_put_label(label); } static int apparmor_task_setrlimit(struct task_struct *task, unsigned int resource, struct rlimit *new_rlim) { struct aa_label *label; int error = 0; bool needput; label = __begin_current_label_crit_section(&needput); if (!unconfined(label)) error = aa_task_setrlimit(current_cred(), label, task, resource, new_rlim); __end_current_label_crit_section(label, needput); return error; } static int apparmor_task_kill(struct task_struct *target, struct kernel_siginfo *info, int sig, const struct cred *cred) { const struct cred *tc; struct aa_label *cl, *tl; int error; bool needput; tc = get_task_cred(target); tl = aa_get_newest_cred_label(tc); if (cred) { /* * Dealing with USB IO specific behavior */ cl = aa_get_newest_cred_label(cred); error = aa_may_signal(cred, cl, tc, tl, sig); aa_put_label(cl); } else { cl = __begin_current_label_crit_section(&needput); error = aa_may_signal(current_cred(), cl, tc, tl, sig); __end_current_label_crit_section(cl, needput); } aa_put_label(tl); put_cred(tc); return error; } static int apparmor_userns_create(const struct cred *cred) { struct aa_label *label; struct aa_profile *profile; int error = 0; DEFINE_AUDIT_DATA(ad, LSM_AUDIT_DATA_TASK, AA_CLASS_NS, OP_USERNS_CREATE); ad.subj_cred = current_cred(); label = begin_current_label_crit_section(); if (!unconfined(label)) { error = fn_for_each(label, profile, aa_profile_ns_perm(profile, &ad, AA_USERNS_CREATE)); } end_current_label_crit_section(label); return error; } static int apparmor_sk_alloc_security(struct sock *sk, int family, gfp_t gfp) { struct aa_sk_ctx *ctx = aa_sock(sk); struct aa_label *label; bool needput; label = __begin_current_label_crit_section(&needput); //spin_lock_init(&ctx->lock); rcu_assign_pointer(ctx->label, aa_get_label(label)); rcu_assign_pointer(ctx->peer, NULL); rcu_assign_pointer(ctx->peer_lastupdate, NULL); __end_current_label_crit_section(label, needput); return 0; } static void apparmor_sk_free_security(struct sock *sk) { struct aa_sk_ctx *ctx = aa_sock(sk); /* dead these won't be updated any more */ aa_put_label(rcu_dereference_protected(ctx->label, true)); aa_put_label(rcu_dereference_protected(ctx->peer, true)); aa_put_label(rcu_dereference_protected(ctx->peer_lastupdate, true)); } /** * apparmor_sk_clone_security - clone the sk_security field * @sk: sock to have security cloned * @newsk: sock getting clone */ static void apparmor_sk_clone_security(const struct sock *sk, struct sock *newsk) { struct aa_sk_ctx *ctx = aa_sock(sk); struct aa_sk_ctx *new = aa_sock(newsk); /* not actually in use yet */ if (rcu_access_pointer(ctx->label) != rcu_access_pointer(new->label)) { aa_put_label(rcu_dereference_protected(new->label, true)); rcu_assign_pointer(new->label, aa_get_label_rcu(&ctx->label)); } if (rcu_access_pointer(ctx->peer) != rcu_access_pointer(new->peer)) { aa_put_label(rcu_dereference_protected(new->peer, true)); rcu_assign_pointer(new->peer, aa_get_label_rcu(&ctx->peer)); } if (rcu_access_pointer(ctx->peer_lastupdate) != rcu_access_pointer(new->peer_lastupdate)) { aa_put_label(rcu_dereference_protected(new->peer_lastupdate, true)); rcu_assign_pointer(new->peer_lastupdate, aa_get_label_rcu(&ctx->peer_lastupdate)); } } static int unix_connect_perm(const struct cred *cred, struct aa_label *label, struct sock *sk, struct sock *peer_sk) { struct aa_sk_ctx *peer_ctx = aa_sock(peer_sk); int error; error = aa_unix_peer_perm(cred, label, OP_CONNECT, (AA_MAY_CONNECT | AA_MAY_SEND | AA_MAY_RECEIVE), sk, peer_sk, rcu_dereference_protected(peer_ctx->label, lockdep_is_held(&unix_sk(peer_sk)->lock))); if (!is_unix_fs(peer_sk)) { last_error(error, aa_unix_peer_perm(cred, rcu_dereference_protected(peer_ctx->label, lockdep_is_held(&unix_sk(peer_sk)->lock)), OP_CONNECT, (AA_MAY_ACCEPT | AA_MAY_SEND | AA_MAY_RECEIVE), peer_sk, sk, label)); } return error; } /* lockdep check in unix_connect_perm - push sks here to check */ static void unix_connect_peers(struct aa_sk_ctx *sk_ctx, struct aa_sk_ctx *peer_ctx) { /* Cross reference the peer labels for SO_PEERSEC */ struct aa_label *label = rcu_dereference_protected(sk_ctx->label, true); aa_get_label(label); aa_put_label(rcu_dereference_protected(peer_ctx->peer, true)); rcu_assign_pointer(peer_ctx->peer, label); /* transfer cnt */ label = aa_get_label(rcu_dereference_protected(peer_ctx->label, true)); //spin_unlock(&peer_ctx->lock); //spin_lock(&sk_ctx->lock); aa_put_label(rcu_dereference_protected(sk_ctx->peer, true)); aa_put_label(rcu_dereference_protected(sk_ctx->peer_lastupdate, true)); rcu_assign_pointer(sk_ctx->peer, aa_get_label(label)); rcu_assign_pointer(sk_ctx->peer_lastupdate, label); /* transfer cnt */ //spin_unlock(&sk_ctx->lock); } /** * apparmor_unix_stream_connect - check perms before making unix domain conn * @sk: sk attempting to connect * @peer_sk: sk that is accepting the connection * @newsk: new sk created for this connection * peer is locked when this hook is called * * Return: * 0 if connection is permitted * error code on denial or failure */ static int apparmor_unix_stream_connect(struct sock *sk, struct sock *peer_sk, struct sock *newsk) { struct aa_sk_ctx *sk_ctx = aa_sock(sk); struct aa_sk_ctx *peer_ctx = aa_sock(peer_sk); struct aa_sk_ctx *new_ctx = aa_sock(newsk); struct aa_label *label; int error; bool needput; label = __begin_current_label_crit_section(&needput); error = unix_connect_perm(current_cred(), label, sk, peer_sk); __end_current_label_crit_section(label, needput); if (error) return error; /* newsk doesn't go through post_create, but does go through * security_sk_alloc() */ rcu_assign_pointer(new_ctx->label, aa_get_label(rcu_dereference_protected(peer_ctx->label, true))); /* Cross reference the peer labels for SO_PEERSEC */ unix_connect_peers(sk_ctx, new_ctx); return 0; } /** * apparmor_unix_may_send - check perms before conn or sending unix dgrams * @sock: socket sending the message * @peer: socket message is being send to * * Performs bidirectional permission checks for Unix domain socket communication: * 1. Verifies sender has AA_MAY_SEND to target socket * 2. Verifies receiver has AA_MAY_RECEIVE from source socket * * sock and peer are locked when this hook is called * called by: dgram_connect peer setup but path not copied to newsk * * Return: * 0 if transmission is permitted * error code on denial or failure */ static int apparmor_unix_may_send(struct socket *sock, struct socket *peer) { struct aa_sk_ctx *peer_ctx = aa_sock(peer->sk); struct aa_label *label; int error; bool needput; label = __begin_current_label_crit_section(&needput); error = xcheck(aa_unix_peer_perm(current_cred(), label, OP_SENDMSG, AA_MAY_SEND, sock->sk, peer->sk, rcu_dereference_protected(peer_ctx->label, true)), aa_unix_peer_perm(peer->file ? peer->file->f_cred : NULL, rcu_dereference_protected(peer_ctx->label, true), OP_SENDMSG, AA_MAY_RECEIVE, peer->sk, sock->sk, label)); __end_current_label_crit_section(label, needput); return error; } static int apparmor_socket_create(int family, int type, int protocol, int kern) { struct aa_label *label; int error = 0; AA_BUG(in_interrupt()); if (kern) return 0; label = begin_current_label_crit_section(); if (!unconfined(label)) { if (family == PF_UNIX) error = aa_unix_create_perm(label, family, type, protocol); else error = aa_af_perm(current_cred(), label, OP_CREATE, AA_MAY_CREATE, family, type, protocol); } end_current_label_crit_section(label); return error; } /** * apparmor_socket_post_create - setup the per-socket security struct * @sock: socket that is being setup * @family: family of socket being created * @type: type of the socket * @protocol: protocol of the socket * @kern: socket is a special kernel socket * * Note: * - kernel sockets labeled kernel_t used to use unconfined * - socket may not have sk here if created with sock_create_lite or * sock_alloc. These should be accept cases which will be handled in * sock_graft. */ static int apparmor_socket_post_create(struct socket *sock, int family, int type, int protocol, int kern) { struct aa_label *label; if (kern) { label = aa_get_label(kernel_t); } else label = aa_get_current_label(); if (sock->sk) { struct aa_sk_ctx *ctx = aa_sock(sock->sk); /* still not live */ aa_put_label(rcu_dereference_protected(ctx->label, true)); rcu_assign_pointer(ctx->label, aa_get_label(label)); } aa_put_label(label); return 0; } static int apparmor_socket_socketpair(struct socket *socka, struct socket *sockb) { struct aa_sk_ctx *a_ctx = aa_sock(socka->sk); struct aa_sk_ctx *b_ctx = aa_sock(sockb->sk); struct aa_label *label; /* socks not live yet - initial values set in sk_alloc */ label = begin_current_label_crit_section(); if (rcu_access_pointer(a_ctx->label) != label) { AA_BUG("a_ctx != label"); aa_put_label(rcu_dereference_protected(a_ctx->label, true)); rcu_assign_pointer(a_ctx->label, aa_get_label(label)); } if (rcu_access_pointer(b_ctx->label) != label) { AA_BUG("b_ctx != label"); aa_put_label(rcu_dereference_protected(b_ctx->label, true)); rcu_assign_pointer(b_ctx->label, aa_get_label(label)); } if (socka->sk->sk_family == PF_UNIX) { /* unix socket pairs by-pass unix_stream_connect */ unix_connect_peers(a_ctx, b_ctx); } end_current_label_crit_section(label); return 0; } /** * apparmor_socket_bind - check perms before bind addr to socket * @sock: socket to bind the address to (must be non-NULL) * @address: address that is being bound (must be non-NULL) * @addrlen: length of @address * * Performs security checks before allowing a socket to bind to an address. * Handles Unix domain sockets specially through aa_unix_bind_perm(). * For other socket families, uses generic permission check via aa_sk_perm(). * * Return: * 0 if binding is permitted * error code on denial or invalid parameters */ static int apparmor_socket_bind(struct socket *sock, struct sockaddr *address, int addrlen) { AA_BUG(!sock); AA_BUG(!sock->sk); AA_BUG(!address); AA_BUG(in_interrupt()); if (sock->sk->sk_family == PF_UNIX) return aa_unix_bind_perm(sock, address, addrlen); return aa_sk_perm(OP_BIND, AA_MAY_BIND, sock->sk); } static int apparmor_socket_connect(struct socket *sock, struct sockaddr *address, int addrlen) { AA_BUG(!sock); AA_BUG(!sock->sk); AA_BUG(!address); AA_BUG(in_interrupt()); /* PF_UNIX goes through unix_stream_connect && unix_may_send */ if (sock->sk->sk_family == PF_UNIX) return 0; return aa_sk_perm(OP_CONNECT, AA_MAY_CONNECT, sock->sk); } static int apparmor_socket_listen(struct socket *sock, int backlog) { AA_BUG(!sock); AA_BUG(!sock->sk); AA_BUG(in_interrupt()); if (sock->sk->sk_family == PF_UNIX) return aa_unix_listen_perm(sock, backlog); return aa_sk_perm(OP_LISTEN, AA_MAY_LISTEN, sock->sk); } /* * Note: while @newsock is created and has some information, the accept * has not been done. */ static int apparmor_socket_accept(struct socket *sock, struct socket *newsock) { AA_BUG(!sock); AA_BUG(!sock->sk); AA_BUG(!newsock); AA_BUG(in_interrupt()); if (sock->sk->sk_family == PF_UNIX) return aa_unix_accept_perm(sock, newsock); return aa_sk_perm(OP_ACCEPT, AA_MAY_ACCEPT, sock->sk); } static int aa_sock_msg_perm(const char *op, u32 request, struct socket *sock, struct msghdr *msg, int size) { AA_BUG(!sock); AA_BUG(!sock->sk); AA_BUG(!msg); AA_BUG(in_interrupt()); /* PF_UNIX goes through unix_may_send */ if (sock->sk->sk_family == PF_UNIX) return 0; return aa_sk_perm(op, request, sock->sk); } static int apparmor_socket_sendmsg(struct socket *sock, struct msghdr *msg, int size) { return aa_sock_msg_perm(OP_SENDMSG, AA_MAY_SEND, sock, msg, size); } static int apparmor_socket_recvmsg(struct socket *sock, struct msghdr *msg, int size, int flags) { return aa_sock_msg_perm(OP_RECVMSG, AA_MAY_RECEIVE, sock, msg, size); } /* revaliation, get/set attr, shutdown */ static int aa_sock_perm(const char *op, u32 request, struct socket *sock) { AA_BUG(!sock); AA_BUG(!sock->sk); AA_BUG(in_interrupt()); if (sock->sk->sk_family == PF_UNIX) return aa_unix_sock_perm(op, request, sock); return aa_sk_perm(op, request, sock->sk); } static int apparmor_socket_getsockname(struct socket *sock) { return aa_sock_perm(OP_GETSOCKNAME, AA_MAY_GETATTR, sock); } static int apparmor_socket_getpeername(struct socket *sock) { return aa_sock_perm(OP_GETPEERNAME, AA_MAY_GETATTR, sock); } /* revaliation, get/set attr, opt */ static int aa_sock_opt_perm(const char *op, u32 request, struct socket *sock, int level, int optname) { AA_BUG(!sock); AA_BUG(!sock->sk); AA_BUG(in_interrupt()); if (sock->sk->sk_family == PF_UNIX) return aa_unix_opt_perm(op, request, sock, level, optname); return aa_sk_perm(op, request, sock->sk); } static int apparmor_socket_getsockopt(struct socket *sock, int level, int optname) { return aa_sock_opt_perm(OP_GETSOCKOPT, AA_MAY_GETOPT, sock, level, optname); } static int apparmor_socket_setsockopt(struct socket *sock, int level, int optname) { return aa_sock_opt_perm(OP_SETSOCKOPT, AA_MAY_SETOPT, sock, level, optname); } static int apparmor_socket_shutdown(struct socket *sock, int how) { return aa_sock_perm(OP_SHUTDOWN, AA_MAY_SHUTDOWN, sock); } #ifdef CONFIG_NETWORK_SECMARK /** * apparmor_socket_sock_rcv_skb - check perms before associating skb to sk * @sk: sk to associate @skb with * @skb: skb to check for perms * * Note: can not sleep may be called with locks held * * dont want protocol specific in __skb_recv_datagram() * to deny an incoming connection socket_sock_rcv_skb() */ static int apparmor_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb) { struct aa_sk_ctx *ctx = aa_sock(sk); int error; if (!skb->secmark) return 0; /* * If reach here before socket_post_create hook is called, in which * case label is null, drop the packet. */ if (!rcu_access_pointer(ctx->label)) return -EACCES; rcu_read_lock(); error = apparmor_secmark_check(rcu_dereference(ctx->label), OP_RECVMSG, AA_MAY_RECEIVE, skb->secmark, sk); rcu_read_unlock(); return error; } #endif static struct aa_label *sk_peer_get_label(struct sock *sk) { struct aa_sk_ctx *ctx = aa_sock(sk); struct aa_label *label = ERR_PTR(-ENOPROTOOPT); if (rcu_access_pointer(ctx->peer)) return aa_get_label_rcu(&ctx->peer); if (sk->sk_family != PF_UNIX) return ERR_PTR(-ENOPROTOOPT); return label; } /** * apparmor_socket_getpeersec_stream - get security context of peer * @sock: socket that we are trying to get the peer context of * @optval: output - buffer to copy peer name to * @optlen: output - size of copied name in @optval * @len: size of @optval buffer * Returns: 0 on success, -errno of failure * * Note: for tcp only valid if using ipsec or cipso on lan */ static int apparmor_socket_getpeersec_stream(struct socket *sock, sockptr_t optval, sockptr_t optlen, unsigned int len) { char *name = NULL; int slen, error = 0; struct aa_label *label; struct aa_label *peer; peer = sk_peer_get_label(sock->sk); if (IS_ERR(peer)) { error = PTR_ERR(peer); goto done; } label = begin_current_label_crit_section(); slen = aa_label_asxprint(&name, labels_ns(label), peer, FLAG_SHOW_MODE | FLAG_VIEW_SUBNS | FLAG_HIDDEN_UNCONFINED, GFP_KERNEL); /* don't include terminating \0 in slen, it breaks some apps */ if (slen < 0) { error = -ENOMEM; goto done_put; } if (slen > len) { error = -ERANGE; goto done_len; } if (copy_to_sockptr(optval, name, slen)) error = -EFAULT; done_len: if (copy_to_sockptr(optlen, &slen, sizeof(slen))) error = -EFAULT; done_put: end_current_label_crit_section(label); aa_put_label(peer); done: kfree(name); return error; } /** * apparmor_socket_getpeersec_dgram - get security label of packet * @sock: the peer socket * @skb: packet data * @secid: pointer to where to put the secid of the packet * * Sets the netlabel socket state on sk from parent */ static int apparmor_socket_getpeersec_dgram(struct socket *sock, struct sk_buff *skb, u32 *secid) { /* TODO: requires secid support */ return -ENOPROTOOPT; } /** * apparmor_sock_graft - Initialize newly created socket * @sk: child sock * @parent: parent socket * * Note: could set off of SOCK_CTX(parent) but need to track inode and we can * just set sk security information off of current creating process label * Labeling of sk for accept case - probably should be sock based * instead of task, because of the case where an implicitly labeled * socket is shared by different tasks. */ static void apparmor_sock_graft(struct sock *sk, struct socket *parent) { struct aa_sk_ctx *ctx = aa_sock(sk); /* setup - not live */ if (!rcu_access_pointer(ctx->label)) rcu_assign_pointer(ctx->label, aa_get_current_label()); } #ifdef CONFIG_NETWORK_SECMARK static int apparmor_inet_conn_request(const struct sock *sk, struct sk_buff *skb, struct request_sock *req) { struct aa_sk_ctx *ctx = aa_sock(sk); int error; if (!skb->secmark) return 0; rcu_read_lock(); error = apparmor_secmark_check(rcu_dereference(ctx->label), OP_CONNECT, AA_MAY_CONNECT, skb->secmark, sk); rcu_read_unlock(); return error; } #endif /* * The cred blob is a pointer to, not an instance of, an aa_label. */ struct lsm_blob_sizes apparmor_blob_sizes __ro_after_init = { .lbs_cred = sizeof(struct aa_label *), .lbs_file = sizeof(struct aa_file_ctx), .lbs_task = sizeof(struct aa_task_ctx), .lbs_sock = sizeof(struct aa_sk_ctx), }; static const struct lsm_id apparmor_lsmid = { .name = "apparmor", .id = LSM_ID_APPARMOR, }; static struct security_hook_list apparmor_hooks[] __ro_after_init = { LSM_HOOK_INIT(ptrace_access_check, apparmor_ptrace_access_check), LSM_HOOK_INIT(ptrace_traceme, apparmor_ptrace_traceme), LSM_HOOK_INIT(capget, apparmor_capget), LSM_HOOK_INIT(capable, apparmor_capable), LSM_HOOK_INIT(move_mount, apparmor_move_mount), LSM_HOOK_INIT(sb_mount, apparmor_sb_mount), LSM_HOOK_INIT(sb_umount, apparmor_sb_umount), LSM_HOOK_INIT(sb_pivotroot, apparmor_sb_pivotroot), LSM_HOOK_INIT(path_link, apparmor_path_link), LSM_HOOK_INIT(path_unlink, apparmor_path_unlink), LSM_HOOK_INIT(path_symlink, apparmor_path_symlink), LSM_HOOK_INIT(path_mkdir, apparmor_path_mkdir), LSM_HOOK_INIT(path_rmdir, apparmor_path_rmdir), LSM_HOOK_INIT(path_mknod, apparmor_path_mknod), LSM_HOOK_INIT(path_rename, apparmor_path_rename), LSM_HOOK_INIT(path_chmod, apparmor_path_chmod), LSM_HOOK_INIT(path_chown, apparmor_path_chown), LSM_HOOK_INIT(path_truncate, apparmor_path_truncate), LSM_HOOK_INIT(inode_getattr, apparmor_inode_getattr), LSM_HOOK_INIT(file_open, apparmor_file_open), LSM_HOOK_INIT(file_receive, apparmor_file_receive), LSM_HOOK_INIT(file_permission, apparmor_file_permission), LSM_HOOK_INIT(file_alloc_security, apparmor_file_alloc_security), LSM_HOOK_INIT(file_free_security, apparmor_file_free_security), LSM_HOOK_INIT(mmap_file, apparmor_mmap_file), LSM_HOOK_INIT(file_mprotect, apparmor_file_mprotect), LSM_HOOK_INIT(file_lock, apparmor_file_lock), LSM_HOOK_INIT(file_truncate, apparmor_file_truncate), LSM_HOOK_INIT(getselfattr, apparmor_getselfattr), LSM_HOOK_INIT(setselfattr, apparmor_setselfattr), LSM_HOOK_INIT(getprocattr, apparmor_getprocattr), LSM_HOOK_INIT(setprocattr, apparmor_setprocattr), LSM_HOOK_INIT(sk_alloc_security, apparmor_sk_alloc_security), LSM_HOOK_INIT(sk_free_security, apparmor_sk_free_security), LSM_HOOK_INIT(sk_clone_security, apparmor_sk_clone_security), LSM_HOOK_INIT(unix_stream_connect, apparmor_unix_stream_connect), LSM_HOOK_INIT(unix_may_send, apparmor_unix_may_send), LSM_HOOK_INIT(socket_create, apparmor_socket_create), LSM_HOOK_INIT(socket_post_create, apparmor_socket_post_create), LSM_HOOK_INIT(socket_socketpair, apparmor_socket_socketpair), LSM_HOOK_INIT(socket_bind, apparmor_socket_bind), LSM_HOOK_INIT(socket_connect, apparmor_socket_connect), LSM_HOOK_INIT(socket_listen, apparmor_socket_listen), LSM_HOOK_INIT(socket_accept, apparmor_socket_accept), LSM_HOOK_INIT(socket_sendmsg, apparmor_socket_sendmsg), LSM_HOOK_INIT(socket_recvmsg, apparmor_socket_recvmsg), LSM_HOOK_INIT(socket_getsockname, apparmor_socket_getsockname), LSM_HOOK_INIT(socket_getpeername, apparmor_socket_getpeername), LSM_HOOK_INIT(socket_getsockopt, apparmor_socket_getsockopt), LSM_HOOK_INIT(socket_setsockopt, apparmor_socket_setsockopt), LSM_HOOK_INIT(socket_shutdown, apparmor_socket_shutdown), #ifdef CONFIG_NETWORK_SECMARK LSM_HOOK_INIT(socket_sock_rcv_skb, apparmor_socket_sock_rcv_skb), #endif LSM_HOOK_INIT(socket_getpeersec_stream, apparmor_socket_getpeersec_stream), LSM_HOOK_INIT(socket_getpeersec_dgram, apparmor_socket_getpeersec_dgram), LSM_HOOK_INIT(sock_graft, apparmor_sock_graft), #ifdef CONFIG_NETWORK_SECMARK LSM_HOOK_INIT(inet_conn_request, apparmor_inet_conn_request), #endif LSM_HOOK_INIT(cred_alloc_blank, apparmor_cred_alloc_blank), LSM_HOOK_INIT(cred_free, apparmor_cred_free), LSM_HOOK_INIT(cred_prepare, apparmor_cred_prepare), LSM_HOOK_INIT(cred_transfer, apparmor_cred_transfer), LSM_HOOK_INIT(bprm_creds_for_exec, apparmor_bprm_creds_for_exec), LSM_HOOK_INIT(bprm_committing_creds, apparmor_bprm_committing_creds), LSM_HOOK_INIT(bprm_committed_creds, apparmor_bprm_committed_creds), LSM_HOOK_INIT(task_free, apparmor_task_free), LSM_HOOK_INIT(task_alloc, apparmor_task_alloc), LSM_HOOK_INIT(current_getlsmprop_subj, apparmor_current_getlsmprop_subj), LSM_HOOK_INIT(task_getlsmprop_obj, apparmor_task_getlsmprop_obj), LSM_HOOK_INIT(task_setrlimit, apparmor_task_setrlimit), LSM_HOOK_INIT(task_kill, apparmor_task_kill), LSM_HOOK_INIT(userns_create, apparmor_userns_create), #ifdef CONFIG_AUDIT LSM_HOOK_INIT(audit_rule_init, aa_audit_rule_init), LSM_HOOK_INIT(audit_rule_known, aa_audit_rule_known), LSM_HOOK_INIT(audit_rule_match, aa_audit_rule_match), LSM_HOOK_INIT(audit_rule_free, aa_audit_rule_free), #endif LSM_HOOK_INIT(secid_to_secctx, apparmor_secid_to_secctx), LSM_HOOK_INIT(lsmprop_to_secctx, apparmor_lsmprop_to_secctx), LSM_HOOK_INIT(secctx_to_secid, apparmor_secctx_to_secid), LSM_HOOK_INIT(release_secctx, apparmor_release_secctx), #ifdef CONFIG_IO_URING LSM_HOOK_INIT(uring_override_creds, apparmor_uring_override_creds), LSM_HOOK_INIT(uring_sqpoll, apparmor_uring_sqpoll), #endif }; /* * AppArmor sysfs module parameters */ static int param_set_aabool(const char *val, const struct kernel_param *kp); static int param_get_aabool(char *buffer, const struct kernel_param *kp); #define param_check_aabool param_check_bool static const struct kernel_param_ops param_ops_aabool = { .flags = KERNEL_PARAM_OPS_FL_NOARG, .set = param_set_aabool, .get = param_get_aabool }; static int param_set_aauint(const char *val, const struct kernel_param *kp); static int param_get_aauint(char *buffer, const struct kernel_param *kp); #define param_check_aauint param_check_uint static const struct kernel_param_ops param_ops_aauint = { .set = param_set_aauint, .get = param_get_aauint }; static int param_set_aacompressionlevel(const char *val, const struct kernel_param *kp); static int param_get_aacompressionlevel(char *buffer, const struct kernel_param *kp); #define param_check_aacompressionlevel param_check_int static const struct kernel_param_ops param_ops_aacompressionlevel = { .set = param_set_aacompressionlevel, .get = param_get_aacompressionlevel }; static int param_set_aalockpolicy(const char *val, const struct kernel_param *kp); static int param_get_aalockpolicy(char *buffer, const struct kernel_param *kp); #define param_check_aalockpolicy param_check_bool static const struct kernel_param_ops param_ops_aalockpolicy = { .flags = KERNEL_PARAM_OPS_FL_NOARG, .set = param_set_aalockpolicy, .get = param_get_aalockpolicy }; static int param_set_debug(const char *val, const struct kernel_param *kp); static int param_get_debug(char *buffer, const struct kernel_param *kp); static int param_set_audit(const char *val, const struct kernel_param *kp); static int param_get_audit(char *buffer, const struct kernel_param *kp); static int param_set_mode(const char *val, const struct kernel_param *kp); static int param_get_mode(char *buffer, const struct kernel_param *kp); /* Flag values, also controllable via /sys/module/apparmor/parameters * We define special types as we want to do additional mediation. */ /* AppArmor global enforcement switch - complain, enforce, kill */ enum profile_mode aa_g_profile_mode = APPARMOR_ENFORCE; module_param_call(mode, param_set_mode, param_get_mode, &aa_g_profile_mode, S_IRUSR | S_IWUSR); /* whether policy verification hashing is enabled */ bool aa_g_hash_policy = IS_ENABLED(CONFIG_SECURITY_APPARMOR_HASH_DEFAULT); #ifdef CONFIG_SECURITY_APPARMOR_HASH module_param_named(hash_policy, aa_g_hash_policy, aabool, S_IRUSR | S_IWUSR); #endif /* whether policy exactly as loaded is retained for debug and checkpointing */ bool aa_g_export_binary = IS_ENABLED(CONFIG_SECURITY_APPARMOR_EXPORT_BINARY); #ifdef CONFIG_SECURITY_APPARMOR_EXPORT_BINARY module_param_named(export_binary, aa_g_export_binary, aabool, 0600); #endif /* policy loaddata compression level */ int aa_g_rawdata_compression_level = AA_DEFAULT_CLEVEL; module_param_named(rawdata_compression_level, aa_g_rawdata_compression_level, aacompressionlevel, 0400); /* Debug mode */ int aa_g_debug; module_param_call(debug, param_set_debug, param_get_debug, &aa_g_debug, 0600); /* Audit mode */ enum audit_mode aa_g_audit; module_param_call(audit, param_set_audit, param_get_audit, &aa_g_audit, S_IRUSR | S_IWUSR); /* Determines if audit header is included in audited messages. This * provides more context if the audit daemon is not running */ bool aa_g_audit_header = true; module_param_named(audit_header, aa_g_audit_header, aabool, S_IRUSR | S_IWUSR); /* lock out loading/removal of policy * TODO: add in at boot loading of policy, which is the only way to * load policy, if lock_policy is set */ bool aa_g_lock_policy; module_param_named(lock_policy, aa_g_lock_policy, aalockpolicy, S_IRUSR | S_IWUSR); /* Syscall logging mode */ bool aa_g_logsyscall; module_param_named(logsyscall, aa_g_logsyscall, aabool, S_IRUSR | S_IWUSR); /* Maximum pathname length before accesses will start getting rejected */ unsigned int aa_g_path_max = 2 * PATH_MAX; module_param_named(path_max, aa_g_path_max, aauint, S_IRUSR); /* Determines how paranoid loading of policy is and how much verification * on the loaded policy is done. * DEPRECATED: read only as strict checking of load is always done now * that none root users (user namespaces) can load policy. */ bool aa_g_paranoid_load = IS_ENABLED(CONFIG_SECURITY_APPARMOR_PARANOID_LOAD); module_param_named(paranoid_load, aa_g_paranoid_load, aabool, S_IRUGO); static int param_get_aaintbool(char *buffer, const struct kernel_param *kp); static int param_set_aaintbool(const char *val, const struct kernel_param *kp); #define param_check_aaintbool param_check_int static const struct kernel_param_ops param_ops_aaintbool = { .set = param_set_aaintbool, .get = param_get_aaintbool }; /* Boot time disable flag */ static int apparmor_enabled __ro_after_init = 1; module_param_named(enabled, apparmor_enabled, aaintbool, 0444); static int __init apparmor_enabled_setup(char *str) { unsigned long enabled; int error = kstrtoul(str, 0, &enabled); if (!error) apparmor_enabled = enabled ? 1 : 0; return 1; } __setup("apparmor=", apparmor_enabled_setup); /* set global flag turning off the ability to load policy */ static int param_set_aalockpolicy(const char *val, const struct kernel_param *kp) { if (!apparmor_enabled) return -EINVAL; if (apparmor_initialized && !aa_current_policy_admin_capable(NULL)) return -EPERM; return param_set_bool(val, kp); } static int param_get_aalockpolicy(char *buffer, const struct kernel_param *kp) { if (!apparmor_enabled) return -EINVAL; if (apparmor_initialized && !aa_current_policy_view_capable(NULL)) return -EPERM; return param_get_bool(buffer, kp); } static int param_set_aabool(const char *val, const struct kernel_param *kp) { if (!apparmor_enabled) return -EINVAL; if (apparmor_initialized && !aa_current_policy_admin_capable(NULL)) return -EPERM; return param_set_bool(val, kp); } static int param_get_aabool(char *buffer, const struct kernel_param *kp) { if (!apparmor_enabled) return -EINVAL; if (apparmor_initialized && !aa_current_policy_view_capable(NULL)) return -EPERM; return param_get_bool(buffer, kp); } static int param_set_aauint(const char *val, const struct kernel_param *kp) { int error; if (!apparmor_enabled) return -EINVAL; /* file is ro but enforce 2nd line check */ if (apparmor_initialized) return -EPERM; error = param_set_uint(val, kp); aa_g_path_max = max_t(uint32_t, aa_g_path_max, sizeof(union aa_buffer)); pr_info("AppArmor: buffer size set to %d bytes\n", aa_g_path_max); return error; } static int param_get_aauint(char *buffer, const struct kernel_param *kp) { if (!apparmor_enabled) return -EINVAL; if (apparmor_initialized && !aa_current_policy_view_capable(NULL)) return -EPERM; return param_get_uint(buffer, kp); } /* Can only be set before AppArmor is initialized (i.e. on boot cmdline). */ static int param_set_aaintbool(const char *val, const struct kernel_param *kp) { struct kernel_param kp_local; bool value; int error; if (apparmor_initialized) return -EPERM; /* Create local copy, with arg pointing to bool type. */ value = !!*((int *)kp->arg); memcpy(&kp_local, kp, sizeof(kp_local)); kp_local.arg = &value; error = param_set_bool(val, &kp_local); if (!error) *((int *)kp->arg) = *((bool *)kp_local.arg); return error; } /* * To avoid changing /sys/module/apparmor/parameters/enabled from Y/N to * 1/0, this converts the "int that is actually bool" back to bool for * display in the /sys filesystem, while keeping it "int" for the LSM * infrastructure. */ static int param_get_aaintbool(char *buffer, const struct kernel_param *kp) { struct kernel_param kp_local; bool value; /* Create local copy, with arg pointing to bool type. */ value = !!*((int *)kp->arg); memcpy(&kp_local, kp, sizeof(kp_local)); kp_local.arg = &value; return param_get_bool(buffer, &kp_local); } static int param_set_aacompressionlevel(const char *val, const struct kernel_param *kp) { int error; if (!apparmor_enabled) return -EINVAL; if (apparmor_initialized) return -EPERM; error = param_set_int(val, kp); aa_g_rawdata_compression_level = clamp(aa_g_rawdata_compression_level, AA_MIN_CLEVEL, AA_MAX_CLEVEL); pr_info("AppArmor: policy rawdata compression level set to %d\n", aa_g_rawdata_compression_level); return error; } static int param_get_aacompressionlevel(char *buffer, const struct kernel_param *kp) { if (!apparmor_enabled) return -EINVAL; if (apparmor_initialized && !aa_current_policy_view_capable(NULL)) return -EPERM; return param_get_int(buffer, kp); } static int param_get_debug(char *buffer, const struct kernel_param *kp) { if (!apparmor_enabled) return -EINVAL; if (apparmor_initialized && !aa_current_policy_view_capable(NULL)) return -EPERM; return aa_print_debug_params(buffer); } static int param_set_debug(const char *val, const struct kernel_param *kp) { int i; if (!apparmor_enabled) return -EINVAL; if (!val) return -EINVAL; if (apparmor_initialized && !aa_current_policy_admin_capable(NULL)) return -EPERM; i = aa_parse_debug_params(val); if (i == DEBUG_PARSE_ERROR) return -EINVAL; aa_g_debug = i; return 0; } static int param_get_audit(char *buffer, const struct kernel_param *kp) { if (!apparmor_enabled) return -EINVAL; if (apparmor_initialized && !aa_current_policy_view_capable(NULL)) return -EPERM; return sprintf(buffer, "%s", audit_mode_names[aa_g_audit]); } static int param_set_audit(const char *val, const struct kernel_param *kp) { int i; if (!apparmor_enabled) return -EINVAL; if (!val) return -EINVAL; if (apparmor_initialized && !aa_current_policy_admin_capable(NULL)) return -EPERM; i = match_string(audit_mode_names, AUDIT_MAX_INDEX, val); if (i < 0) return -EINVAL; aa_g_audit = i; return 0; } static int param_get_mode(char *buffer, const struct kernel_param *kp) { if (!apparmor_enabled) return -EINVAL; if (apparmor_initialized && !aa_current_policy_view_capable(NULL)) return -EPERM; return sprintf(buffer, "%s", aa_profile_mode_names[aa_g_profile_mode]); } static int param_set_mode(const char *val, const struct kernel_param *kp) { int i; if (!apparmor_enabled) return -EINVAL; if (!val) return -EINVAL; if (apparmor_initialized && !aa_current_policy_admin_capable(NULL)) return -EPERM; i = match_string(aa_profile_mode_names, APPARMOR_MODE_NAMES_MAX_INDEX, val); if (i < 0) return -EINVAL; aa_g_profile_mode = i; return 0; } char *aa_get_buffer(bool in_atomic) { union aa_buffer *aa_buf; struct aa_local_cache *cache; bool try_again = true; gfp_t flags = (GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN); /* use per cpu cached buffers first */ cache = get_cpu_ptr(&aa_local_buffers); if (!list_empty(&cache->head)) { aa_buf = list_first_entry(&cache->head, union aa_buffer, list); list_del(&aa_buf->list); cache->hold--; cache->count--; put_cpu_ptr(&aa_local_buffers); return &aa_buf->buffer[0]; } put_cpu_ptr(&aa_local_buffers); if (!spin_trylock(&aa_buffers_lock)) { cache = get_cpu_ptr(&aa_local_buffers); cache->hold += 1; put_cpu_ptr(&aa_local_buffers); spin_lock(&aa_buffers_lock); } else { cache = get_cpu_ptr(&aa_local_buffers); put_cpu_ptr(&aa_local_buffers); } retry: if (buffer_count > reserve_count || (in_atomic && !list_empty(&aa_global_buffers))) { aa_buf = list_first_entry(&aa_global_buffers, union aa_buffer, list); list_del(&aa_buf->list); buffer_count--; spin_unlock(&aa_buffers_lock); return aa_buf->buffer; } if (in_atomic) { /* * out of reserve buffers and in atomic context so increase * how many buffers to keep in reserve */ reserve_count++; flags = GFP_ATOMIC; } spin_unlock(&aa_buffers_lock); if (!in_atomic) might_sleep(); aa_buf = kmalloc(aa_g_path_max, flags); if (!aa_buf) { if (try_again) { try_again = false; spin_lock(&aa_buffers_lock); goto retry; } pr_warn_once("AppArmor: Failed to allocate a memory buffer.\n"); return NULL; } return aa_buf->buffer; } void aa_put_buffer(char *buf) { union aa_buffer *aa_buf; struct aa_local_cache *cache; if (!buf) return; aa_buf = container_of(buf, union aa_buffer, buffer[0]); cache = get_cpu_ptr(&aa_local_buffers); if (!cache->hold) { put_cpu_ptr(&aa_local_buffers); if (spin_trylock(&aa_buffers_lock)) { /* put back on global list */ list_add(&aa_buf->list, &aa_global_buffers); buffer_count++; spin_unlock(&aa_buffers_lock); cache = get_cpu_ptr(&aa_local_buffers); put_cpu_ptr(&aa_local_buffers); return; } /* contention on global list, fallback to percpu */ cache = get_cpu_ptr(&aa_local_buffers); cache->hold += 1; } /* cache in percpu list */ list_add(&aa_buf->list, &cache->head); cache->count++; put_cpu_ptr(&aa_local_buffers); } /* * AppArmor init functions */ /** * set_init_ctx - set a task context and profile on the first task. * * TODO: allow setting an alternate profile than unconfined */ static int __init set_init_ctx(void) { struct cred *cred = (__force struct cred *)current->real_cred; set_cred_label(cred, aa_get_label(ns_unconfined(root_ns))); return 0; } static void destroy_buffers(void) { union aa_buffer *aa_buf; spin_lock(&aa_buffers_lock); while (!list_empty(&aa_global_buffers)) { aa_buf = list_first_entry(&aa_global_buffers, union aa_buffer, list); list_del(&aa_buf->list); spin_unlock(&aa_buffers_lock); kfree(aa_buf); spin_lock(&aa_buffers_lock); } spin_unlock(&aa_buffers_lock); } static int __init alloc_buffers(void) { union aa_buffer *aa_buf; int i, num; /* * per cpu set of cached allocated buffers used to help reduce * lock contention */ for_each_possible_cpu(i) { per_cpu(aa_local_buffers, i).hold = 0; per_cpu(aa_local_buffers, i).count = 0; INIT_LIST_HEAD(&per_cpu(aa_local_buffers, i).head); } /* * A function may require two buffers at once. Usually the buffers are * used for a short period of time and are shared. On UP kernel buffers * two should be enough, with more CPUs it is possible that more * buffers will be used simultaneously. The preallocated pool may grow. * This preallocation has also the side-effect that AppArmor will be * disabled early at boot if aa_g_path_max is extremely high. */ if (num_online_cpus() > 1) num = 4 + RESERVE_COUNT; else num = 2 + RESERVE_COUNT; for (i = 0; i < num; i++) { aa_buf = kmalloc(aa_g_path_max, GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN); if (!aa_buf) { destroy_buffers(); return -ENOMEM; } aa_put_buffer(aa_buf->buffer); } return 0; } #ifdef CONFIG_SYSCTL static int apparmor_dointvec(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { if (!aa_current_policy_admin_capable(NULL)) return -EPERM; if (!apparmor_enabled) return -EINVAL; return proc_dointvec(table, write, buffer, lenp, ppos); } static const struct ctl_table apparmor_sysctl_table[] = { #ifdef CONFIG_USER_NS { .procname = "unprivileged_userns_apparmor_policy", .data = &unprivileged_userns_apparmor_policy, .maxlen = sizeof(int), .mode = 0600, .proc_handler = apparmor_dointvec, }, #endif /* CONFIG_USER_NS */ { .procname = "apparmor_display_secid_mode", .data = &apparmor_display_secid_mode, .maxlen = sizeof(int), .mode = 0600, .proc_handler = apparmor_dointvec, }, { .procname = "apparmor_restrict_unprivileged_unconfined", .data = &aa_unprivileged_unconfined_restricted, .maxlen = sizeof(int), .mode = 0600, .proc_handler = apparmor_dointvec, }, }; static int __init apparmor_init_sysctl(void) { return register_sysctl("kernel", apparmor_sysctl_table) ? 0 : -ENOMEM; } #else static inline int apparmor_init_sysctl(void) { return 0; } #endif /* CONFIG_SYSCTL */ #if defined(CONFIG_NETFILTER) && defined(CONFIG_NETWORK_SECMARK) static unsigned int apparmor_ip_postroute(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct aa_sk_ctx *ctx; struct sock *sk; int error; if (!skb->secmark) return NF_ACCEPT; sk = skb_to_full_sk(skb); if (sk == NULL) return NF_ACCEPT; ctx = aa_sock(sk); rcu_read_lock(); error = apparmor_secmark_check(rcu_dereference(ctx->label), OP_SENDMSG, AA_MAY_SEND, skb->secmark, sk); rcu_read_unlock(); if (!error) return NF_ACCEPT; return NF_DROP_ERR(-ECONNREFUSED); } static const struct nf_hook_ops apparmor_nf_ops[] = { { .hook = apparmor_ip_postroute, .pf = NFPROTO_IPV4, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP_PRI_SELINUX_FIRST, }, #if IS_ENABLED(CONFIG_IPV6) { .hook = apparmor_ip_postroute, .pf = NFPROTO_IPV6, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP6_PRI_SELINUX_FIRST, }, #endif }; static int __net_init apparmor_nf_register(struct net *net) { return nf_register_net_hooks(net, apparmor_nf_ops, ARRAY_SIZE(apparmor_nf_ops)); } static void __net_exit apparmor_nf_unregister(struct net *net) { nf_unregister_net_hooks(net, apparmor_nf_ops, ARRAY_SIZE(apparmor_nf_ops)); } static struct pernet_operations apparmor_net_ops = { .init = apparmor_nf_register, .exit = apparmor_nf_unregister, }; static int __init apparmor_nf_ip_init(void) { int err; if (!apparmor_enabled) return 0; err = register_pernet_subsys(&apparmor_net_ops); if (err) panic("Apparmor: register_pernet_subsys: error %d\n", err); return 0; } #endif static char nulldfa_src[] __aligned(8) = { #include "nulldfa.in" }; static struct aa_dfa *nulldfa; static char stacksplitdfa_src[] __aligned(8) = { #include "stacksplitdfa.in" }; struct aa_dfa *stacksplitdfa; struct aa_policydb *nullpdb; static int __init aa_setup_dfa_engine(void) { int error = -ENOMEM; nullpdb = aa_alloc_pdb(GFP_KERNEL); if (!nullpdb) return -ENOMEM; nulldfa = aa_dfa_unpack(nulldfa_src, sizeof(nulldfa_src), TO_ACCEPT1_FLAG(YYTD_DATA32) | TO_ACCEPT2_FLAG(YYTD_DATA32)); if (IS_ERR(nulldfa)) { error = PTR_ERR(nulldfa); goto fail; } nullpdb->dfa = aa_get_dfa(nulldfa); nullpdb->perms = kcalloc(2, sizeof(struct aa_perms), GFP_KERNEL); if (!nullpdb->perms) goto fail; nullpdb->size = 2; stacksplitdfa = aa_dfa_unpack(stacksplitdfa_src, sizeof(stacksplitdfa_src), TO_ACCEPT1_FLAG(YYTD_DATA32) | TO_ACCEPT2_FLAG(YYTD_DATA32)); if (IS_ERR(stacksplitdfa)) { error = PTR_ERR(stacksplitdfa); goto fail; } return 0; fail: aa_put_pdb(nullpdb); aa_put_dfa(nulldfa); nullpdb = NULL; nulldfa = NULL; stacksplitdfa = NULL; return error; } static void __init aa_teardown_dfa_engine(void) { aa_put_dfa(stacksplitdfa); aa_put_dfa(nulldfa); aa_put_pdb(nullpdb); nullpdb = NULL; stacksplitdfa = NULL; nulldfa = NULL; } static int __init apparmor_init(void) { int error; error = aa_setup_dfa_engine(); if (error) { AA_ERROR("Unable to setup dfa engine\n"); goto alloc_out; } error = aa_alloc_root_ns(); if (error) { AA_ERROR("Unable to allocate default profile namespace\n"); goto alloc_out; } error = apparmor_init_sysctl(); if (error) { AA_ERROR("Unable to register sysctls\n"); goto alloc_out; } error = alloc_buffers(); if (error) { AA_ERROR("Unable to allocate work buffers\n"); goto alloc_out; } error = set_init_ctx(); if (error) { AA_ERROR("Failed to set context on init task\n"); aa_free_root_ns(); goto buffers_out; } security_add_hooks(apparmor_hooks, ARRAY_SIZE(apparmor_hooks), &apparmor_lsmid); /* Inform the audit system that secctx is used */ audit_cfg_lsm(&apparmor_lsmid, AUDIT_CFG_LSM_SECCTX_SUBJECT); /* Report that AppArmor successfully initialized */ apparmor_initialized = 1; if (aa_g_profile_mode == APPARMOR_COMPLAIN) aa_info_message("AppArmor initialized: complain mode enabled"); else if (aa_g_profile_mode == APPARMOR_KILL) aa_info_message("AppArmor initialized: kill mode enabled"); else aa_info_message("AppArmor initialized"); return error; buffers_out: destroy_buffers(); alloc_out: aa_destroy_aafs(); aa_teardown_dfa_engine(); apparmor_enabled = false; return error; } DEFINE_LSM(apparmor) = { .id = &apparmor_lsmid, .flags = LSM_FLAG_LEGACY_MAJOR | LSM_FLAG_EXCLUSIVE, .enabled = &apparmor_enabled, .blobs = &apparmor_blob_sizes, .init = apparmor_init, .initcall_fs = aa_create_aafs, #if defined(CONFIG_NETFILTER) && defined(CONFIG_NETWORK_SECMARK) .initcall_device = apparmor_nf_ip_init, #endif #ifdef CONFIG_SECURITY_APPARMOR_HASH .initcall_late = init_profile_hash, #endif };
5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 // SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 /******************************************************************************* * * Module Name: utxferror - Various error/warning output functions * ******************************************************************************/ #define EXPORT_ACPI_INTERFACES #include <acpi/acpi.h> #include "accommon.h" #define _COMPONENT ACPI_UTILITIES ACPI_MODULE_NAME("utxferror") /* * This module is used for the in-kernel ACPICA as well as the ACPICA * tools/applications. */ #ifndef ACPI_NO_ERROR_MESSAGES /* Entire module */ /******************************************************************************* * * FUNCTION: acpi_error * * PARAMETERS: module_name - Caller's module name (for error output) * line_number - Caller's line number (for error output) * format - Printf format string + additional args * * RETURN: None * * DESCRIPTION: Print "ACPI Error" message with module/line/version info * ******************************************************************************/ void ACPI_INTERNAL_VAR_XFACE acpi_error(const char *module_name, u32 line_number, const char *format, ...) { va_list arg_list; ACPI_MSG_REDIRECT_BEGIN; acpi_os_printf(ACPI_MSG_ERROR); va_start(arg_list, format); acpi_os_vprintf(format, arg_list); ACPI_MSG_SUFFIX; va_end(arg_list); ACPI_MSG_REDIRECT_END; } ACPI_EXPORT_SYMBOL(acpi_error) /******************************************************************************* * * FUNCTION: acpi_exception * * PARAMETERS: module_name - Caller's module name (for error output) * line_number - Caller's line number (for error output) * status - Status value to be decoded/formatted * format - Printf format string + additional args * * RETURN: None * * DESCRIPTION: Print an "ACPI Error" message with module/line/version * info as well as decoded acpi_status. * ******************************************************************************/ void ACPI_INTERNAL_VAR_XFACE acpi_exception(const char *module_name, u32 line_number, acpi_status status, const char *format, ...) { va_list arg_list; ACPI_MSG_REDIRECT_BEGIN; /* For AE_OK, just print the message */ if (ACPI_SUCCESS(status)) { acpi_os_printf(ACPI_MSG_ERROR); } else { acpi_os_printf(ACPI_MSG_ERROR "%s, ", acpi_format_exception(status)); } va_start(arg_list, format); acpi_os_vprintf(format, arg_list); ACPI_MSG_SUFFIX; va_end(arg_list); ACPI_MSG_REDIRECT_END; } ACPI_EXPORT_SYMBOL(acpi_exception) /******************************************************************************* * * FUNCTION: acpi_warning * * PARAMETERS: module_name - Caller's module name (for warning output) * line_number - Caller's line number (for warning output) * format - Printf format string + additional args * * RETURN: None * * DESCRIPTION: Print "ACPI Warning" message with module/line/version info * ******************************************************************************/ void ACPI_INTERNAL_VAR_XFACE acpi_warning(const char *module_name, u32 line_number, const char *format, ...) { va_list arg_list; ACPI_MSG_REDIRECT_BEGIN; acpi_os_printf(ACPI_MSG_WARNING); va_start(arg_list, format); acpi_os_vprintf(format, arg_list); ACPI_MSG_SUFFIX; va_end(arg_list); ACPI_MSG_REDIRECT_END; } ACPI_EXPORT_SYMBOL(acpi_warning) /******************************************************************************* * * FUNCTION: acpi_info * * PARAMETERS: format - Printf format string + additional args * * RETURN: None * * DESCRIPTION: Print generic "ACPI:" information message. There is no * module/line/version info in order to keep the message simple. * ******************************************************************************/ void ACPI_INTERNAL_VAR_XFACE acpi_info(const char *format, ...) { va_list arg_list; ACPI_MSG_REDIRECT_BEGIN; acpi_os_printf(ACPI_MSG_INFO); va_start(arg_list, format); acpi_os_vprintf(format, arg_list); acpi_os_printf("\n"); va_end(arg_list); ACPI_MSG_REDIRECT_END; } ACPI_EXPORT_SYMBOL(acpi_info) /******************************************************************************* * * FUNCTION: acpi_bios_error * * PARAMETERS: module_name - Caller's module name (for error output) * line_number - Caller's line number (for error output) * format - Printf format string + additional args * * RETURN: None * * DESCRIPTION: Print "ACPI Firmware Error" message with module/line/version * info * ******************************************************************************/ void ACPI_INTERNAL_VAR_XFACE acpi_bios_error(const char *module_name, u32 line_number, const char *format, ...) { va_list arg_list; ACPI_MSG_REDIRECT_BEGIN; acpi_os_printf(ACPI_MSG_BIOS_ERROR); va_start(arg_list, format); acpi_os_vprintf(format, arg_list); ACPI_MSG_SUFFIX; va_end(arg_list); ACPI_MSG_REDIRECT_END; } ACPI_EXPORT_SYMBOL(acpi_bios_error) /******************************************************************************* * * FUNCTION: acpi_bios_exception * * PARAMETERS: module_name - Caller's module name (for error output) * line_number - Caller's line number (for error output) * status - Status value to be decoded/formatted * format - Printf format string + additional args * * RETURN: None * * DESCRIPTION: Print an "ACPI Firmware Error" message with module/line/version * info as well as decoded acpi_status. * ******************************************************************************/ void ACPI_INTERNAL_VAR_XFACE acpi_bios_exception(const char *module_name, u32 line_number, acpi_status status, const char *format, ...) { va_list arg_list; ACPI_MSG_REDIRECT_BEGIN; /* For AE_OK, just print the message */ if (ACPI_SUCCESS(status)) { acpi_os_printf(ACPI_MSG_BIOS_ERROR); } else { acpi_os_printf(ACPI_MSG_BIOS_ERROR "%s, ", acpi_format_exception(status)); } va_start(arg_list, format); acpi_os_vprintf(format, arg_list); ACPI_MSG_SUFFIX; va_end(arg_list); ACPI_MSG_REDIRECT_END; } ACPI_EXPORT_SYMBOL(acpi_bios_exception) /******************************************************************************* * * FUNCTION: acpi_bios_warning * * PARAMETERS: module_name - Caller's module name (for warning output) * line_number - Caller's line number (for warning output) * format - Printf format string + additional args * * RETURN: None * * DESCRIPTION: Print "ACPI Firmware Warning" message with module/line/version * info * ******************************************************************************/ void ACPI_INTERNAL_VAR_XFACE acpi_bios_warning(const char *module_name, u32 line_number, const char *format, ...) { va_list arg_list; ACPI_MSG_REDIRECT_BEGIN; acpi_os_printf(ACPI_MSG_BIOS_WARNING); va_start(arg_list, format); acpi_os_vprintf(format, arg_list); ACPI_MSG_SUFFIX; va_end(arg_list); ACPI_MSG_REDIRECT_END; } ACPI_EXPORT_SYMBOL(acpi_bios_warning) #endif /* ACPI_NO_ERROR_MESSAGES */
30 30 102 27 103 104 4 4 104 101 101 101 3 3 113 112 113 112 84 83 80 79 80 2 112 113 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 // SPDX-License-Identifier: GPL-2.0 /* * security/tomoyo/memory.c * * Copyright (C) 2005-2011 NTT DATA CORPORATION */ #include <linux/hash.h> #include <linux/slab.h> #include "common.h" /** * tomoyo_warn_oom - Print out of memory warning message. * * @function: Function's name. */ void tomoyo_warn_oom(const char *function) { /* Reduce error messages. */ static pid_t tomoyo_last_pid; const pid_t pid = current->pid; if (tomoyo_last_pid != pid) { pr_warn("ERROR: Out of memory at %s.\n", function); tomoyo_last_pid = pid; } if (!tomoyo_policy_loaded) panic("MAC Initialization failed.\n"); } /* Memoy currently used by policy/audit log/query. */ unsigned int tomoyo_memory_used[TOMOYO_MAX_MEMORY_STAT]; /* Memory quota for "policy"/"audit log"/"query". */ unsigned int tomoyo_memory_quota[TOMOYO_MAX_MEMORY_STAT]; /** * tomoyo_memory_ok - Check memory quota. * * @ptr: Pointer to allocated memory. * * Returns true on success, false otherwise. * * Returns true if @ptr is not NULL and quota not exceeded, false otherwise. * * Caller holds tomoyo_policy_lock mutex. */ bool tomoyo_memory_ok(void *ptr) { if (ptr) { const size_t s = ksize(ptr); tomoyo_memory_used[TOMOYO_MEMORY_POLICY] += s; if (!tomoyo_memory_quota[TOMOYO_MEMORY_POLICY] || tomoyo_memory_used[TOMOYO_MEMORY_POLICY] <= tomoyo_memory_quota[TOMOYO_MEMORY_POLICY]) return true; tomoyo_memory_used[TOMOYO_MEMORY_POLICY] -= s; } tomoyo_warn_oom(__func__); return false; } /** * tomoyo_commit_ok - Check memory quota. * * @data: Data to copy from. * @size: Size in byte. * * Returns pointer to allocated memory on success, NULL otherwise. * @data is zero-cleared on success. * * Caller holds tomoyo_policy_lock mutex. */ void *tomoyo_commit_ok(void *data, const unsigned int size) { void *ptr = kzalloc(size, GFP_NOFS | __GFP_NOWARN); if (tomoyo_memory_ok(ptr)) { memmove(ptr, data, size); memset(data, 0, size); return ptr; } kfree(ptr); return NULL; } /** * tomoyo_get_group - Allocate memory for "struct tomoyo_path_group"/"struct tomoyo_number_group". * * @param: Pointer to "struct tomoyo_acl_param". * @idx: Index number. * * Returns pointer to "struct tomoyo_group" on success, NULL otherwise. */ struct tomoyo_group *tomoyo_get_group(struct tomoyo_acl_param *param, const u8 idx) { struct tomoyo_group e = { }; struct tomoyo_group *group = NULL; struct list_head *list; const char *group_name = tomoyo_read_token(param); bool found = false; if (!tomoyo_correct_word(group_name) || idx >= TOMOYO_MAX_GROUP) return NULL; e.group_name = tomoyo_get_name(group_name); if (!e.group_name) return NULL; if (mutex_lock_interruptible(&tomoyo_policy_lock)) goto out; list = &param->ns->group_list[idx]; list_for_each_entry(group, list, head.list) { if (e.group_name != group->group_name || atomic_read(&group->head.users) == TOMOYO_GC_IN_PROGRESS) continue; atomic_inc(&group->head.users); found = true; break; } if (!found) { struct tomoyo_group *entry = tomoyo_commit_ok(&e, sizeof(e)); if (entry) { INIT_LIST_HEAD(&entry->member_list); atomic_set(&entry->head.users, 1); list_add_tail_rcu(&entry->head.list, list); group = entry; found = true; } } mutex_unlock(&tomoyo_policy_lock); out: tomoyo_put_name(e.group_name); return found ? group : NULL; } /* * tomoyo_name_list is used for holding string data used by TOMOYO. * Since same string data is likely used for multiple times (e.g. * "/lib/libc-2.5.so"), TOMOYO shares string data in the form of * "const struct tomoyo_path_info *". */ struct list_head tomoyo_name_list[TOMOYO_MAX_HASH]; /** * tomoyo_get_name - Allocate permanent memory for string data. * * @name: The string to store into the permernent memory. * * Returns pointer to "struct tomoyo_path_info" on success, NULL otherwise. */ const struct tomoyo_path_info *tomoyo_get_name(const char *name) { struct tomoyo_name *ptr; unsigned int hash; int len; struct list_head *head; if (!name) return NULL; len = strlen(name) + 1; hash = full_name_hash(NULL, (const unsigned char *) name, len - 1); head = &tomoyo_name_list[hash_long(hash, TOMOYO_HASH_BITS)]; if (mutex_lock_interruptible(&tomoyo_policy_lock)) return NULL; list_for_each_entry(ptr, head, head.list) { if (hash != ptr->entry.hash || strcmp(name, ptr->entry.name) || atomic_read(&ptr->head.users) == TOMOYO_GC_IN_PROGRESS) continue; atomic_inc(&ptr->head.users); goto out; } ptr = kzalloc(sizeof(*ptr) + len, GFP_NOFS | __GFP_NOWARN); if (tomoyo_memory_ok(ptr)) { ptr->entry.name = ((char *) ptr) + sizeof(*ptr); memmove((char *) ptr->entry.name, name, len); atomic_set(&ptr->head.users, 1); tomoyo_fill_path_info(&ptr->entry); list_add_tail(&ptr->head.list, head); } else { kfree(ptr); ptr = NULL; } out: mutex_unlock(&tomoyo_policy_lock); return ptr ? &ptr->entry : NULL; } /* Initial namespace.*/ struct tomoyo_policy_namespace tomoyo_kernel_namespace; /** * tomoyo_mm_init - Initialize mm related code. */ void __init tomoyo_mm_init(void) { int idx; for (idx = 0; idx < TOMOYO_MAX_HASH; idx++) INIT_LIST_HEAD(&tomoyo_name_list[idx]); tomoyo_kernel_namespace.name = "<kernel>"; tomoyo_init_policy_namespace(&tomoyo_kernel_namespace); tomoyo_kernel_domain.ns = &tomoyo_kernel_namespace; INIT_LIST_HEAD(&tomoyo_kernel_domain.acl_info_list); tomoyo_kernel_domain.domainname = tomoyo_get_name("<kernel>"); list_add_tail_rcu(&tomoyo_kernel_domain.list, &tomoyo_domain_list); }
16 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 /* SPDX-License-Identifier: GPL-2.0 */ /* * RT Mutexes: blocking mutual exclusion locks with PI support * * started by Ingo Molnar and Thomas Gleixner: * * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com> * * This file contains the public data structure and API definitions. */ #ifndef __LINUX_RT_MUTEX_H #define __LINUX_RT_MUTEX_H #include <linux/compiler.h> #include <linux/linkage.h> #include <linux/rbtree_types.h> #include <linux/spinlock_types_raw.h> extern int max_lock_depth; struct rt_mutex_base { raw_spinlock_t wait_lock; struct rb_root_cached waiters; struct task_struct *owner; }; #define __RT_MUTEX_BASE_INITIALIZER(rtbasename) \ { \ .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(rtbasename.wait_lock), \ .waiters = RB_ROOT_CACHED, \ .owner = NULL \ } /** * rt_mutex_base_is_locked - is the rtmutex locked * @lock: the mutex to be queried * * Returns true if the mutex is locked, false if unlocked. */ static inline bool rt_mutex_base_is_locked(struct rt_mutex_base *lock) { return READ_ONCE(lock->owner) != NULL; } #ifdef CONFIG_RT_MUTEXES #define RT_MUTEX_HAS_WAITERS 1UL static inline struct task_struct *rt_mutex_owner(struct rt_mutex_base *lock) { unsigned long owner = (unsigned long) READ_ONCE(lock->owner); return (struct task_struct *) (owner & ~RT_MUTEX_HAS_WAITERS); } #endif extern void rt_mutex_base_init(struct rt_mutex_base *rtb); /** * The rt_mutex structure * * @wait_lock: spinlock to protect the structure * @waiters: rbtree root to enqueue waiters in priority order; * caches top-waiter (leftmost node). * @owner: the mutex owner */ struct rt_mutex { struct rt_mutex_base rtmutex; #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; #endif }; struct rt_mutex_waiter; struct hrtimer_sleeper; #ifdef CONFIG_DEBUG_RT_MUTEXES extern void rt_mutex_debug_task_free(struct task_struct *tsk); #else static inline void rt_mutex_debug_task_free(struct task_struct *tsk) { } #endif #define rt_mutex_init(mutex) \ do { \ static struct lock_class_key __key; \ __rt_mutex_init(mutex, __func__, &__key); \ } while (0) #ifdef CONFIG_DEBUG_LOCK_ALLOC #define __DEP_MAP_RT_MUTEX_INITIALIZER(mutexname) \ .dep_map = { \ .name = #mutexname, \ .wait_type_inner = LD_WAIT_SLEEP, \ } #else #define __DEP_MAP_RT_MUTEX_INITIALIZER(mutexname) #endif #define __RT_MUTEX_INITIALIZER(mutexname) \ { \ .rtmutex = __RT_MUTEX_BASE_INITIALIZER(mutexname.rtmutex), \ __DEP_MAP_RT_MUTEX_INITIALIZER(mutexname) \ } #define DEFINE_RT_MUTEX(mutexname) \ struct rt_mutex mutexname = __RT_MUTEX_INITIALIZER(mutexname) extern void __rt_mutex_init(struct rt_mutex *lock, const char *name, struct lock_class_key *key); #ifdef CONFIG_DEBUG_LOCK_ALLOC extern void rt_mutex_lock_nested(struct rt_mutex *lock, unsigned int subclass); extern void _rt_mutex_lock_nest_lock(struct rt_mutex *lock, struct lockdep_map *nest_lock); #define rt_mutex_lock(lock) rt_mutex_lock_nested(lock, 0) #define rt_mutex_lock_nest_lock(lock, nest_lock) \ do { \ typecheck(struct lockdep_map *, &(nest_lock)->dep_map); \ _rt_mutex_lock_nest_lock(lock, &(nest_lock)->dep_map); \ } while (0) #else extern void rt_mutex_lock(struct rt_mutex *lock); #define rt_mutex_lock_nested(lock, subclass) rt_mutex_lock(lock) #define rt_mutex_lock_nest_lock(lock, nest_lock) rt_mutex_lock(lock) #endif extern int rt_mutex_lock_interruptible(struct rt_mutex *lock); extern int rt_mutex_lock_killable(struct rt_mutex *lock); extern int rt_mutex_trylock(struct rt_mutex *lock); extern void rt_mutex_unlock(struct rt_mutex *lock); #endif
17 16 16 15 15 15 16 16 3 16 90 91 89 89 91 16 16 16 16 16 16 15 3 16 90 68 62 413 35 413 63 411 332 348 332 8 32 32 8 8 32 42 34 41 18 42 26 32 42 20 42 35 35 35 10 35 32 25 20 20 20 20 11 11 11 11 11 20 15 56 56 56 55 56 54 55 2611 2608 100 65 100 100 87 85 87 87 87 88 96 10 4 2 10 10 97 72 12 3 12 12 97 17 95 94 88 4 17 17 17 17 17 17 17 17 17 2436 2428 3 2426 57 57 56 56 55 10 55 51 2 56 56 10 56 56 56 56 56 41 7 41 41 27 41 41 41 41 41 41 41 39 11 41 41 5 66 66 42 42 41 41 41 7 41 35 41 41 41 41 41 41 7 41 41 41 42 141 141 7 141 7 7 7 37 37 37 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 // SPDX-License-Identifier: GPL-2.0-only /* * mm/truncate.c - code for taking down pages from address_spaces * * Copyright (C) 2002, Linus Torvalds * * 10Sep2002 Andrew Morton * Initial version. */ #include <linux/kernel.h> #include <linux/backing-dev.h> #include <linux/dax.h> #include <linux/gfp.h> #include <linux/mm.h> #include <linux/swap.h> #include <linux/export.h> #include <linux/pagemap.h> #include <linux/highmem.h> #include <linux/pagevec.h> #include <linux/task_io_accounting_ops.h> #include <linux/shmem_fs.h> #include <linux/rmap.h> #include "internal.h" static void clear_shadow_entries(struct address_space *mapping, unsigned long start, unsigned long max) { XA_STATE(xas, &mapping->i_pages, start); struct folio *folio; /* Handled by shmem itself, or for DAX we do nothing. */ if (shmem_mapping(mapping) || dax_mapping(mapping)) return; xas_set_update(&xas, workingset_update_node); spin_lock(&mapping->host->i_lock); xas_lock_irq(&xas); /* Clear all shadow entries from start to max */ xas_for_each(&xas, folio, max) { if (xa_is_value(folio)) xas_store(&xas, NULL); } xas_unlock_irq(&xas); if (mapping_shrinkable(mapping)) inode_lru_list_add(mapping->host); spin_unlock(&mapping->host->i_lock); } /* * Unconditionally remove exceptional entries. Usually called from truncate * path. Note that the folio_batch may be altered by this function by removing * exceptional entries similar to what folio_batch_remove_exceptionals() does. * Please note that indices[] has entries in ascending order as guaranteed by * either find_get_entries() or find_lock_entries(). */ static void truncate_folio_batch_exceptionals(struct address_space *mapping, struct folio_batch *fbatch, pgoff_t *indices) { XA_STATE(xas, &mapping->i_pages, indices[0]); int nr = folio_batch_count(fbatch); struct folio *folio; int i, j; /* Handled by shmem itself */ if (shmem_mapping(mapping)) return; for (j = 0; j < nr; j++) if (xa_is_value(fbatch->folios[j])) break; if (j == nr) return; if (dax_mapping(mapping)) { for (i = j; i < nr; i++) { if (xa_is_value(fbatch->folios[i])) { /* * File systems should already have called * dax_break_layout_entry() to remove all DAX * entries while holding a lock to prevent * establishing new entries. Therefore we * shouldn't find any here. */ WARN_ON_ONCE(1); /* * Delete the mapping so truncate_pagecache() * doesn't loop forever. */ dax_delete_mapping_entry(mapping, indices[i]); } } goto out; } xas_set(&xas, indices[j]); xas_set_update(&xas, workingset_update_node); spin_lock(&mapping->host->i_lock); xas_lock_irq(&xas); xas_for_each(&xas, folio, indices[nr-1]) { if (xa_is_value(folio)) xas_store(&xas, NULL); } xas_unlock_irq(&xas); if (mapping_shrinkable(mapping)) inode_lru_list_add(mapping->host); spin_unlock(&mapping->host->i_lock); out: folio_batch_remove_exceptionals(fbatch); } /** * folio_invalidate - Invalidate part or all of a folio. * @folio: The folio which is affected. * @offset: start of the range to invalidate * @length: length of the range to invalidate * * folio_invalidate() is called when all or part of the folio has become * invalidated by a truncate operation. * * folio_invalidate() does not have to release all buffers, but it must * ensure that no dirty buffer is left outside @offset and that no I/O * is underway against any of the blocks which are outside the truncation * point. Because the caller is about to free (and possibly reuse) those * blocks on-disk. */ void folio_invalidate(struct folio *folio, size_t offset, size_t length) { const struct address_space_operations *aops = folio->mapping->a_ops; if (aops->invalidate_folio) aops->invalidate_folio(folio, offset, length); } EXPORT_SYMBOL_GPL(folio_invalidate); /* * If truncate cannot remove the fs-private metadata from the page, the page * becomes orphaned. It will be left on the LRU and may even be mapped into * user pagetables if we're racing with filemap_fault(). * * We need to bail out if page->mapping is no longer equal to the original * mapping. This happens a) when the VM reclaimed the page while we waited on * its lock, b) when a concurrent invalidate_mapping_pages got there first and * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space. */ static void truncate_cleanup_folio(struct folio *folio) { if (folio_mapped(folio)) unmap_mapping_folio(folio); if (folio_needs_release(folio)) folio_invalidate(folio, 0, folio_size(folio)); /* * Some filesystems seem to re-dirty the page even after * the VM has canceled the dirty bit (eg ext3 journaling). * Hence dirty accounting check is placed after invalidation. */ folio_cancel_dirty(folio); } int truncate_inode_folio(struct address_space *mapping, struct folio *folio) { if (folio->mapping != mapping) return -EIO; truncate_cleanup_folio(folio); filemap_remove_folio(folio); return 0; } static int try_folio_split_or_unmap(struct folio *folio, struct page *split_at, unsigned long min_order) { enum ttu_flags ttu_flags = TTU_SYNC | TTU_SPLIT_HUGE_PMD | TTU_IGNORE_MLOCK; int ret; ret = try_folio_split_to_order(folio, split_at, min_order); /* * If the split fails, unmap the folio, so it will be refaulted * with PTEs to respect SIGBUS semantics. * * Make an exception for shmem/tmpfs that for long time * intentionally mapped with PMDs across i_size. */ if (ret && !shmem_mapping(folio->mapping)) { try_to_unmap(folio, ttu_flags); WARN_ON(folio_mapped(folio)); } return ret; } /* * Handle partial folios. The folio may be entirely within the * range if a split has raced with us. If not, we zero the part of the * folio that's within the [start, end] range, and then split the folio if * it's large. split_page_range() will discard pages which now lie beyond * i_size, and we rely on the caller to discard pages which lie within a * newly created hole. * * Returns false if splitting failed so the caller can avoid * discarding the entire folio which is stubbornly unsplit. */ bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end) { loff_t pos = folio_pos(folio); size_t size = folio_size(folio); unsigned int offset, length; struct page *split_at, *split_at2; unsigned int min_order; if (pos < start) offset = start - pos; else offset = 0; if (pos + size <= (u64)end) length = size - offset; else length = end + 1 - pos - offset; folio_wait_writeback(folio); if (length == size) { truncate_inode_folio(folio->mapping, folio); return true; } /* * We may be zeroing pages we're about to discard, but it avoids * doing a complex calculation here, and then doing the zeroing * anyway if the page split fails. */ if (!mapping_inaccessible(folio->mapping)) folio_zero_range(folio, offset, length); if (folio_needs_release(folio)) folio_invalidate(folio, offset, length); if (!folio_test_large(folio)) return true; min_order = mapping_min_folio_order(folio->mapping); split_at = folio_page(folio, PAGE_ALIGN_DOWN(offset) / PAGE_SIZE); if (!try_folio_split_or_unmap(folio, split_at, min_order)) { /* * try to split at offset + length to make sure folios within * the range can be dropped, especially to avoid memory waste * for shmem truncate */ struct folio *folio2; if (offset + length == size) goto no_split; split_at2 = folio_page(folio, PAGE_ALIGN_DOWN(offset + length) / PAGE_SIZE); folio2 = page_folio(split_at2); if (!folio_try_get(folio2)) goto no_split; if (!folio_test_large(folio2)) goto out; if (!folio_trylock(folio2)) goto out; /* make sure folio2 is large and does not change its mapping */ if (folio_test_large(folio2) && folio2->mapping == folio->mapping) try_folio_split_or_unmap(folio2, split_at2, min_order); folio_unlock(folio2); out: folio_put(folio2); no_split: return true; } if (folio_test_dirty(folio)) return false; truncate_inode_folio(folio->mapping, folio); return true; } /* * Used to get rid of pages on hardware memory corruption. */ int generic_error_remove_folio(struct address_space *mapping, struct folio *folio) { if (!mapping) return -EINVAL; /* * Only punch for normal data pages for now. * Handling other types like directories would need more auditing. */ if (!S_ISREG(mapping->host->i_mode)) return -EIO; return truncate_inode_folio(mapping, folio); } EXPORT_SYMBOL(generic_error_remove_folio); /** * mapping_evict_folio() - Remove an unused folio from the page-cache. * @mapping: The mapping this folio belongs to. * @folio: The folio to remove. * * Safely remove one folio from the page cache. * It only drops clean, unused folios. * * Context: Folio must be locked. * Return: The number of pages successfully removed. */ long mapping_evict_folio(struct address_space *mapping, struct folio *folio) { /* The page may have been truncated before it was locked */ if (!mapping) return 0; if (folio_test_dirty(folio) || folio_test_writeback(folio)) return 0; /* The refcount will be elevated if any page in the folio is mapped */ if (folio_ref_count(folio) > folio_nr_pages(folio) + folio_has_private(folio) + 1) return 0; if (!filemap_release_folio(folio, 0)) return 0; return remove_mapping(mapping, folio); } /** * truncate_inode_pages_range - truncate range of pages specified by start & end byte offsets * @mapping: mapping to truncate * @lstart: offset from which to truncate * @lend: offset to which to truncate (inclusive) * * Truncate the page cache, removing the pages that are between * specified offsets (and zeroing out partial pages * if lstart or lend + 1 is not page aligned). * * Truncate takes two passes - the first pass is nonblocking. It will not * block on page locks and it will not block on writeback. The second pass * will wait. This is to prevent as much IO as possible in the affected region. * The first pass will remove most pages, so the search cost of the second pass * is low. * * We pass down the cache-hot hint to the page freeing code. Even if the * mapping is large, it is probably the case that the final pages are the most * recently touched, and freeing happens in ascending file offset order. * * Note that since ->invalidate_folio() accepts range to invalidate * truncate_inode_pages_range is able to handle cases where lend + 1 is not * page aligned properly. */ void truncate_inode_pages_range(struct address_space *mapping, loff_t lstart, uoff_t lend) { pgoff_t start; /* inclusive */ pgoff_t end; /* exclusive */ struct folio_batch fbatch; pgoff_t indices[PAGEVEC_SIZE]; pgoff_t index; int i; struct folio *folio; bool same_folio; if (mapping_empty(mapping)) return; /* * 'start' and 'end' always covers the range of pages to be fully * truncated. Partial pages are covered with 'partial_start' at the * start of the range and 'partial_end' at the end of the range. * Note that 'end' is exclusive while 'lend' is inclusive. */ start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT; if (lend == -1) /* * lend == -1 indicates end-of-file so we have to set 'end' * to the highest possible pgoff_t and since the type is * unsigned we're using -1. */ end = -1; else end = (lend + 1) >> PAGE_SHIFT; folio_batch_init(&fbatch); index = start; while (index < end && find_lock_entries(mapping, &index, end - 1, &fbatch, indices)) { truncate_folio_batch_exceptionals(mapping, &fbatch, indices); for (i = 0; i < folio_batch_count(&fbatch); i++) truncate_cleanup_folio(fbatch.folios[i]); delete_from_page_cache_batch(mapping, &fbatch); for (i = 0; i < folio_batch_count(&fbatch); i++) folio_unlock(fbatch.folios[i]); folio_batch_release(&fbatch); cond_resched(); } same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT); folio = __filemap_get_folio(mapping, lstart >> PAGE_SHIFT, FGP_LOCK, 0); if (!IS_ERR(folio)) { same_folio = lend < folio_next_pos(folio); if (!truncate_inode_partial_folio(folio, lstart, lend)) { start = folio_next_index(folio); if (same_folio) end = folio->index; } folio_unlock(folio); folio_put(folio); folio = NULL; } if (!same_folio) { folio = __filemap_get_folio(mapping, lend >> PAGE_SHIFT, FGP_LOCK, 0); if (!IS_ERR(folio)) { if (!truncate_inode_partial_folio(folio, lstart, lend)) end = folio->index; folio_unlock(folio); folio_put(folio); } } index = start; while (index < end) { cond_resched(); if (!find_get_entries(mapping, &index, end - 1, &fbatch, indices)) { /* If all gone from start onwards, we're done */ if (index == start) break; /* Otherwise restart to make sure all gone */ index = start; continue; } for (i = 0; i < folio_batch_count(&fbatch); i++) { struct folio *folio = fbatch.folios[i]; /* We rely upon deletion not changing folio->index */ if (xa_is_value(folio)) continue; folio_lock(folio); VM_BUG_ON_FOLIO(!folio_contains(folio, indices[i]), folio); folio_wait_writeback(folio); truncate_inode_folio(mapping, folio); folio_unlock(folio); } truncate_folio_batch_exceptionals(mapping, &fbatch, indices); folio_batch_release(&fbatch); } } EXPORT_SYMBOL(truncate_inode_pages_range); /** * truncate_inode_pages - truncate *all* the pages from an offset * @mapping: mapping to truncate * @lstart: offset from which to truncate * * Called under (and serialised by) inode->i_rwsem and * mapping->invalidate_lock. * * Note: When this function returns, there can be a page in the process of * deletion (inside __filemap_remove_folio()) in the specified range. Thus * mapping->nrpages can be non-zero when this function returns even after * truncation of the whole mapping. */ void truncate_inode_pages(struct address_space *mapping, loff_t lstart) { truncate_inode_pages_range(mapping, lstart, (loff_t)-1); } EXPORT_SYMBOL(truncate_inode_pages); /** * truncate_inode_pages_final - truncate *all* pages before inode dies * @mapping: mapping to truncate * * Called under (and serialized by) inode->i_rwsem. * * Filesystems have to use this in the .evict_inode path to inform the * VM that this is the final truncate and the inode is going away. */ void truncate_inode_pages_final(struct address_space *mapping) { /* * Page reclaim can not participate in regular inode lifetime * management (can't call iput()) and thus can race with the * inode teardown. Tell it when the address space is exiting, * so that it does not install eviction information after the * final truncate has begun. */ mapping_set_exiting(mapping); if (!mapping_empty(mapping)) { /* * As truncation uses a lockless tree lookup, cycle * the tree lock to make sure any ongoing tree * modification that does not see AS_EXITING is * completed before starting the final truncate. */ xa_lock_irq(&mapping->i_pages); xa_unlock_irq(&mapping->i_pages); } truncate_inode_pages(mapping, 0); } EXPORT_SYMBOL(truncate_inode_pages_final); /** * mapping_try_invalidate - Invalidate all the evictable folios of one inode * @mapping: the address_space which holds the folios to invalidate * @start: the offset 'from' which to invalidate * @end: the offset 'to' which to invalidate (inclusive) * @nr_failed: How many folio invalidations failed * * This function is similar to invalidate_mapping_pages(), except that it * returns the number of folios which could not be evicted in @nr_failed. */ unsigned long mapping_try_invalidate(struct address_space *mapping, pgoff_t start, pgoff_t end, unsigned long *nr_failed) { pgoff_t indices[PAGEVEC_SIZE]; struct folio_batch fbatch; pgoff_t index = start; unsigned long ret; unsigned long count = 0; int i; folio_batch_init(&fbatch); while (find_lock_entries(mapping, &index, end, &fbatch, indices)) { bool xa_has_values = false; int nr = folio_batch_count(&fbatch); for (i = 0; i < nr; i++) { struct folio *folio = fbatch.folios[i]; /* We rely upon deletion not changing folio->index */ if (xa_is_value(folio)) { xa_has_values = true; count++; continue; } ret = mapping_evict_folio(mapping, folio); folio_unlock(folio); /* * Invalidation is a hint that the folio is no longer * of interest and try to speed up its reclaim. */ if (!ret) { deactivate_file_folio(folio); /* Likely in the lru cache of a remote CPU */ if (nr_failed) (*nr_failed)++; } count += ret; } if (xa_has_values) clear_shadow_entries(mapping, indices[0], indices[nr-1]); folio_batch_remove_exceptionals(&fbatch); folio_batch_release(&fbatch); cond_resched(); } return count; } /** * invalidate_mapping_pages - Invalidate all clean, unlocked cache of one inode * @mapping: the address_space which holds the cache to invalidate * @start: the offset 'from' which to invalidate * @end: the offset 'to' which to invalidate (inclusive) * * This function removes pages that are clean, unmapped and unlocked, * as well as shadow entries. It will not block on IO activity. * * If you want to remove all the pages of one inode, regardless of * their use and writeback state, use truncate_inode_pages(). * * Return: The number of indices that had their contents invalidated */ unsigned long invalidate_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t end) { return mapping_try_invalidate(mapping, start, end, NULL); } EXPORT_SYMBOL(invalidate_mapping_pages); static int folio_launder(struct address_space *mapping, struct folio *folio) { if (!folio_test_dirty(folio)) return 0; if (folio->mapping != mapping || mapping->a_ops->launder_folio == NULL) return 0; return mapping->a_ops->launder_folio(folio); } /* * This is like mapping_evict_folio(), except it ignores the folio's * refcount. We do this because invalidate_inode_pages2() needs stronger * invalidation guarantees, and cannot afford to leave folios behind because * shrink_folio_list() has a temp ref on them, or because they're transiently * sitting in the folio_add_lru() caches. */ int folio_unmap_invalidate(struct address_space *mapping, struct folio *folio, gfp_t gfp) { int ret; VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); if (folio_mapped(folio)) unmap_mapping_folio(folio); BUG_ON(folio_mapped(folio)); ret = folio_launder(mapping, folio); if (ret) return ret; if (folio->mapping != mapping) return -EBUSY; if (!filemap_release_folio(folio, gfp)) return -EBUSY; spin_lock(&mapping->host->i_lock); xa_lock_irq(&mapping->i_pages); if (folio_test_dirty(folio)) goto failed; BUG_ON(folio_has_private(folio)); __filemap_remove_folio(folio, NULL); xa_unlock_irq(&mapping->i_pages); if (mapping_shrinkable(mapping)) inode_lru_list_add(mapping->host); spin_unlock(&mapping->host->i_lock); filemap_free_folio(mapping, folio); return 1; failed: xa_unlock_irq(&mapping->i_pages); spin_unlock(&mapping->host->i_lock); return -EBUSY; } /** * invalidate_inode_pages2_range - remove range of pages from an address_space * @mapping: the address_space * @start: the page offset 'from' which to invalidate * @end: the page offset 'to' which to invalidate (inclusive) * * Any pages which are found to be mapped into pagetables are unmapped prior to * invalidation. * * Return: -EBUSY if any pages could not be invalidated. */ int invalidate_inode_pages2_range(struct address_space *mapping, pgoff_t start, pgoff_t end) { pgoff_t indices[PAGEVEC_SIZE]; struct folio_batch fbatch; pgoff_t index; int i; int ret = 0; int ret2 = 0; int did_range_unmap = 0; if (mapping_empty(mapping)) return 0; folio_batch_init(&fbatch); index = start; while (find_get_entries(mapping, &index, end, &fbatch, indices)) { bool xa_has_values = false; int nr = folio_batch_count(&fbatch); for (i = 0; i < nr; i++) { struct folio *folio = fbatch.folios[i]; /* We rely upon deletion not changing folio->index */ if (xa_is_value(folio)) { xa_has_values = true; if (dax_mapping(mapping) && !dax_invalidate_mapping_entry_sync(mapping, indices[i])) ret = -EBUSY; continue; } if (!did_range_unmap && folio_mapped(folio)) { /* * If folio is mapped, before taking its lock, * zap the rest of the file in one hit. */ unmap_mapping_pages(mapping, indices[i], (1 + end - indices[i]), false); did_range_unmap = 1; } folio_lock(folio); if (unlikely(folio->mapping != mapping)) { folio_unlock(folio); continue; } VM_BUG_ON_FOLIO(!folio_contains(folio, indices[i]), folio); folio_wait_writeback(folio); ret2 = folio_unmap_invalidate(mapping, folio, GFP_KERNEL); if (ret2 < 0) ret = ret2; folio_unlock(folio); } if (xa_has_values) clear_shadow_entries(mapping, indices[0], indices[nr-1]); folio_batch_remove_exceptionals(&fbatch); folio_batch_release(&fbatch); cond_resched(); } /* * For DAX we invalidate page tables after invalidating page cache. We * could invalidate page tables while invalidating each entry however * that would be expensive. And doing range unmapping before doesn't * work as we have no cheap way to find whether page cache entry didn't * get remapped later. */ if (dax_mapping(mapping)) { unmap_mapping_pages(mapping, start, end - start + 1, false); } return ret; } EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range); /** * invalidate_inode_pages2 - remove all pages from an address_space * @mapping: the address_space * * Any pages which are found to be mapped into pagetables are unmapped prior to * invalidation. * * Return: -EBUSY if any pages could not be invalidated. */ int invalidate_inode_pages2(struct address_space *mapping) { return invalidate_inode_pages2_range(mapping, 0, -1); } EXPORT_SYMBOL_GPL(invalidate_inode_pages2); /** * truncate_pagecache - unmap and remove pagecache that has been truncated * @inode: inode * @newsize: new file size * * inode's new i_size must already be written before truncate_pagecache * is called. * * This function should typically be called before the filesystem * releases resources associated with the freed range (eg. deallocates * blocks). This way, pagecache will always stay logically coherent * with on-disk format, and the filesystem would not have to deal with * situations such as writepage being called for a page that has already * had its underlying blocks deallocated. */ void truncate_pagecache(struct inode *inode, loff_t newsize) { struct address_space *mapping = inode->i_mapping; loff_t holebegin = round_up(newsize, PAGE_SIZE); /* * unmap_mapping_range is called twice, first simply for * efficiency so that truncate_inode_pages does fewer * single-page unmaps. However after this first call, and * before truncate_inode_pages finishes, it is possible for * private pages to be COWed, which remain after * truncate_inode_pages finishes, hence the second * unmap_mapping_range call must be made for correctness. */ unmap_mapping_range(mapping, holebegin, 0, 1); truncate_inode_pages(mapping, newsize); unmap_mapping_range(mapping, holebegin, 0, 1); } EXPORT_SYMBOL(truncate_pagecache); /** * truncate_setsize - update inode and pagecache for a new file size * @inode: inode * @newsize: new file size * * truncate_setsize updates i_size and performs pagecache truncation (if * necessary) to @newsize. It will be typically be called from the filesystem's * setattr function when ATTR_SIZE is passed in. * * Must be called with a lock serializing truncates and writes (generally * i_rwsem but e.g. xfs uses a different lock) and before all filesystem * specific block truncation has been performed. */ void truncate_setsize(struct inode *inode, loff_t newsize) { loff_t oldsize = inode->i_size; i_size_write(inode, newsize); if (newsize > oldsize) pagecache_isize_extended(inode, oldsize, newsize); truncate_pagecache(inode, newsize); } EXPORT_SYMBOL(truncate_setsize); /** * pagecache_isize_extended - update pagecache after extension of i_size * @inode: inode for which i_size was extended * @from: original inode size * @to: new inode size * * Handle extension of inode size either caused by extending truncate or * by write starting after current i_size. We mark the page straddling * current i_size RO so that page_mkwrite() is called on the first * write access to the page. The filesystem will update its per-block * information before user writes to the page via mmap after the i_size * has been changed. * * The function must be called after i_size is updated so that page fault * coming after we unlock the folio will already see the new i_size. * The function must be called while we still hold i_rwsem - this not only * makes sure i_size is stable but also that userspace cannot observe new * i_size value before we are prepared to store mmap writes at new inode size. */ void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to) { int bsize = i_blocksize(inode); loff_t rounded_from; struct folio *folio; WARN_ON(to > inode->i_size); if (from >= to || bsize >= PAGE_SIZE) return; /* Page straddling @from will not have any hole block created? */ rounded_from = round_up(from, bsize); if (to <= rounded_from || !(rounded_from & (PAGE_SIZE - 1))) return; folio = filemap_lock_folio(inode->i_mapping, from / PAGE_SIZE); /* Folio not cached? Nothing to do */ if (IS_ERR(folio)) return; /* * See folio_clear_dirty_for_io() for details why folio_mark_dirty() * is needed. */ if (folio_mkclean(folio)) folio_mark_dirty(folio); /* * The post-eof range of the folio must be zeroed before it is exposed * to the file. Writeback normally does this, but since i_size has been * increased we handle it here. */ if (folio_test_dirty(folio)) { unsigned int offset, end; offset = from - folio_pos(folio); end = min_t(unsigned int, to - folio_pos(folio), folio_size(folio)); folio_zero_segment(folio, offset, end); } folio_unlock(folio); folio_put(folio); } EXPORT_SYMBOL(pagecache_isize_extended); /** * truncate_pagecache_range - unmap and remove pagecache that is hole-punched * @inode: inode * @lstart: offset of beginning of hole * @lend: offset of last byte of hole * * This function should typically be called before the filesystem * releases resources associated with the freed range (eg. deallocates * blocks). This way, pagecache will always stay logically coherent * with on-disk format, and the filesystem would not have to deal with * situations such as writepage being called for a page that has already * had its underlying blocks deallocated. */ void truncate_pagecache_range(struct inode *inode, loff_t lstart, loff_t lend) { struct address_space *mapping = inode->i_mapping; loff_t unmap_start = round_up(lstart, PAGE_SIZE); loff_t unmap_end = round_down(1 + lend, PAGE_SIZE) - 1; /* * This rounding is currently just for example: unmap_mapping_range * expands its hole outwards, whereas we want it to contract the hole * inwards. However, existing callers of truncate_pagecache_range are * doing their own page rounding first. Note that unmap_mapping_range * allows holelen 0 for all, and we allow lend -1 for end of file. */ /* * Unlike in truncate_pagecache, unmap_mapping_range is called only * once (before truncating pagecache), and without "even_cows" flag: * hole-punching should not remove private COWed pages from the hole. */ if ((u64)unmap_end > (u64)unmap_start) unmap_mapping_range(mapping, unmap_start, 1 + unmap_end - unmap_start, 0); truncate_inode_pages_range(mapping, lstart, lend); } EXPORT_SYMBOL(truncate_pagecache_range);
26 26 26 26 17 17 9 2 1 2 14 14 11 11 11 11 11 11 10 14 4 14 14 13 13 4 13 13 7 7 7 10 3 2 2 1 19 2 1 16 15 17 16 13 12 3 11 11 11 2 9 2 17 25 23 17 5 11 11 11 10 11 1 1 1 1 10 10 10 10 10 10 10 1 1 12 25 4 4 2 2 2 1 1 2 4 1 1 1 9 9 2 1 1 2 1 1 4 3 3 3 2 2 1 1 3 2 2 1 3 3 2 1 1 3 3 3 1 1 3 4 3 3 3 3 4 4 4 4 4 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 // SPDX-License-Identifier: GPL-2.0-or-later /***************************************************************************** * Linux PPP over L2TP (PPPoX/PPPoL2TP) Sockets * * PPPoX --- Generic PPP encapsulation socket family * PPPoL2TP --- PPP over L2TP (RFC 2661) * * Version: 2.0.0 * * Authors: James Chapman (jchapman@katalix.com) * * Based on original work by Martijn van Oosterhout <kleptog@svana.org> * * License: */ /* This driver handles only L2TP data frames; control frames are handled by a * userspace application. * * To send data in an L2TP session, userspace opens a PPPoL2TP socket and * attaches it to a bound UDP socket with local tunnel_id / session_id and * peer tunnel_id / session_id set. Data can then be sent or received using * regular socket sendmsg() / recvmsg() calls. Kernel parameters of the socket * can be read or modified using ioctl() or [gs]etsockopt() calls. * * When a PPPoL2TP socket is connected with local and peer session_id values * zero, the socket is treated as a special tunnel management socket. * * Here's example userspace code to create a socket for sending/receiving data * over an L2TP session:- * * struct sockaddr_pppol2tp sax; * int fd; * int session_fd; * * fd = socket(AF_PPPOX, SOCK_DGRAM, PX_PROTO_OL2TP); * * sax.sa_family = AF_PPPOX; * sax.sa_protocol = PX_PROTO_OL2TP; * sax.pppol2tp.fd = tunnel_fd; // bound UDP socket * sax.pppol2tp.addr.sin_addr.s_addr = addr->sin_addr.s_addr; * sax.pppol2tp.addr.sin_port = addr->sin_port; * sax.pppol2tp.addr.sin_family = AF_INET; * sax.pppol2tp.s_tunnel = tunnel_id; * sax.pppol2tp.s_session = session_id; * sax.pppol2tp.d_tunnel = peer_tunnel_id; * sax.pppol2tp.d_session = peer_session_id; * * session_fd = connect(fd, (struct sockaddr *)&sax, sizeof(sax)); * * A pppd plugin that allows PPP traffic to be carried over L2TP using * this driver is available from the OpenL2TP project at * http://openl2tp.sourceforge.net. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/string.h> #include <linux/list.h> #include <linux/uaccess.h> #include <linux/kernel.h> #include <linux/spinlock.h> #include <linux/kthread.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/errno.h> #include <linux/jiffies.h> #include <linux/netdevice.h> #include <linux/net.h> #include <linux/inetdevice.h> #include <linux/skbuff.h> #include <linux/init.h> #include <linux/ip.h> #include <linux/udp.h> #include <linux/if_pppox.h> #include <linux/if_pppol2tp.h> #include <net/sock.h> #include <linux/ppp_channel.h> #include <linux/ppp_defs.h> #include <linux/ppp-ioctl.h> #include <linux/file.h> #include <linux/hash.h> #include <linux/sort.h> #include <linux/proc_fs.h> #include <linux/l2tp.h> #include <linux/nsproxy.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/ip.h> #include <net/udp.h> #include <net/inet_common.h> #include <asm/byteorder.h> #include <linux/atomic.h> #include "l2tp_core.h" #define PPPOL2TP_DRV_VERSION "V2.0" /* Space for UDP, L2TP and PPP headers */ #define PPPOL2TP_HEADER_OVERHEAD 40 /* Number of bytes to build transmit L2TP headers. * Unfortunately the size is different depending on whether sequence numbers * are enabled. */ #define PPPOL2TP_L2TP_HDR_SIZE_SEQ 10 #define PPPOL2TP_L2TP_HDR_SIZE_NOSEQ 6 /* Private data of each session. This data lives at the end of struct * l2tp_session, referenced via session->priv[]. */ struct pppol2tp_session { int owner; /* pid that opened the socket */ struct mutex sk_lock; /* Protects .sk */ struct sock __rcu *sk; /* Pointer to the session PPPoX socket */ struct sock *__sk; /* Copy of .sk, for cleanup */ }; static int pppol2tp_xmit(struct ppp_channel *chan, struct sk_buff *skb); static const struct ppp_channel_ops pppol2tp_chan_ops = { .start_xmit = pppol2tp_xmit, }; static const struct proto_ops pppol2tp_ops; /* Retrieves the pppol2tp socket associated to a session. */ static struct sock *pppol2tp_session_get_sock(struct l2tp_session *session) { struct pppol2tp_session *ps = l2tp_session_priv(session); return rcu_dereference(ps->sk); } /* Helpers to obtain tunnel/session contexts from sockets. */ static struct l2tp_session *pppol2tp_sock_to_session(struct sock *sk) { struct l2tp_session *session; if (!sk) return NULL; rcu_read_lock(); session = rcu_dereference_sk_user_data(sk); if (session && refcount_inc_not_zero(&session->ref_count)) { rcu_read_unlock(); WARN_ON_ONCE(session->magic != L2TP_SESSION_MAGIC); return session; } rcu_read_unlock(); return NULL; } /***************************************************************************** * Receive data handling *****************************************************************************/ /* Receive message. This is the recvmsg for the PPPoL2TP socket. */ static int pppol2tp_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { int err; struct sk_buff *skb; struct sock *sk = sock->sk; err = -EIO; if (sk->sk_state & PPPOX_BOUND) goto end; err = 0; skb = skb_recv_datagram(sk, flags, &err); if (!skb) goto end; if (len > skb->len) len = skb->len; else if (len < skb->len) msg->msg_flags |= MSG_TRUNC; err = skb_copy_datagram_msg(skb, 0, msg, len); if (likely(err == 0)) err = len; kfree_skb(skb); end: return err; } static void pppol2tp_recv(struct l2tp_session *session, struct sk_buff *skb, int data_len) { struct sock *sk; /* If the socket is bound, send it in to PPP's input queue. Otherwise * queue it on the session socket. */ rcu_read_lock(); sk = pppol2tp_session_get_sock(session); if (!sk) goto no_sock; /* If the first two bytes are 0xFF03, consider that it is the PPP's * Address and Control fields and skip them. The L2TP module has always * worked this way, although, in theory, the use of these fields should * be negotiated and handled at the PPP layer. These fields are * constant: 0xFF is the All-Stations Address and 0x03 the Unnumbered * Information command with Poll/Final bit set to zero (RFC 1662). */ if (pskb_may_pull(skb, 2) && skb->data[0] == PPP_ALLSTATIONS && skb->data[1] == PPP_UI) skb_pull(skb, 2); if (sk->sk_state & PPPOX_BOUND) { struct pppox_sock *po; po = pppox_sk(sk); ppp_input(&po->chan, skb); } else { if (sock_queue_rcv_skb(sk, skb) < 0) { atomic_long_inc(&session->stats.rx_errors); kfree_skb(skb); } } rcu_read_unlock(); return; no_sock: rcu_read_unlock(); pr_warn_ratelimited("%s: no socket in recv\n", session->name); kfree_skb(skb); } /************************************************************************ * Transmit handling ***********************************************************************/ /* This is the sendmsg for the PPPoL2TP pppol2tp_session socket. We come here * when a user application does a sendmsg() on the session socket. L2TP and * PPP headers must be inserted into the user's data. */ static int pppol2tp_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) { struct sock *sk = sock->sk; struct sk_buff *skb; int error; struct l2tp_session *session; struct l2tp_tunnel *tunnel; int uhlen; error = -ENOTCONN; if (sock_flag(sk, SOCK_DEAD) || !(sk->sk_state & PPPOX_CONNECTED)) goto error; /* Get session and tunnel contexts */ error = -EBADF; session = pppol2tp_sock_to_session(sk); if (!session) goto error; tunnel = session->tunnel; uhlen = (tunnel->encap == L2TP_ENCAPTYPE_UDP) ? sizeof(struct udphdr) : 0; /* Allocate a socket buffer */ error = -ENOMEM; skb = sock_wmalloc(sk, NET_SKB_PAD + sizeof(struct iphdr) + uhlen + session->hdr_len + 2 + total_len, /* 2 bytes for PPP_ALLSTATIONS & PPP_UI */ 0, GFP_KERNEL); if (!skb) goto error_put_sess; /* Reserve space for headers. */ skb_reserve(skb, NET_SKB_PAD); skb_reset_network_header(skb); skb_reserve(skb, sizeof(struct iphdr)); skb_reset_transport_header(skb); skb_reserve(skb, uhlen); /* Add PPP header */ skb->data[0] = PPP_ALLSTATIONS; skb->data[1] = PPP_UI; skb_put(skb, 2); /* Copy user data into skb */ error = memcpy_from_msg(skb_put(skb, total_len), m, total_len); if (error < 0) { kfree_skb(skb); goto error_put_sess; } local_bh_disable(); l2tp_xmit_skb(session, skb); local_bh_enable(); l2tp_session_put(session); return total_len; error_put_sess: l2tp_session_put(session); error: return error; } /* Transmit function called by generic PPP driver. Sends PPP frame * over PPPoL2TP socket. * * This is almost the same as pppol2tp_sendmsg(), but rather than * being called with a msghdr from userspace, it is called with a skb * from the kernel. * * The supplied skb from ppp doesn't have enough headroom for the * insertion of L2TP, UDP and IP headers so we need to allocate more * headroom in the skb. This will create a cloned skb. But we must be * careful in the error case because the caller will expect to free * the skb it supplied, not our cloned skb. So we take care to always * leave the original skb unfreed if we return an error. */ static int pppol2tp_xmit(struct ppp_channel *chan, struct sk_buff *skb) { struct sock *sk = (struct sock *)chan->private; struct l2tp_session *session; struct l2tp_tunnel *tunnel; int uhlen, headroom; if (sock_flag(sk, SOCK_DEAD) || !(sk->sk_state & PPPOX_CONNECTED)) goto abort; /* Get session and tunnel contexts from the socket */ session = pppol2tp_sock_to_session(sk); if (!session) goto abort; tunnel = session->tunnel; uhlen = (tunnel->encap == L2TP_ENCAPTYPE_UDP) ? sizeof(struct udphdr) : 0; headroom = NET_SKB_PAD + sizeof(struct iphdr) + /* IP header */ uhlen + /* UDP header (if L2TP_ENCAPTYPE_UDP) */ session->hdr_len + /* L2TP header */ 2; /* 2 bytes for PPP_ALLSTATIONS & PPP_UI */ if (skb_cow_head(skb, headroom)) goto abort_put_sess; /* Setup PPP header */ __skb_push(skb, 2); skb->data[0] = PPP_ALLSTATIONS; skb->data[1] = PPP_UI; local_bh_disable(); l2tp_xmit_skb(session, skb); local_bh_enable(); l2tp_session_put(session); return 1; abort_put_sess: l2tp_session_put(session); abort: /* Free the original skb */ kfree_skb(skb); return 1; } /***************************************************************************** * Session (and tunnel control) socket create/destroy. *****************************************************************************/ /* Really kill the session socket. (Called from sock_put() if * refcnt == 0.) */ static void pppol2tp_session_destruct(struct sock *sk) { skb_queue_purge(&sk->sk_receive_queue); skb_queue_purge(&sk->sk_write_queue); } static void pppol2tp_session_close(struct l2tp_session *session) { struct pppol2tp_session *ps; ps = l2tp_session_priv(session); mutex_lock(&ps->sk_lock); ps->__sk = rcu_dereference_protected(ps->sk, lockdep_is_held(&ps->sk_lock)); RCU_INIT_POINTER(ps->sk, NULL); mutex_unlock(&ps->sk_lock); if (ps->__sk) { /* detach socket */ rcu_assign_sk_user_data(ps->__sk, NULL); sock_put(ps->__sk); /* drop ref taken when we referenced socket via sk_user_data */ l2tp_session_put(session); } } /* Called when the PPPoX socket (session) is closed. */ static int pppol2tp_release(struct socket *sock) { struct sock *sk = sock->sk; struct l2tp_session *session; int error; if (!sk) return 0; error = -EBADF; lock_sock(sk); if (sock_flag(sk, SOCK_DEAD) != 0) goto error; pppox_unbind_sock(sk); /* Signal the death of the socket. */ sk->sk_state = PPPOX_DEAD; sock_orphan(sk); sock->sk = NULL; session = pppol2tp_sock_to_session(sk); if (session) { l2tp_session_delete(session); /* drop ref taken by pppol2tp_sock_to_session */ l2tp_session_put(session); } release_sock(sk); sock_put(sk); return 0; error: release_sock(sk); return error; } static struct proto pppol2tp_sk_proto = { .name = "PPPOL2TP", .owner = THIS_MODULE, .obj_size = sizeof(struct pppox_sock), }; static int pppol2tp_backlog_recv(struct sock *sk, struct sk_buff *skb) { int rc; rc = l2tp_udp_encap_recv(sk, skb); if (rc) kfree_skb(skb); return NET_RX_SUCCESS; } /* socket() handler. Initialize a new struct sock. */ static int pppol2tp_create(struct net *net, struct socket *sock, int kern) { int error = -ENOMEM; struct sock *sk; sk = sk_alloc(net, PF_PPPOX, GFP_KERNEL, &pppol2tp_sk_proto, kern); if (!sk) goto out; sock_init_data(sock, sk); sock_set_flag(sk, SOCK_RCU_FREE); sock->state = SS_UNCONNECTED; sock->ops = &pppol2tp_ops; sk->sk_backlog_rcv = pppol2tp_backlog_recv; sk->sk_protocol = PX_PROTO_OL2TP; sk->sk_family = PF_PPPOX; sk->sk_state = PPPOX_NONE; sk->sk_type = SOCK_STREAM; sk->sk_destruct = pppol2tp_session_destruct; error = 0; out: return error; } static void pppol2tp_show(struct seq_file *m, void *arg) { struct l2tp_session *session = arg; struct sock *sk; rcu_read_lock(); sk = pppol2tp_session_get_sock(session); if (sk) { struct pppox_sock *po = pppox_sk(sk); seq_printf(m, " interface %s\n", ppp_dev_name(&po->chan)); } rcu_read_unlock(); } static void pppol2tp_session_init(struct l2tp_session *session) { struct pppol2tp_session *ps; session->recv_skb = pppol2tp_recv; session->session_close = pppol2tp_session_close; if (IS_ENABLED(CONFIG_L2TP_DEBUGFS)) session->show = pppol2tp_show; ps = l2tp_session_priv(session); mutex_init(&ps->sk_lock); ps->owner = current->pid; } struct l2tp_connect_info { u8 version; int fd; u32 tunnel_id; u32 peer_tunnel_id; u32 session_id; u32 peer_session_id; }; static int pppol2tp_sockaddr_get_info(const void *sa, int sa_len, struct l2tp_connect_info *info) { switch (sa_len) { case sizeof(struct sockaddr_pppol2tp): { const struct sockaddr_pppol2tp *sa_v2in4 = sa; if (sa_v2in4->sa_protocol != PX_PROTO_OL2TP) return -EINVAL; info->version = 2; info->fd = sa_v2in4->pppol2tp.fd; info->tunnel_id = sa_v2in4->pppol2tp.s_tunnel; info->peer_tunnel_id = sa_v2in4->pppol2tp.d_tunnel; info->session_id = sa_v2in4->pppol2tp.s_session; info->peer_session_id = sa_v2in4->pppol2tp.d_session; break; } case sizeof(struct sockaddr_pppol2tpv3): { const struct sockaddr_pppol2tpv3 *sa_v3in4 = sa; if (sa_v3in4->sa_protocol != PX_PROTO_OL2TP) return -EINVAL; info->version = 3; info->fd = sa_v3in4->pppol2tp.fd; info->tunnel_id = sa_v3in4->pppol2tp.s_tunnel; info->peer_tunnel_id = sa_v3in4->pppol2tp.d_tunnel; info->session_id = sa_v3in4->pppol2tp.s_session; info->peer_session_id = sa_v3in4->pppol2tp.d_session; break; } case sizeof(struct sockaddr_pppol2tpin6): { const struct sockaddr_pppol2tpin6 *sa_v2in6 = sa; if (sa_v2in6->sa_protocol != PX_PROTO_OL2TP) return -EINVAL; info->version = 2; info->fd = sa_v2in6->pppol2tp.fd; info->tunnel_id = sa_v2in6->pppol2tp.s_tunnel; info->peer_tunnel_id = sa_v2in6->pppol2tp.d_tunnel; info->session_id = sa_v2in6->pppol2tp.s_session; info->peer_session_id = sa_v2in6->pppol2tp.d_session; break; } case sizeof(struct sockaddr_pppol2tpv3in6): { const struct sockaddr_pppol2tpv3in6 *sa_v3in6 = sa; if (sa_v3in6->sa_protocol != PX_PROTO_OL2TP) return -EINVAL; info->version = 3; info->fd = sa_v3in6->pppol2tp.fd; info->tunnel_id = sa_v3in6->pppol2tp.s_tunnel; info->peer_tunnel_id = sa_v3in6->pppol2tp.d_tunnel; info->session_id = sa_v3in6->pppol2tp.s_session; info->peer_session_id = sa_v3in6->pppol2tp.d_session; break; } default: return -EINVAL; } return 0; } /* Rough estimation of the maximum payload size a tunnel can transmit without * fragmenting at the lower IP layer. Assumes L2TPv2 with sequence * numbers and no IP option. Not quite accurate, but the result is mostly * unused anyway. */ static int pppol2tp_tunnel_mtu(const struct l2tp_tunnel *tunnel) { int mtu; mtu = l2tp_tunnel_dst_mtu(tunnel); if (mtu <= PPPOL2TP_HEADER_OVERHEAD) return 1500 - PPPOL2TP_HEADER_OVERHEAD; return mtu - PPPOL2TP_HEADER_OVERHEAD; } static struct l2tp_tunnel *pppol2tp_tunnel_get(struct net *net, const struct l2tp_connect_info *info, bool *new_tunnel) { struct l2tp_tunnel *tunnel; int error; *new_tunnel = false; tunnel = l2tp_tunnel_get(net, info->tunnel_id); /* Special case: create tunnel context if session_id and * peer_session_id is 0. Otherwise look up tunnel using supplied * tunnel id. */ if (!info->session_id && !info->peer_session_id) { if (!tunnel) { struct l2tp_tunnel_cfg tcfg = { .encap = L2TP_ENCAPTYPE_UDP, }; /* Prevent l2tp_tunnel_register() from trying to set up * a kernel socket. */ if (info->fd < 0) return ERR_PTR(-EBADF); error = l2tp_tunnel_create(info->fd, info->version, info->tunnel_id, info->peer_tunnel_id, &tcfg, &tunnel); if (error < 0) return ERR_PTR(error); refcount_inc(&tunnel->ref_count); error = l2tp_tunnel_register(tunnel, net, &tcfg); if (error < 0) { kfree(tunnel); return ERR_PTR(error); } *new_tunnel = true; } } else { /* Error if we can't find the tunnel */ if (!tunnel) return ERR_PTR(-ENOENT); /* Error if socket is not prepped */ if (!tunnel->sock) { l2tp_tunnel_put(tunnel); return ERR_PTR(-ENOENT); } } return tunnel; } /* connect() handler. Attach a PPPoX socket to a tunnel UDP socket */ static int pppol2tp_connect(struct socket *sock, struct sockaddr_unsized *uservaddr, int sockaddr_len, int flags) { struct sock *sk = sock->sk; struct pppox_sock *po = pppox_sk(sk); struct l2tp_session *session = NULL; struct l2tp_connect_info info; struct l2tp_tunnel *tunnel; struct pppol2tp_session *ps; struct l2tp_session_cfg cfg = { 0, }; bool drop_refcnt = false; bool new_session = false; bool new_tunnel = false; int error; error = pppol2tp_sockaddr_get_info(uservaddr, sockaddr_len, &info); if (error < 0) return error; /* Don't bind if tunnel_id is 0 */ if (!info.tunnel_id) return -EINVAL; tunnel = pppol2tp_tunnel_get(sock_net(sk), &info, &new_tunnel); if (IS_ERR(tunnel)) return PTR_ERR(tunnel); lock_sock(sk); /* Check for already bound sockets */ error = -EBUSY; if (sk->sk_state & PPPOX_CONNECTED) goto end; /* We don't supporting rebinding anyway */ error = -EALREADY; if (sk->sk_user_data) goto end; /* socket is already attached */ if (tunnel->peer_tunnel_id == 0) tunnel->peer_tunnel_id = info.peer_tunnel_id; session = l2tp_session_get(sock_net(sk), tunnel->sock, tunnel->version, info.tunnel_id, info.session_id); if (session) { drop_refcnt = true; if (session->pwtype != L2TP_PWTYPE_PPP) { error = -EPROTOTYPE; goto end; } ps = l2tp_session_priv(session); /* Using a pre-existing session is fine as long as it hasn't * been connected yet. */ mutex_lock(&ps->sk_lock); if (rcu_dereference_protected(ps->sk, lockdep_is_held(&ps->sk_lock)) || ps->__sk) { mutex_unlock(&ps->sk_lock); error = -EEXIST; goto end; } } else { cfg.pw_type = L2TP_PWTYPE_PPP; session = l2tp_session_create(sizeof(struct pppol2tp_session), tunnel, info.session_id, info.peer_session_id, &cfg); if (IS_ERR(session)) { error = PTR_ERR(session); goto end; } drop_refcnt = true; pppol2tp_session_init(session); ps = l2tp_session_priv(session); refcount_inc(&session->ref_count); mutex_lock(&ps->sk_lock); error = l2tp_session_register(session, tunnel); if (error < 0) { mutex_unlock(&ps->sk_lock); l2tp_session_put(session); goto end; } new_session = true; } /* Special case: if source & dest session_id == 0x0000, this * socket is being created to manage the tunnel. Just set up * the internal context for use by ioctl() and sockopt() * handlers. */ if (session->session_id == 0 && session->peer_session_id == 0) { error = 0; goto out_no_ppp; } /* The only header we need to worry about is the L2TP * header. This size is different depending on whether * sequence numbers are enabled for the data channel. */ po->chan.hdrlen = PPPOL2TP_L2TP_HDR_SIZE_NOSEQ; po->chan.private = sk; po->chan.ops = &pppol2tp_chan_ops; po->chan.mtu = pppol2tp_tunnel_mtu(tunnel); po->chan.direct_xmit = true; error = ppp_register_net_channel(sock_net(sk), &po->chan); if (error) { mutex_unlock(&ps->sk_lock); goto end; } out_no_ppp: /* This is how we get the session context from the socket. */ sock_hold(sk); rcu_assign_sk_user_data(sk, session); rcu_assign_pointer(ps->sk, sk); mutex_unlock(&ps->sk_lock); /* Keep the reference we've grabbed on the session: sk doesn't expect * the session to disappear. pppol2tp_session_close() is responsible * for dropping it. */ drop_refcnt = false; sk->sk_state = PPPOX_CONNECTED; end: if (error) { if (new_session) l2tp_session_delete(session); if (new_tunnel) l2tp_tunnel_delete(tunnel); } if (drop_refcnt) l2tp_session_put(session); l2tp_tunnel_put(tunnel); release_sock(sk); return error; } #ifdef CONFIG_L2TP_V3 /* Called when creating sessions via the netlink interface. */ static int pppol2tp_session_create(struct net *net, struct l2tp_tunnel *tunnel, u32 session_id, u32 peer_session_id, struct l2tp_session_cfg *cfg) { int error; struct l2tp_session *session; /* Error if tunnel socket is not prepped */ if (!tunnel->sock) { error = -ENOENT; goto err; } /* Allocate and initialize a new session context. */ session = l2tp_session_create(sizeof(struct pppol2tp_session), tunnel, session_id, peer_session_id, cfg); if (IS_ERR(session)) { error = PTR_ERR(session); goto err; } pppol2tp_session_init(session); error = l2tp_session_register(session, tunnel); if (error < 0) goto err_sess; return 0; err_sess: l2tp_session_put(session); err: return error; } #endif /* CONFIG_L2TP_V3 */ /* getname() support. */ static int pppol2tp_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { int len = 0; int error = 0; struct l2tp_session *session; struct l2tp_tunnel *tunnel; struct sock *sk = sock->sk; struct inet_sock *inet; struct pppol2tp_session *pls; error = -ENOTCONN; if (!sk) goto end; if (!(sk->sk_state & PPPOX_CONNECTED)) goto end; error = -EBADF; session = pppol2tp_sock_to_session(sk); if (!session) goto end; pls = l2tp_session_priv(session); tunnel = session->tunnel; inet = inet_sk(tunnel->sock); if (tunnel->version == 2 && tunnel->sock->sk_family == AF_INET) { struct sockaddr_pppol2tp sp; len = sizeof(sp); memset(&sp, 0, len); sp.sa_family = AF_PPPOX; sp.sa_protocol = PX_PROTO_OL2TP; sp.pppol2tp.fd = tunnel->fd; sp.pppol2tp.pid = pls->owner; sp.pppol2tp.s_tunnel = tunnel->tunnel_id; sp.pppol2tp.d_tunnel = tunnel->peer_tunnel_id; sp.pppol2tp.s_session = session->session_id; sp.pppol2tp.d_session = session->peer_session_id; sp.pppol2tp.addr.sin_family = AF_INET; sp.pppol2tp.addr.sin_port = inet->inet_dport; sp.pppol2tp.addr.sin_addr.s_addr = inet->inet_daddr; memcpy(uaddr, &sp, len); #if IS_ENABLED(CONFIG_IPV6) } else if (tunnel->version == 2 && tunnel->sock->sk_family == AF_INET6) { struct sockaddr_pppol2tpin6 sp; len = sizeof(sp); memset(&sp, 0, len); sp.sa_family = AF_PPPOX; sp.sa_protocol = PX_PROTO_OL2TP; sp.pppol2tp.fd = tunnel->fd; sp.pppol2tp.pid = pls->owner; sp.pppol2tp.s_tunnel = tunnel->tunnel_id; sp.pppol2tp.d_tunnel = tunnel->peer_tunnel_id; sp.pppol2tp.s_session = session->session_id; sp.pppol2tp.d_session = session->peer_session_id; sp.pppol2tp.addr.sin6_family = AF_INET6; sp.pppol2tp.addr.sin6_port = inet->inet_dport; memcpy(&sp.pppol2tp.addr.sin6_addr, &tunnel->sock->sk_v6_daddr, sizeof(tunnel->sock->sk_v6_daddr)); memcpy(uaddr, &sp, len); } else if (tunnel->version == 3 && tunnel->sock->sk_family == AF_INET6) { struct sockaddr_pppol2tpv3in6 sp; len = sizeof(sp); memset(&sp, 0, len); sp.sa_family = AF_PPPOX; sp.sa_protocol = PX_PROTO_OL2TP; sp.pppol2tp.fd = tunnel->fd; sp.pppol2tp.pid = pls->owner; sp.pppol2tp.s_tunnel = tunnel->tunnel_id; sp.pppol2tp.d_tunnel = tunnel->peer_tunnel_id; sp.pppol2tp.s_session = session->session_id; sp.pppol2tp.d_session = session->peer_session_id; sp.pppol2tp.addr.sin6_family = AF_INET6; sp.pppol2tp.addr.sin6_port = inet->inet_dport; memcpy(&sp.pppol2tp.addr.sin6_addr, &tunnel->sock->sk_v6_daddr, sizeof(tunnel->sock->sk_v6_daddr)); memcpy(uaddr, &sp, len); #endif } else if (tunnel->version == 3) { struct sockaddr_pppol2tpv3 sp; len = sizeof(sp); memset(&sp, 0, len); sp.sa_family = AF_PPPOX; sp.sa_protocol = PX_PROTO_OL2TP; sp.pppol2tp.fd = tunnel->fd; sp.pppol2tp.pid = pls->owner; sp.pppol2tp.s_tunnel = tunnel->tunnel_id; sp.pppol2tp.d_tunnel = tunnel->peer_tunnel_id; sp.pppol2tp.s_session = session->session_id; sp.pppol2tp.d_session = session->peer_session_id; sp.pppol2tp.addr.sin_family = AF_INET; sp.pppol2tp.addr.sin_port = inet->inet_dport; sp.pppol2tp.addr.sin_addr.s_addr = inet->inet_daddr; memcpy(uaddr, &sp, len); } error = len; l2tp_session_put(session); end: return error; } /**************************************************************************** * ioctl() handlers. * * The PPPoX socket is created for L2TP sessions: tunnels have their own UDP * sockets. However, in order to control kernel tunnel features, we allow * userspace to create a special "tunnel" PPPoX socket which is used for * control only. Tunnel PPPoX sockets have session_id == 0 and simply allow * the user application to issue L2TP setsockopt(), getsockopt() and ioctl() * calls. ****************************************************************************/ static void pppol2tp_copy_stats(struct pppol2tp_ioc_stats *dest, const struct l2tp_stats *stats) { memset(dest, 0, sizeof(*dest)); dest->tx_packets = atomic_long_read(&stats->tx_packets); dest->tx_bytes = atomic_long_read(&stats->tx_bytes); dest->tx_errors = atomic_long_read(&stats->tx_errors); dest->rx_packets = atomic_long_read(&stats->rx_packets); dest->rx_bytes = atomic_long_read(&stats->rx_bytes); dest->rx_seq_discards = atomic_long_read(&stats->rx_seq_discards); dest->rx_oos_packets = atomic_long_read(&stats->rx_oos_packets); dest->rx_errors = atomic_long_read(&stats->rx_errors); } static int pppol2tp_tunnel_copy_stats(struct pppol2tp_ioc_stats *stats, struct l2tp_tunnel *tunnel) { struct l2tp_session *session; if (!stats->session_id) { pppol2tp_copy_stats(stats, &tunnel->stats); return 0; } /* If session_id is set, search the corresponding session in the * context of this tunnel and record the session's statistics. */ session = l2tp_session_get(tunnel->l2tp_net, tunnel->sock, tunnel->version, tunnel->tunnel_id, stats->session_id); if (!session) return -EBADR; if (session->pwtype != L2TP_PWTYPE_PPP) { l2tp_session_put(session); return -EBADR; } pppol2tp_copy_stats(stats, &session->stats); l2tp_session_put(session); return 0; } static int pppol2tp_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct pppol2tp_ioc_stats stats; struct l2tp_session *session; switch (cmd) { case PPPIOCGMRU: case PPPIOCGFLAGS: session = sock->sk->sk_user_data; if (!session) return -ENOTCONN; if (WARN_ON(session->magic != L2TP_SESSION_MAGIC)) return -EBADF; /* Not defined for tunnels */ if (!session->session_id && !session->peer_session_id) return -ENOSYS; if (put_user(0, (int __user *)arg)) return -EFAULT; break; case PPPIOCSMRU: case PPPIOCSFLAGS: session = sock->sk->sk_user_data; if (!session) return -ENOTCONN; if (WARN_ON(session->magic != L2TP_SESSION_MAGIC)) return -EBADF; /* Not defined for tunnels */ if (!session->session_id && !session->peer_session_id) return -ENOSYS; if (!access_ok((int __user *)arg, sizeof(int))) return -EFAULT; break; case PPPIOCGL2TPSTATS: session = sock->sk->sk_user_data; if (!session) return -ENOTCONN; if (WARN_ON(session->magic != L2TP_SESSION_MAGIC)) return -EBADF; /* Session 0 represents the parent tunnel */ if (!session->session_id && !session->peer_session_id) { u32 session_id; int err; if (copy_from_user(&stats, (void __user *)arg, sizeof(stats))) return -EFAULT; session_id = stats.session_id; err = pppol2tp_tunnel_copy_stats(&stats, session->tunnel); if (err < 0) return err; stats.session_id = session_id; } else { pppol2tp_copy_stats(&stats, &session->stats); stats.session_id = session->session_id; } stats.tunnel_id = session->tunnel->tunnel_id; stats.using_ipsec = l2tp_tunnel_uses_xfrm(session->tunnel); if (copy_to_user((void __user *)arg, &stats, sizeof(stats))) return -EFAULT; break; default: return -ENOIOCTLCMD; } return 0; } /***************************************************************************** * setsockopt() / getsockopt() support. * * The PPPoX socket is created for L2TP sessions: tunnels have their own UDP * sockets. In order to control kernel tunnel features, we allow userspace to * create a special "tunnel" PPPoX socket which is used for control only. * Tunnel PPPoX sockets have session_id == 0 and simply allow the user * application to issue L2TP setsockopt(), getsockopt() and ioctl() calls. *****************************************************************************/ /* Tunnel setsockopt() helper. */ static int pppol2tp_tunnel_setsockopt(struct sock *sk, struct l2tp_tunnel *tunnel, int optname, int val) { int err = 0; switch (optname) { case PPPOL2TP_SO_DEBUG: /* Tunnel debug flags option is deprecated */ break; default: err = -ENOPROTOOPT; break; } return err; } /* Session setsockopt helper. */ static int pppol2tp_session_setsockopt(struct sock *sk, struct l2tp_session *session, int optname, int val) { int err = 0; switch (optname) { case PPPOL2TP_SO_RECVSEQ: if (val != 0 && val != 1) { err = -EINVAL; break; } session->recv_seq = !!val; break; case PPPOL2TP_SO_SENDSEQ: if (val != 0 && val != 1) { err = -EINVAL; break; } session->send_seq = !!val; { struct pppox_sock *po = pppox_sk(sk); po->chan.hdrlen = val ? PPPOL2TP_L2TP_HDR_SIZE_SEQ : PPPOL2TP_L2TP_HDR_SIZE_NOSEQ; } l2tp_session_set_header_len(session, session->tunnel->version, session->tunnel->encap); break; case PPPOL2TP_SO_LNSMODE: if (val != 0 && val != 1) { err = -EINVAL; break; } session->lns_mode = !!val; break; case PPPOL2TP_SO_DEBUG: /* Session debug flags option is deprecated */ break; case PPPOL2TP_SO_REORDERTO: session->reorder_timeout = msecs_to_jiffies(val); break; default: err = -ENOPROTOOPT; break; } return err; } /* Main setsockopt() entry point. * Does API checks, then calls either the tunnel or session setsockopt * handler, according to whether the PPPoL2TP socket is a for a regular * session or the special tunnel type. */ static int pppol2tp_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct l2tp_session *session; struct l2tp_tunnel *tunnel; int val; int err; if (level != SOL_PPPOL2TP) return -EINVAL; if (optlen < sizeof(int)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(int))) return -EFAULT; err = -ENOTCONN; if (!sk->sk_user_data) goto end; /* Get session context from the socket */ err = -EBADF; session = pppol2tp_sock_to_session(sk); if (!session) goto end; /* Special case: if session_id == 0x0000, treat as operation on tunnel */ if (session->session_id == 0 && session->peer_session_id == 0) { tunnel = session->tunnel; err = pppol2tp_tunnel_setsockopt(sk, tunnel, optname, val); } else { err = pppol2tp_session_setsockopt(sk, session, optname, val); } l2tp_session_put(session); end: return err; } /* Tunnel getsockopt helper. Called with sock locked. */ static int pppol2tp_tunnel_getsockopt(struct sock *sk, struct l2tp_tunnel *tunnel, int optname, int *val) { int err = 0; switch (optname) { case PPPOL2TP_SO_DEBUG: /* Tunnel debug flags option is deprecated */ *val = 0; break; default: err = -ENOPROTOOPT; break; } return err; } /* Session getsockopt helper. Called with sock locked. */ static int pppol2tp_session_getsockopt(struct sock *sk, struct l2tp_session *session, int optname, int *val) { int err = 0; switch (optname) { case PPPOL2TP_SO_RECVSEQ: *val = session->recv_seq; break; case PPPOL2TP_SO_SENDSEQ: *val = session->send_seq; break; case PPPOL2TP_SO_LNSMODE: *val = session->lns_mode; break; case PPPOL2TP_SO_DEBUG: /* Session debug flags option is deprecated */ *val = 0; break; case PPPOL2TP_SO_REORDERTO: *val = (int)jiffies_to_msecs(session->reorder_timeout); break; default: err = -ENOPROTOOPT; } return err; } /* Main getsockopt() entry point. * Does API checks, then calls either the tunnel or session getsockopt * handler, according to whether the PPPoX socket is a for a regular session * or the special tunnel type. */ static int pppol2tp_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; struct l2tp_session *session; struct l2tp_tunnel *tunnel; int val, len; int err; if (level != SOL_PPPOL2TP) return -EINVAL; if (get_user(len, optlen)) return -EFAULT; if (len < 0) return -EINVAL; len = min_t(unsigned int, len, sizeof(int)); err = -ENOTCONN; if (!sk->sk_user_data) goto end; /* Get the session context */ err = -EBADF; session = pppol2tp_sock_to_session(sk); if (!session) goto end; /* Special case: if session_id == 0x0000, treat as operation on tunnel */ if (session->session_id == 0 && session->peer_session_id == 0) { tunnel = session->tunnel; err = pppol2tp_tunnel_getsockopt(sk, tunnel, optname, &val); if (err) goto end_put_sess; } else { err = pppol2tp_session_getsockopt(sk, session, optname, &val); if (err) goto end_put_sess; } err = -EFAULT; if (put_user(len, optlen)) goto end_put_sess; if (copy_to_user((void __user *)optval, &val, len)) goto end_put_sess; err = 0; end_put_sess: l2tp_session_put(session); end: return err; } /***************************************************************************** * /proc filesystem for debug * Since the original pppol2tp driver provided /proc/net/pppol2tp for * L2TPv2, we dump only L2TPv2 tunnels and sessions here. *****************************************************************************/ #ifdef CONFIG_PROC_FS struct pppol2tp_seq_data { struct seq_net_private p; unsigned long tkey; /* lookup key of current tunnel */ unsigned long skey; /* lookup key of current session */ struct l2tp_tunnel *tunnel; struct l2tp_session *session; /* NULL means get next tunnel */ }; static void pppol2tp_next_tunnel(struct net *net, struct pppol2tp_seq_data *pd) { /* Drop reference taken during previous invocation */ if (pd->tunnel) l2tp_tunnel_put(pd->tunnel); for (;;) { pd->tunnel = l2tp_tunnel_get_next(net, &pd->tkey); pd->tkey++; /* Only accept L2TPv2 tunnels */ if (!pd->tunnel || pd->tunnel->version == 2) return; l2tp_tunnel_put(pd->tunnel); } } static void pppol2tp_next_session(struct net *net, struct pppol2tp_seq_data *pd) { /* Drop reference taken during previous invocation */ if (pd->session) l2tp_session_put(pd->session); pd->session = l2tp_session_get_next(net, pd->tunnel->sock, pd->tunnel->version, pd->tunnel->tunnel_id, &pd->skey); pd->skey++; if (!pd->session) { pd->skey = 0; pppol2tp_next_tunnel(net, pd); } } static void *pppol2tp_seq_start(struct seq_file *m, loff_t *offs) { struct pppol2tp_seq_data *pd = SEQ_START_TOKEN; loff_t pos = *offs; struct net *net; if (!pos) goto out; if (WARN_ON(!m->private)) { pd = NULL; goto out; } pd = m->private; net = seq_file_net(m); if (!pd->tunnel) pppol2tp_next_tunnel(net, pd); else pppol2tp_next_session(net, pd); /* NULL tunnel and session indicates end of list */ if (!pd->tunnel && !pd->session) pd = NULL; out: return pd; } static void *pppol2tp_seq_next(struct seq_file *m, void *v, loff_t *pos) { (*pos)++; return NULL; } static void pppol2tp_seq_stop(struct seq_file *p, void *v) { struct pppol2tp_seq_data *pd = v; if (!pd || pd == SEQ_START_TOKEN) return; /* Drop reference taken by last invocation of pppol2tp_next_session() * or pppol2tp_next_tunnel(). */ if (pd->session) { l2tp_session_put(pd->session); pd->session = NULL; } if (pd->tunnel) { l2tp_tunnel_put(pd->tunnel); pd->tunnel = NULL; } } static void pppol2tp_seq_tunnel_show(struct seq_file *m, void *v) { struct l2tp_tunnel *tunnel = v; seq_printf(m, "\nTUNNEL '%s', %c %d\n", tunnel->name, tunnel->sock ? 'Y' : 'N', refcount_read(&tunnel->ref_count) - 1); seq_printf(m, " %08x %ld/%ld/%ld %ld/%ld/%ld\n", 0, atomic_long_read(&tunnel->stats.tx_packets), atomic_long_read(&tunnel->stats.tx_bytes), atomic_long_read(&tunnel->stats.tx_errors), atomic_long_read(&tunnel->stats.rx_packets), atomic_long_read(&tunnel->stats.rx_bytes), atomic_long_read(&tunnel->stats.rx_errors)); } static void pppol2tp_seq_session_show(struct seq_file *m, void *v) { struct l2tp_session *session = v; struct l2tp_tunnel *tunnel = session->tunnel; unsigned char state; char user_data_ok; struct sock *sk; u32 ip = 0; u16 port = 0; if (tunnel->sock) { struct inet_sock *inet = inet_sk(tunnel->sock); ip = ntohl(inet->inet_saddr); port = ntohs(inet->inet_sport); } rcu_read_lock(); sk = pppol2tp_session_get_sock(session); if (sk) { state = sk->sk_state; user_data_ok = (session == sk->sk_user_data) ? 'Y' : 'N'; } else { state = 0; user_data_ok = 'N'; } seq_printf(m, " SESSION '%s' %08X/%d %04X/%04X -> %04X/%04X %d %c\n", session->name, ip, port, tunnel->tunnel_id, session->session_id, tunnel->peer_tunnel_id, session->peer_session_id, state, user_data_ok); seq_printf(m, " 0/0/%c/%c/%s %08x %u\n", session->recv_seq ? 'R' : '-', session->send_seq ? 'S' : '-', session->lns_mode ? "LNS" : "LAC", 0, jiffies_to_msecs(session->reorder_timeout)); seq_printf(m, " %u/%u %ld/%ld/%ld %ld/%ld/%ld\n", session->nr, session->ns, atomic_long_read(&session->stats.tx_packets), atomic_long_read(&session->stats.tx_bytes), atomic_long_read(&session->stats.tx_errors), atomic_long_read(&session->stats.rx_packets), atomic_long_read(&session->stats.rx_bytes), atomic_long_read(&session->stats.rx_errors)); if (sk) { struct pppox_sock *po = pppox_sk(sk); seq_printf(m, " interface %s\n", ppp_dev_name(&po->chan)); } rcu_read_unlock(); } static int pppol2tp_seq_show(struct seq_file *m, void *v) { struct pppol2tp_seq_data *pd = v; /* display header on line 1 */ if (v == SEQ_START_TOKEN) { seq_puts(m, "PPPoL2TP driver info, " PPPOL2TP_DRV_VERSION "\n"); seq_puts(m, "TUNNEL name, user-data-ok session-count\n"); seq_puts(m, " debug tx-pkts/bytes/errs rx-pkts/bytes/errs\n"); seq_puts(m, " SESSION name, addr/port src-tid/sid dest-tid/sid state user-data-ok\n"); seq_puts(m, " mtu/mru/rcvseq/sendseq/lns debug reorderto\n"); seq_puts(m, " nr/ns tx-pkts/bytes/errs rx-pkts/bytes/errs\n"); goto out; } if (!pd->session) pppol2tp_seq_tunnel_show(m, pd->tunnel); else pppol2tp_seq_session_show(m, pd->session); out: return 0; } static const struct seq_operations pppol2tp_seq_ops = { .start = pppol2tp_seq_start, .next = pppol2tp_seq_next, .stop = pppol2tp_seq_stop, .show = pppol2tp_seq_show, }; #endif /* CONFIG_PROC_FS */ /***************************************************************************** * Network namespace *****************************************************************************/ static __net_init int pppol2tp_init_net(struct net *net) { struct proc_dir_entry *pde; int err = 0; pde = proc_create_net("pppol2tp", 0444, net->proc_net, &pppol2tp_seq_ops, sizeof(struct pppol2tp_seq_data)); if (!pde) { err = -ENOMEM; goto out; } out: return err; } static __net_exit void pppol2tp_exit_net(struct net *net) { remove_proc_entry("pppol2tp", net->proc_net); } static struct pernet_operations pppol2tp_net_ops = { .init = pppol2tp_init_net, .exit = pppol2tp_exit_net, }; /***************************************************************************** * Init and cleanup *****************************************************************************/ static const struct proto_ops pppol2tp_ops = { .family = AF_PPPOX, .owner = THIS_MODULE, .release = pppol2tp_release, .bind = sock_no_bind, .connect = pppol2tp_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = pppol2tp_getname, .poll = datagram_poll, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = pppol2tp_setsockopt, .getsockopt = pppol2tp_getsockopt, .sendmsg = pppol2tp_sendmsg, .recvmsg = pppol2tp_recvmsg, .mmap = sock_no_mmap, .ioctl = pppox_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = pppox_compat_ioctl, #endif }; static const struct pppox_proto pppol2tp_proto = { .create = pppol2tp_create, .ioctl = pppol2tp_ioctl, .owner = THIS_MODULE, }; #ifdef CONFIG_L2TP_V3 static const struct l2tp_nl_cmd_ops pppol2tp_nl_cmd_ops = { .session_create = pppol2tp_session_create, .session_delete = l2tp_session_delete, }; #endif /* CONFIG_L2TP_V3 */ static int __init pppol2tp_init(void) { int err; err = register_pernet_device(&pppol2tp_net_ops); if (err) goto out; err = proto_register(&pppol2tp_sk_proto, 0); if (err) goto out_unregister_pppol2tp_pernet; err = register_pppox_proto(PX_PROTO_OL2TP, &pppol2tp_proto); if (err) goto out_unregister_pppol2tp_proto; #ifdef CONFIG_L2TP_V3 err = l2tp_nl_register_ops(L2TP_PWTYPE_PPP, &pppol2tp_nl_cmd_ops); if (err) goto out_unregister_pppox; #endif pr_info("PPPoL2TP kernel driver, %s\n", PPPOL2TP_DRV_VERSION); out: return err; #ifdef CONFIG_L2TP_V3 out_unregister_pppox: unregister_pppox_proto(PX_PROTO_OL2TP); #endif out_unregister_pppol2tp_proto: proto_unregister(&pppol2tp_sk_proto); out_unregister_pppol2tp_pernet: unregister_pernet_device(&pppol2tp_net_ops); goto out; } static void __exit pppol2tp_exit(void) { #ifdef CONFIG_L2TP_V3 l2tp_nl_unregister_ops(L2TP_PWTYPE_PPP); #endif unregister_pppox_proto(PX_PROTO_OL2TP); proto_unregister(&pppol2tp_sk_proto); unregister_pernet_device(&pppol2tp_net_ops); } module_init(pppol2tp_init); module_exit(pppol2tp_exit); MODULE_AUTHOR("James Chapman <jchapman@katalix.com>"); MODULE_DESCRIPTION("PPP over L2TP over UDP"); MODULE_LICENSE("GPL"); MODULE_VERSION(PPPOL2TP_DRV_VERSION); MODULE_ALIAS_NET_PF_PROTO(PF_PPPOX, PX_PROTO_OL2TP); MODULE_ALIAS_L2TP_PWTYPE(7);
27 27 30 30 6 5 30 6 6 6 157 155 157 158 158 157 157 158 139 139 139 139 139 139 139 139 139 138 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 // SPDX-License-Identifier: GPL-2.0-only /* * IPv6 library code, needed by static components when full IPv6 support is * not configured or static. These functions are needed by GSO/GRO implementation. */ #include <linux/export.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/ip6_fib.h> #include <net/addrconf.h> #include <net/secure_seq.h> #include <linux/netfilter.h> static u32 __ipv6_select_ident(struct net *net, const struct in6_addr *dst, const struct in6_addr *src) { return get_random_u32_above(0); } /* This function exists only for tap drivers that must support broken * clients requesting UFO without specifying an IPv6 fragment ID. * * This is similar to ipv6_select_ident() but we use an independent hash * seed to limit information leakage. * * The network header must be set before calling this. */ __be32 ipv6_proxy_select_ident(struct net *net, struct sk_buff *skb) { struct in6_addr buf[2]; struct in6_addr *addrs; u32 id; addrs = skb_header_pointer(skb, skb_network_offset(skb) + offsetof(struct ipv6hdr, saddr), sizeof(buf), buf); if (!addrs) return 0; id = __ipv6_select_ident(net, &addrs[1], &addrs[0]); return htonl(id); } EXPORT_SYMBOL_GPL(ipv6_proxy_select_ident); __be32 ipv6_select_ident(struct net *net, const struct in6_addr *daddr, const struct in6_addr *saddr) { u32 id; id = __ipv6_select_ident(net, daddr, saddr); return htonl(id); } EXPORT_SYMBOL(ipv6_select_ident); int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr) { unsigned int offset = sizeof(struct ipv6hdr); unsigned int packet_len = skb_tail_pointer(skb) - skb_network_header(skb); int found_rhdr = 0; *nexthdr = &ipv6_hdr(skb)->nexthdr; while (offset <= packet_len) { struct ipv6_opt_hdr *exthdr; switch (**nexthdr) { case NEXTHDR_HOP: break; case NEXTHDR_ROUTING: found_rhdr = 1; break; case NEXTHDR_DEST: #if IS_ENABLED(CONFIG_IPV6_MIP6) if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0) break; #endif if (found_rhdr) return offset; break; default: return offset; } if (offset + sizeof(struct ipv6_opt_hdr) > packet_len) return -EINVAL; exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) + offset); offset += ipv6_optlen(exthdr); if (offset > IPV6_MAXPLEN) return -EINVAL; *nexthdr = &exthdr->nexthdr; } return -EINVAL; } EXPORT_SYMBOL(ip6_find_1stfragopt); #if IS_ENABLED(CONFIG_IPV6) int ip6_dst_hoplimit(struct dst_entry *dst) { int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT); rcu_read_lock(); if (hoplimit == 0) { struct net_device *dev = dst_dev_rcu(dst); struct inet6_dev *idev; idev = __in6_dev_get(dev); if (idev) hoplimit = READ_ONCE(idev->cnf.hop_limit); else hoplimit = READ_ONCE(dev_net(dev)->ipv6.devconf_all->hop_limit); } rcu_read_unlock(); return hoplimit; } EXPORT_SYMBOL(ip6_dst_hoplimit); #endif int __ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) { int len; len = skb->len - sizeof(struct ipv6hdr); if (len > IPV6_MAXPLEN) len = 0; ipv6_hdr(skb)->payload_len = htons(len); IP6CB(skb)->nhoff = offsetof(struct ipv6hdr, nexthdr); /* if egress device is enslaved to an L3 master device pass the * skb to its handler for processing */ skb = l3mdev_ip6_out(sk, skb); if (unlikely(!skb)) return 0; skb->protocol = htons(ETH_P_IPV6); return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb, NULL, skb_dst_dev(skb), dst_output); } EXPORT_SYMBOL_GPL(__ip6_local_out); int ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) { int err; err = __ip6_local_out(net, sk, skb); if (likely(err == 1)) err = dst_output(net, sk, skb); return err; } EXPORT_SYMBOL_GPL(ip6_local_out);
29 533 26 47 533 533 35 533 28 27 16 28 28 18 18 1 516 440 510 507 28 28 504 28 507 32 1 31 8 23 31 31 505 6 516 28 519 517 518 96 88 437 518 8 129 128 123 6 129 3 127 11 129 5 125 94 493 26 491 491 32 2 32 493 129 368 7 368 341 369 369 366 517 60 516 18 18 1 1 514 517 518 516 25 25 4 515 4 512 516 515 516 6 513 517 28 28 28 516 30 516 30 9 515 2 2 1 4 517 517 514 490 490 516 517 514 32 515 500 16 507 492 489 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/attr.c * * Copyright (C) 1991, 1992 Linus Torvalds * changes by Thomas Schoebel-Theuer */ #include <linux/export.h> #include <linux/time.h> #include <linux/mm.h> #include <linux/string.h> #include <linux/sched/signal.h> #include <linux/capability.h> #include <linux/fsnotify.h> #include <linux/fcntl.h> #include <linux/filelock.h> #include <linux/security.h> /** * setattr_should_drop_sgid - determine whether the setgid bit needs to be * removed * @idmap: idmap of the mount @inode was found from * @inode: inode to check * * This function determines whether the setgid bit needs to be removed. * We retain backwards compatibility and require setgid bit to be removed * unconditionally if S_IXGRP is set. Otherwise we have the exact same * requirements as setattr_prepare() and setattr_copy(). * * Return: ATTR_KILL_SGID if setgid bit needs to be removed, 0 otherwise. */ int setattr_should_drop_sgid(struct mnt_idmap *idmap, const struct inode *inode) { umode_t mode = inode->i_mode; if (!(mode & S_ISGID)) return 0; if (mode & S_IXGRP) return ATTR_KILL_SGID; if (!in_group_or_capable(idmap, inode, i_gid_into_vfsgid(idmap, inode))) return ATTR_KILL_SGID; return 0; } EXPORT_SYMBOL(setattr_should_drop_sgid); /** * setattr_should_drop_suidgid - determine whether the set{g,u}id bit needs to * be dropped * @idmap: idmap of the mount @inode was found from * @inode: inode to check * * This function determines whether the set{g,u}id bits need to be removed. * If the setuid bit needs to be removed ATTR_KILL_SUID is returned. If the * setgid bit needs to be removed ATTR_KILL_SGID is returned. If both * set{g,u}id bits need to be removed the corresponding mask of both flags is * returned. * * Return: A mask of ATTR_KILL_S{G,U}ID indicating which - if any - setid bits * to remove, 0 otherwise. */ int setattr_should_drop_suidgid(struct mnt_idmap *idmap, struct inode *inode) { umode_t mode = inode->i_mode; int kill = 0; /* suid always must be killed */ if (unlikely(mode & S_ISUID)) kill = ATTR_KILL_SUID; kill |= setattr_should_drop_sgid(idmap, inode); if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode))) return kill; return 0; } EXPORT_SYMBOL(setattr_should_drop_suidgid); /** * chown_ok - verify permissions to chown inode * @idmap: idmap of the mount @inode was found from * @inode: inode to check permissions on * @ia_vfsuid: uid to chown @inode to * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then * take care to map the inode according to @idmap before checking * permissions. On non-idmapped mounts or if permission checking is to be * performed on the raw inode simply pass @nop_mnt_idmap. */ static bool chown_ok(struct mnt_idmap *idmap, const struct inode *inode, vfsuid_t ia_vfsuid) { vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, inode); if (vfsuid_eq_kuid(vfsuid, current_fsuid()) && vfsuid_eq(ia_vfsuid, vfsuid)) return true; if (capable_wrt_inode_uidgid(idmap, inode, CAP_CHOWN)) return true; if (!vfsuid_valid(vfsuid) && ns_capable(inode->i_sb->s_user_ns, CAP_CHOWN)) return true; return false; } /** * chgrp_ok - verify permissions to chgrp inode * @idmap: idmap of the mount @inode was found from * @inode: inode to check permissions on * @ia_vfsgid: gid to chown @inode to * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then * take care to map the inode according to @idmap before checking * permissions. On non-idmapped mounts or if permission checking is to be * performed on the raw inode simply pass @nop_mnt_idmap. */ static bool chgrp_ok(struct mnt_idmap *idmap, const struct inode *inode, vfsgid_t ia_vfsgid) { vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode); vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, inode); if (vfsuid_eq_kuid(vfsuid, current_fsuid())) { if (vfsgid_eq(ia_vfsgid, vfsgid)) return true; if (vfsgid_in_group_p(ia_vfsgid)) return true; } if (capable_wrt_inode_uidgid(idmap, inode, CAP_CHOWN)) return true; if (!vfsgid_valid(vfsgid) && ns_capable(inode->i_sb->s_user_ns, CAP_CHOWN)) return true; return false; } /** * setattr_prepare - check if attribute changes to a dentry are allowed * @idmap: idmap of the mount the inode was found from * @dentry: dentry to check * @attr: attributes to change * * Check if we are allowed to change the attributes contained in @attr * in the given dentry. This includes the normal unix access permission * checks, as well as checks for rlimits and others. The function also clears * SGID bit from mode if user is not allowed to set it. Also file capabilities * and IMA extended attributes are cleared if ATTR_KILL_PRIV is set. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then * take care to map the inode according to @idmap before checking * permissions. On non-idmapped mounts or if permission checking is to be * performed on the raw inode simply pass @nop_mnt_idmap. * * Should be called as the first thing in ->setattr implementations, * possibly after taking additional locks. */ int setattr_prepare(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); unsigned int ia_valid = attr->ia_valid; /* * First check size constraints. These can't be overriden using * ATTR_FORCE. */ if (ia_valid & ATTR_SIZE) { int error = inode_newsize_ok(inode, attr->ia_size); if (error) return error; } /* If force is set do it anyway. */ if (ia_valid & ATTR_FORCE) goto kill_priv; /* Make sure a caller can chown. */ if ((ia_valid & ATTR_UID) && !chown_ok(idmap, inode, attr->ia_vfsuid)) return -EPERM; /* Make sure caller can chgrp. */ if ((ia_valid & ATTR_GID) && !chgrp_ok(idmap, inode, attr->ia_vfsgid)) return -EPERM; /* Make sure a caller can chmod. */ if (ia_valid & ATTR_MODE) { vfsgid_t vfsgid; if (!inode_owner_or_capable(idmap, inode)) return -EPERM; if (ia_valid & ATTR_GID) vfsgid = attr->ia_vfsgid; else vfsgid = i_gid_into_vfsgid(idmap, inode); /* Also check the setgid bit! */ if (!in_group_or_capable(idmap, inode, vfsgid)) attr->ia_mode &= ~S_ISGID; } /* Check for setting the inode time. */ if (ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)) { if (!inode_owner_or_capable(idmap, inode)) return -EPERM; } kill_priv: /* User has permission for the change */ if (ia_valid & ATTR_KILL_PRIV) { int error; error = security_inode_killpriv(idmap, dentry); if (error) return error; } return 0; } EXPORT_SYMBOL(setattr_prepare); /** * inode_newsize_ok - may this inode be truncated to a given size * @inode: the inode to be truncated * @offset: the new size to assign to the inode * * inode_newsize_ok must be called with i_rwsem held exclusively. * * inode_newsize_ok will check filesystem limits and ulimits to check that the * new inode size is within limits. inode_newsize_ok will also send SIGXFSZ * when necessary. Caller must not proceed with inode size change if failure is * returned. @inode must be a file (not directory), with appropriate * permissions to allow truncate (inode_newsize_ok does NOT check these * conditions). * * Return: 0 on success, -ve errno on failure */ int inode_newsize_ok(const struct inode *inode, loff_t offset) { if (offset < 0) return -EINVAL; if (inode->i_size < offset) { unsigned long limit; limit = rlimit(RLIMIT_FSIZE); if (limit != RLIM_INFINITY && offset > limit) goto out_sig; if (offset > inode->i_sb->s_maxbytes) goto out_big; } else { /* * truncation of in-use swapfiles is disallowed - it would * cause subsequent swapout to scribble on the now-freed * blocks. */ if (IS_SWAPFILE(inode)) return -ETXTBSY; } return 0; out_sig: send_sig(SIGXFSZ, current, 0); out_big: return -EFBIG; } EXPORT_SYMBOL(inode_newsize_ok); /** * setattr_copy_mgtime - update timestamps for mgtime inodes * @inode: inode timestamps to be updated * @attr: attrs for the update * * With multigrain timestamps, take more care to prevent races when * updating the ctime. Always update the ctime to the very latest using * the standard mechanism, and use that to populate the atime and mtime * appropriately (unless those are being set to specific values). */ static void setattr_copy_mgtime(struct inode *inode, const struct iattr *attr) { unsigned int ia_valid = attr->ia_valid; struct timespec64 now; if (ia_valid & ATTR_CTIME_SET) now = inode_set_ctime_deleg(inode, attr->ia_ctime); else if (ia_valid & ATTR_CTIME) now = inode_set_ctime_current(inode); else now = current_time(inode); if (ia_valid & ATTR_ATIME_SET) inode_set_atime_to_ts(inode, attr->ia_atime); else if (ia_valid & ATTR_ATIME) inode_set_atime_to_ts(inode, now); if (ia_valid & ATTR_MTIME_SET) inode_set_mtime_to_ts(inode, attr->ia_mtime); else if (ia_valid & ATTR_MTIME) inode_set_mtime_to_ts(inode, now); } /** * setattr_copy - copy simple metadata updates into the generic inode * @idmap: idmap of the mount the inode was found from * @inode: the inode to be updated * @attr: the new attributes * * setattr_copy must be called with i_rwsem held exclusively. * * setattr_copy updates the inode's metadata with that specified * in attr on idmapped mounts. Necessary permission checks to determine * whether or not the S_ISGID property needs to be removed are performed with * the correct idmapped mount permission helpers. * Noticeably missing is inode size update, which is more complex * as it requires pagecache updates. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then * take care to map the inode according to @idmap before checking * permissions. On non-idmapped mounts or if permission checking is to be * performed on the raw inode simply pass @nop_mnt_idmap. * * The inode is not marked as dirty after this operation. The rationale is * that for "simple" filesystems, the struct inode is the inode storage. * The caller is free to mark the inode dirty afterwards if needed. */ void setattr_copy(struct mnt_idmap *idmap, struct inode *inode, const struct iattr *attr) { unsigned int ia_valid = attr->ia_valid; i_uid_update(idmap, attr, inode); i_gid_update(idmap, attr, inode); if (ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; if (!in_group_or_capable(idmap, inode, i_gid_into_vfsgid(idmap, inode))) mode &= ~S_ISGID; inode->i_mode = mode; } if (is_mgtime(inode)) return setattr_copy_mgtime(inode, attr); if (ia_valid & ATTR_ATIME) inode_set_atime_to_ts(inode, attr->ia_atime); if (ia_valid & ATTR_MTIME) inode_set_mtime_to_ts(inode, attr->ia_mtime); if (ia_valid & ATTR_CTIME_SET) inode_set_ctime_deleg(inode, attr->ia_ctime); else if (ia_valid & ATTR_CTIME) inode_set_ctime_to_ts(inode, attr->ia_ctime); } EXPORT_SYMBOL(setattr_copy); int may_setattr(struct mnt_idmap *idmap, struct inode *inode, unsigned int ia_valid) { int error; if (ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_TIMES_SET)) { if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) return -EPERM; } /* * If utimes(2) and friends are called with times == NULL (or both * times are UTIME_NOW), then we need to check for write permission */ if (ia_valid & ATTR_TOUCH) { if (IS_IMMUTABLE(inode)) return -EPERM; if (!inode_owner_or_capable(idmap, inode)) { error = inode_permission(idmap, inode, MAY_WRITE); if (error) return error; } } return 0; } EXPORT_SYMBOL(may_setattr); /** * notify_change - modify attributes of a filesystem object * @idmap: idmap of the mount the inode was found from * @dentry: object affected * @attr: new attributes * @delegated_inode: returns inode, if the inode is delegated * * The caller must hold the i_rwsem exclusively on the affected object. * * If notify_change discovers a delegation in need of breaking, * it will return -EWOULDBLOCK and return a reference to the inode in * delegated_inode. The caller should then break the delegation and * retry. Because breaking a delegation may take a long time, the * caller should drop the i_rwsem before doing so. * * Alternatively, a caller may pass NULL for delegated_inode. This may * be appropriate for callers that expect the underlying filesystem not * to be NFS exported. Also, passing NULL is fine for callers holding * the file open for write, as there can be no conflicting delegation in * that case. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then * take care to map the inode according to @idmap before checking * permissions. On non-idmapped mounts or if permission checking is to be * performed on the raw inode simply pass @nop_mnt_idmap. */ int notify_change(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr, struct delegated_inode *delegated_inode) { struct inode *inode = dentry->d_inode; umode_t mode = inode->i_mode; int error; struct timespec64 now; unsigned int ia_valid = attr->ia_valid; WARN_ON_ONCE(!inode_is_locked(inode)); error = may_setattr(idmap, inode, ia_valid); if (error) return error; if ((ia_valid & ATTR_MODE)) { /* * Don't allow changing the mode of symlinks: * * (1) The vfs doesn't take the mode of symlinks into account * during permission checking. * (2) This has never worked correctly. Most major filesystems * did return EOPNOTSUPP due to interactions with POSIX ACLs * but did still updated the mode of the symlink. * This inconsistency led system call wrapper providers such * as libc to block changing the mode of symlinks with * EOPNOTSUPP already. * (3) To even do this in the first place one would have to use * specific file descriptors and quite some effort. */ if (S_ISLNK(inode->i_mode)) return -EOPNOTSUPP; /* Flag setting protected by i_rwsem */ if (is_sxid(attr->ia_mode)) inode->i_flags &= ~S_NOSEC; } now = current_time(inode); if (ia_valid & ATTR_ATIME_SET) attr->ia_atime = timestamp_truncate(attr->ia_atime, inode); else attr->ia_atime = now; if (ia_valid & ATTR_CTIME_SET) attr->ia_ctime = timestamp_truncate(attr->ia_ctime, inode); else attr->ia_ctime = now; if (ia_valid & ATTR_MTIME_SET) attr->ia_mtime = timestamp_truncate(attr->ia_mtime, inode); else attr->ia_mtime = now; if (ia_valid & ATTR_KILL_PRIV) { error = security_inode_need_killpriv(dentry); if (error < 0) return error; if (error == 0) ia_valid = attr->ia_valid &= ~ATTR_KILL_PRIV; } /* * We now pass ATTR_KILL_S*ID to the lower level setattr function so * that the function has the ability to reinterpret a mode change * that's due to these bits. This adds an implicit restriction that * no function will ever call notify_change with both ATTR_MODE and * ATTR_KILL_S*ID set. */ if ((ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID)) && (ia_valid & ATTR_MODE)) BUG(); if (ia_valid & ATTR_KILL_SUID) { if (mode & S_ISUID) { ia_valid = attr->ia_valid |= ATTR_MODE; attr->ia_mode = (inode->i_mode & ~S_ISUID); } } if (ia_valid & ATTR_KILL_SGID) { if (mode & S_ISGID) { if (!(ia_valid & ATTR_MODE)) { ia_valid = attr->ia_valid |= ATTR_MODE; attr->ia_mode = inode->i_mode; } attr->ia_mode &= ~S_ISGID; } } if (!(attr->ia_valid & ~(ATTR_KILL_SUID | ATTR_KILL_SGID))) return 0; /* * Verify that uid/gid changes are valid in the target * namespace of the superblock. */ if (ia_valid & ATTR_UID && !vfsuid_has_fsmapping(idmap, inode->i_sb->s_user_ns, attr->ia_vfsuid)) return -EOVERFLOW; if (ia_valid & ATTR_GID && !vfsgid_has_fsmapping(idmap, inode->i_sb->s_user_ns, attr->ia_vfsgid)) return -EOVERFLOW; /* Don't allow modifications of files with invalid uids or * gids unless those uids & gids are being made valid. */ if (!(ia_valid & ATTR_UID) && !vfsuid_valid(i_uid_into_vfsuid(idmap, inode))) return -EOVERFLOW; if (!(ia_valid & ATTR_GID) && !vfsgid_valid(i_gid_into_vfsgid(idmap, inode))) return -EOVERFLOW; error = security_inode_setattr(idmap, dentry, attr); if (error) return error; /* * If ATTR_DELEG is set, then these attributes are being set on * behalf of the holder of a write delegation. We want to avoid * breaking the delegation in this case. */ if (!(ia_valid & ATTR_DELEG)) { error = try_break_deleg(inode, delegated_inode); if (error) return error; } if (inode->i_op->setattr) error = inode->i_op->setattr(idmap, dentry, attr); else error = simple_setattr(idmap, dentry, attr); if (!error) { fsnotify_change(dentry, ia_valid); security_inode_post_setattr(idmap, dentry, ia_valid); } return error; } EXPORT_SYMBOL(notify_change);
1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* SCTP kernel implementation * (C) Copyright 2007 Hewlett-Packard Development Company, L.P. * * This file is part of the SCTP kernel implementation * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * Vlad Yasevich <vladislav.yasevich@hp.com> */ #ifndef __sctp_auth_h__ #define __sctp_auth_h__ #include <linux/list.h> #include <linux/refcount.h> struct sctp_endpoint; struct sctp_association; struct sctp_authkey; struct sctp_hmacalgo; /* Defines an HMAC algorithm supported by SCTP chunk authentication */ struct sctp_hmac { __u16 hmac_id; /* one of SCTP_AUTH_HMAC_ID_* */ __u16 hmac_len; /* length of the HMAC value in bytes */ }; /* This is generic structure that containst authentication bytes used * as keying material. It's a what is referred to as byte-vector all * over SCTP-AUTH */ struct sctp_auth_bytes { refcount_t refcnt; __u32 len; __u8 data[]; }; /* Definition for a shared key, weather endpoint or association */ struct sctp_shared_key { struct list_head key_list; struct sctp_auth_bytes *key; refcount_t refcnt; __u16 key_id; __u8 deactivated; }; #define key_for_each(__key, __list_head) \ list_for_each_entry(__key, __list_head, key_list) #define key_for_each_safe(__key, __tmp, __list_head) \ list_for_each_entry_safe(__key, __tmp, __list_head, key_list) static inline void sctp_auth_key_hold(struct sctp_auth_bytes *key) { if (!key) return; refcount_inc(&key->refcnt); } void sctp_auth_key_put(struct sctp_auth_bytes *key); struct sctp_shared_key *sctp_auth_shkey_create(__u16 key_id, gfp_t gfp); void sctp_auth_destroy_keys(struct list_head *keys); int sctp_auth_asoc_init_active_key(struct sctp_association *asoc, gfp_t gfp); struct sctp_shared_key *sctp_auth_get_shkey( const struct sctp_association *asoc, __u16 key_id); int sctp_auth_asoc_copy_shkeys(const struct sctp_endpoint *ep, struct sctp_association *asoc, gfp_t gfp); const struct sctp_hmac *sctp_auth_get_hmac(__u16 hmac_id); const struct sctp_hmac * sctp_auth_asoc_get_hmac(const struct sctp_association *asoc); void sctp_auth_asoc_set_default_hmac(struct sctp_association *asoc, struct sctp_hmac_algo_param *hmacs); int sctp_auth_asoc_verify_hmac_id(const struct sctp_association *asoc, __be16 hmac_id); int sctp_auth_send_cid(enum sctp_cid chunk, const struct sctp_association *asoc); int sctp_auth_recv_cid(enum sctp_cid chunk, const struct sctp_association *asoc); void sctp_auth_calculate_hmac(const struct sctp_association *asoc, struct sk_buff *skb, struct sctp_auth_chunk *auth, struct sctp_shared_key *ep_key, gfp_t gfp); void sctp_auth_shkey_release(struct sctp_shared_key *sh_key); void sctp_auth_shkey_hold(struct sctp_shared_key *sh_key); /* API Helpers */ int sctp_auth_ep_add_chunkid(struct sctp_endpoint *ep, __u8 chunk_id); int sctp_auth_ep_set_hmacs(struct sctp_endpoint *ep, struct sctp_hmacalgo *hmacs); int sctp_auth_set_key(struct sctp_endpoint *ep, struct sctp_association *asoc, struct sctp_authkey *auth_key); int sctp_auth_set_active_key(struct sctp_endpoint *ep, struct sctp_association *asoc, __u16 key_id); int sctp_auth_del_key_id(struct sctp_endpoint *ep, struct sctp_association *asoc, __u16 key_id); int sctp_auth_deact_key_id(struct sctp_endpoint *ep, struct sctp_association *asoc, __u16 key_id); int sctp_auth_init(struct sctp_endpoint *ep, gfp_t gfp); void sctp_auth_free(struct sctp_endpoint *ep); #endif
63 63 62 65 65 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 // SPDX-License-Identifier: GPL-2.0+ /** * DOC: vkms (Virtual Kernel Modesetting) * * VKMS is a software-only model of a KMS driver that is useful for testing * and for running X (or similar) on headless machines. VKMS aims to enable * a virtual display with no need of a hardware display capability, releasing * the GPU in DRM API tests. */ #include <linux/module.h> #include <linux/device/faux.h> #include <linux/dma-mapping.h> #include <drm/clients/drm_client_setup.h> #include <drm/drm_gem.h> #include <drm/drm_atomic.h> #include <drm/drm_atomic_helper.h> #include <drm/drm_drv.h> #include <drm/drm_fbdev_shmem.h> #include <drm/drm_file.h> #include <drm/drm_gem_framebuffer_helper.h> #include <drm/drm_ioctl.h> #include <drm/drm_managed.h> #include <drm/drm_print.h> #include <drm/drm_probe_helper.h> #include <drm/drm_gem_shmem_helper.h> #include <drm/drm_vblank.h> #include "vkms_config.h" #include "vkms_configfs.h" #include "vkms_drv.h" #define DRIVER_NAME "vkms" #define DRIVER_DESC "Virtual Kernel Mode Setting" #define DRIVER_MAJOR 1 #define DRIVER_MINOR 0 static struct vkms_config *default_config; static bool enable_cursor = true; module_param_named(enable_cursor, enable_cursor, bool, 0444); MODULE_PARM_DESC(enable_cursor, "Enable/Disable cursor support"); static bool enable_writeback = true; module_param_named(enable_writeback, enable_writeback, bool, 0444); MODULE_PARM_DESC(enable_writeback, "Enable/Disable writeback connector support"); static bool enable_overlay; module_param_named(enable_overlay, enable_overlay, bool, 0444); MODULE_PARM_DESC(enable_overlay, "Enable/Disable overlay support"); static bool enable_plane_pipeline; module_param_named(enable_plane_pipeline, enable_plane_pipeline, bool, 0444); MODULE_PARM_DESC(enable_plane_pipeline, "Enable/Disable plane pipeline support"); static bool create_default_dev = true; module_param_named(create_default_dev, create_default_dev, bool, 0444); MODULE_PARM_DESC(create_default_dev, "Create or not the default VKMS device"); DEFINE_DRM_GEM_FOPS(vkms_driver_fops); static void vkms_atomic_commit_tail(struct drm_atomic_state *old_state) { struct drm_device *dev = old_state->dev; struct drm_crtc *crtc; struct drm_crtc_state *old_crtc_state; int i; drm_atomic_helper_commit_modeset_disables(dev, old_state); drm_atomic_helper_commit_planes(dev, old_state, 0); drm_atomic_helper_commit_modeset_enables(dev, old_state); drm_atomic_helper_fake_vblank(old_state); drm_atomic_helper_commit_hw_done(old_state); drm_atomic_helper_wait_for_flip_done(dev, old_state); for_each_old_crtc_in_state(old_state, crtc, old_crtc_state, i) { struct vkms_crtc_state *vkms_state = to_vkms_crtc_state(old_crtc_state); flush_work(&vkms_state->composer_work); } drm_atomic_helper_cleanup_planes(dev, old_state); } static const struct drm_driver vkms_driver = { .driver_features = DRIVER_MODESET | DRIVER_ATOMIC | DRIVER_GEM, .fops = &vkms_driver_fops, DRM_GEM_SHMEM_DRIVER_OPS, DRM_FBDEV_SHMEM_DRIVER_OPS, .name = DRIVER_NAME, .desc = DRIVER_DESC, .major = DRIVER_MAJOR, .minor = DRIVER_MINOR, }; static int vkms_atomic_check(struct drm_device *dev, struct drm_atomic_state *state) { struct drm_crtc *crtc; struct drm_crtc_state *new_crtc_state; int i; for_each_new_crtc_in_state(state, crtc, new_crtc_state, i) { if (!new_crtc_state->gamma_lut || !new_crtc_state->color_mgmt_changed) continue; if (new_crtc_state->gamma_lut->length / sizeof(struct drm_color_lut *) > VKMS_LUT_SIZE) return -EINVAL; } return drm_atomic_helper_check(dev, state); } static const struct drm_mode_config_funcs vkms_mode_funcs = { .fb_create = drm_gem_fb_create, .atomic_check = vkms_atomic_check, .atomic_commit = drm_atomic_helper_commit, }; static const struct drm_mode_config_helper_funcs vkms_mode_config_helpers = { .atomic_commit_tail = vkms_atomic_commit_tail, }; static int vkms_modeset_init(struct vkms_device *vkmsdev) { struct drm_device *dev = &vkmsdev->drm; int ret; ret = drmm_mode_config_init(dev); if (ret) return ret; dev->mode_config.funcs = &vkms_mode_funcs; dev->mode_config.min_width = XRES_MIN; dev->mode_config.min_height = YRES_MIN; dev->mode_config.max_width = XRES_MAX; dev->mode_config.max_height = YRES_MAX; dev->mode_config.cursor_width = 512; dev->mode_config.cursor_height = 512; /* * FIXME: There's a confusion between bpp and depth between this and * fbdev helpers. We have to go with 0, meaning "pick the default", * which is XRGB8888 in all cases. */ dev->mode_config.preferred_depth = 0; dev->mode_config.helper_private = &vkms_mode_config_helpers; return vkms_output_init(vkmsdev); } int vkms_create(struct vkms_config *config) { int ret; struct faux_device *fdev; struct vkms_device *vkms_device; const char *dev_name; dev_name = vkms_config_get_device_name(config); fdev = faux_device_create(dev_name, NULL, NULL); if (!fdev) return -ENODEV; if (!devres_open_group(&fdev->dev, NULL, GFP_KERNEL)) { ret = -ENOMEM; goto out_unregister; } vkms_device = devm_drm_dev_alloc(&fdev->dev, &vkms_driver, struct vkms_device, drm); if (IS_ERR(vkms_device)) { ret = PTR_ERR(vkms_device); goto out_devres; } vkms_device->faux_dev = fdev; vkms_device->config = config; config->dev = vkms_device; ret = dma_coerce_mask_and_coherent(vkms_device->drm.dev, DMA_BIT_MASK(64)); if (ret) { DRM_ERROR("Could not initialize DMA support\n"); goto out_devres; } ret = drm_vblank_init(&vkms_device->drm, vkms_config_get_num_crtcs(config)); if (ret) { DRM_ERROR("Failed to vblank\n"); goto out_devres; } ret = vkms_modeset_init(vkms_device); if (ret) goto out_devres; vkms_config_register_debugfs(vkms_device); ret = drm_dev_register(&vkms_device->drm, 0); if (ret) goto out_devres; drm_client_setup(&vkms_device->drm, NULL); return 0; out_devres: devres_release_group(&fdev->dev, NULL); out_unregister: faux_device_destroy(fdev); return ret; } static int __init vkms_init(void) { int ret; struct vkms_config *config; ret = vkms_configfs_register(); if (ret) return ret; if (!create_default_dev) return 0; config = vkms_config_default_create(enable_cursor, enable_writeback, enable_overlay, enable_plane_pipeline); if (IS_ERR(config)) return PTR_ERR(config); ret = vkms_create(config); if (ret) { vkms_config_destroy(config); return ret; } default_config = config; return 0; } void vkms_destroy(struct vkms_config *config) { struct faux_device *fdev; if (!config->dev) { DRM_INFO("vkms_device is NULL.\n"); return; } fdev = config->dev->faux_dev; drm_colorop_pipeline_destroy(&config->dev->drm); drm_dev_unregister(&config->dev->drm); drm_atomic_helper_shutdown(&config->dev->drm); devres_release_group(&fdev->dev, NULL); faux_device_destroy(fdev); config->dev = NULL; } static void __exit vkms_exit(void) { vkms_configfs_unregister(); if (!default_config) return; vkms_destroy(default_config); vkms_config_destroy(default_config); } module_init(vkms_init); module_exit(vkms_exit); MODULE_AUTHOR("Haneen Mohammed <hamohammed.sa@gmail.com>"); MODULE_AUTHOR("Rodrigo Siqueira <rodrigosiqueiramelo@gmail.com>"); MODULE_DESCRIPTION(DRIVER_DESC); MODULE_LICENSE("GPL");
6 6 6 1 1 6 3 3 1 1 1 8 7 7 6 6 6 5 3 3 3 3 3 7 8 3 3 3 3 3 1 1 1 6 3 6 6 5 5 5 5 3 3 3 3 6 7 3 3 486 1 486 486 486 2 2 2 486 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 /* * Mapping of UID/GIDs to name and vice versa. * * Copyright (c) 2002, 2003 The Regents of the University of * Michigan. All rights reserved. * * Marius Aamodt Eriksen <marius@umich.edu> * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include <linux/module.h> #include <linux/seq_file.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/sunrpc/svc_xprt.h> #include <net/net_namespace.h> #include "idmap.h" #include "nfsd.h" #include "netns.h" #include "vfs.h" /* * Turn off idmapping when using AUTH_SYS. */ static bool nfs4_disable_idmapping = true; module_param(nfs4_disable_idmapping, bool, 0644); MODULE_PARM_DESC(nfs4_disable_idmapping, "Turn off server's NFSv4 idmapping when using 'sec=sys'"); /* * Cache entry */ /* * XXX we know that IDMAP_NAMESZ < PAGE_SIZE, but it's ugly to rely on * that. */ struct ent { struct cache_head h; int type; /* User / Group */ u32 id; char name[IDMAP_NAMESZ]; char authname[IDMAP_NAMESZ]; struct rcu_head rcu_head; }; /* Common entry handling */ #define ENT_HASHBITS 8 #define ENT_HASHMAX (1 << ENT_HASHBITS) static void ent_init(struct cache_head *cnew, struct cache_head *citm) { struct ent *new = container_of(cnew, struct ent, h); struct ent *itm = container_of(citm, struct ent, h); new->id = itm->id; new->type = itm->type; strscpy(new->name, itm->name, sizeof(new->name)); strscpy(new->authname, itm->authname, sizeof(new->authname)); } static void ent_put(struct kref *ref) { struct ent *map = container_of(ref, struct ent, h.ref); kfree_rcu(map, rcu_head); } static struct cache_head * ent_alloc(void) { struct ent *e = kmalloc(sizeof(*e), GFP_KERNEL); if (e) return &e->h; else return NULL; } /* * ID -> Name cache */ static uint32_t idtoname_hash(struct ent *ent) { uint32_t hash; hash = hash_str(ent->authname, ENT_HASHBITS); hash = hash_long(hash ^ ent->id, ENT_HASHBITS); /* Flip LSB for user/group */ if (ent->type == IDMAP_TYPE_GROUP) hash ^= 1; return hash; } static int idtoname_upcall(struct cache_detail *cd, struct cache_head *h) { return sunrpc_cache_pipe_upcall_timeout(cd, h); } static void idtoname_request(struct cache_detail *cd, struct cache_head *ch, char **bpp, int *blen) { struct ent *ent = container_of(ch, struct ent, h); char idstr[11]; qword_add(bpp, blen, ent->authname); snprintf(idstr, sizeof(idstr), "%u", ent->id); qword_add(bpp, blen, ent->type == IDMAP_TYPE_GROUP ? "group" : "user"); qword_add(bpp, blen, idstr); (*bpp)[-1] = '\n'; } static int idtoname_match(struct cache_head *ca, struct cache_head *cb) { struct ent *a = container_of(ca, struct ent, h); struct ent *b = container_of(cb, struct ent, h); return (a->id == b->id && a->type == b->type && strcmp(a->authname, b->authname) == 0); } static int idtoname_show(struct seq_file *m, struct cache_detail *cd, struct cache_head *h) { struct ent *ent; if (h == NULL) { seq_puts(m, "#domain type id [name]\n"); return 0; } ent = container_of(h, struct ent, h); seq_printf(m, "%s %s %u", ent->authname, ent->type == IDMAP_TYPE_GROUP ? "group" : "user", ent->id); if (test_bit(CACHE_VALID, &h->flags)) seq_printf(m, " %s", ent->name); seq_putc(m, '\n'); return 0; } static void warn_no_idmapd(struct cache_detail *detail, int has_died) { printk("nfsd: nfsv4 idmapping failing: has idmapd %s?\n", has_died ? "died" : "not been started"); } static int idtoname_parse(struct cache_detail *, char *, int); static struct ent *idtoname_lookup(struct cache_detail *, struct ent *); static struct ent *idtoname_update(struct cache_detail *, struct ent *, struct ent *); static const struct cache_detail idtoname_cache_template = { .owner = THIS_MODULE, .hash_size = ENT_HASHMAX, .name = "nfs4.idtoname", .cache_put = ent_put, .cache_upcall = idtoname_upcall, .cache_request = idtoname_request, .cache_parse = idtoname_parse, .cache_show = idtoname_show, .warn_no_listener = warn_no_idmapd, .match = idtoname_match, .init = ent_init, .update = ent_init, .alloc = ent_alloc, }; static int idtoname_parse(struct cache_detail *cd, char *buf, int buflen) { struct ent ent, *res; char *buf1, *bp; int len; int error = -EINVAL; if (buf[buflen - 1] != '\n') return (-EINVAL); buf[buflen - 1]= '\0'; buf1 = kmalloc(PAGE_SIZE, GFP_KERNEL); if (buf1 == NULL) return (-ENOMEM); memset(&ent, 0, sizeof(ent)); /* Authentication name */ len = qword_get(&buf, buf1, PAGE_SIZE); if (len <= 0 || len >= IDMAP_NAMESZ) goto out; memcpy(ent.authname, buf1, sizeof(ent.authname)); /* Type */ if (qword_get(&buf, buf1, PAGE_SIZE) <= 0) goto out; ent.type = strcmp(buf1, "user") == 0 ? IDMAP_TYPE_USER : IDMAP_TYPE_GROUP; /* ID */ if (qword_get(&buf, buf1, PAGE_SIZE) <= 0) goto out; ent.id = simple_strtoul(buf1, &bp, 10); if (bp == buf1) goto out; /* expiry */ error = get_expiry(&buf, &ent.h.expiry_time); if (error) goto out; error = -ENOMEM; res = idtoname_lookup(cd, &ent); if (!res) goto out; /* Name */ error = -EINVAL; len = qword_get(&buf, buf1, PAGE_SIZE); if (len < 0 || len >= IDMAP_NAMESZ) goto out; if (len == 0) set_bit(CACHE_NEGATIVE, &ent.h.flags); else memcpy(ent.name, buf1, sizeof(ent.name)); error = -ENOMEM; res = idtoname_update(cd, &ent, res); if (res == NULL) goto out; cache_put(&res->h, cd); error = 0; out: kfree(buf1); return error; } static struct ent * idtoname_lookup(struct cache_detail *cd, struct ent *item) { struct cache_head *ch = sunrpc_cache_lookup_rcu(cd, &item->h, idtoname_hash(item)); if (ch) return container_of(ch, struct ent, h); else return NULL; } static struct ent * idtoname_update(struct cache_detail *cd, struct ent *new, struct ent *old) { struct cache_head *ch = sunrpc_cache_update(cd, &new->h, &old->h, idtoname_hash(new)); if (ch) return container_of(ch, struct ent, h); else return NULL; } /* * Name -> ID cache */ static inline int nametoid_hash(struct ent *ent) { return hash_str(ent->name, ENT_HASHBITS); } static int nametoid_upcall(struct cache_detail *cd, struct cache_head *h) { return sunrpc_cache_pipe_upcall_timeout(cd, h); } static void nametoid_request(struct cache_detail *cd, struct cache_head *ch, char **bpp, int *blen) { struct ent *ent = container_of(ch, struct ent, h); qword_add(bpp, blen, ent->authname); qword_add(bpp, blen, ent->type == IDMAP_TYPE_GROUP ? "group" : "user"); qword_add(bpp, blen, ent->name); (*bpp)[-1] = '\n'; } static int nametoid_match(struct cache_head *ca, struct cache_head *cb) { struct ent *a = container_of(ca, struct ent, h); struct ent *b = container_of(cb, struct ent, h); return (a->type == b->type && strcmp(a->name, b->name) == 0 && strcmp(a->authname, b->authname) == 0); } static int nametoid_show(struct seq_file *m, struct cache_detail *cd, struct cache_head *h) { struct ent *ent; if (h == NULL) { seq_puts(m, "#domain type name [id]\n"); return 0; } ent = container_of(h, struct ent, h); seq_printf(m, "%s %s %s", ent->authname, ent->type == IDMAP_TYPE_GROUP ? "group" : "user", ent->name); if (test_bit(CACHE_VALID, &h->flags)) seq_printf(m, " %u", ent->id); seq_putc(m, '\n'); return 0; } static struct ent *nametoid_lookup(struct cache_detail *, struct ent *); static struct ent *nametoid_update(struct cache_detail *, struct ent *, struct ent *); static int nametoid_parse(struct cache_detail *, char *, int); static const struct cache_detail nametoid_cache_template = { .owner = THIS_MODULE, .hash_size = ENT_HASHMAX, .name = "nfs4.nametoid", .cache_put = ent_put, .cache_upcall = nametoid_upcall, .cache_request = nametoid_request, .cache_parse = nametoid_parse, .cache_show = nametoid_show, .warn_no_listener = warn_no_idmapd, .match = nametoid_match, .init = ent_init, .update = ent_init, .alloc = ent_alloc, }; static int nametoid_parse(struct cache_detail *cd, char *buf, int buflen) { struct ent ent, *res; char *buf1; int len, error = -EINVAL; if (buf[buflen - 1] != '\n') return (-EINVAL); buf[buflen - 1]= '\0'; buf1 = kmalloc(PAGE_SIZE, GFP_KERNEL); if (buf1 == NULL) return (-ENOMEM); memset(&ent, 0, sizeof(ent)); /* Authentication name */ len = qword_get(&buf, buf1, PAGE_SIZE); if (len <= 0 || len >= IDMAP_NAMESZ) goto out; memcpy(ent.authname, buf1, sizeof(ent.authname)); /* Type */ if (qword_get(&buf, buf1, PAGE_SIZE) <= 0) goto out; ent.type = strcmp(buf1, "user") == 0 ? IDMAP_TYPE_USER : IDMAP_TYPE_GROUP; /* Name */ len = qword_get(&buf, buf1, PAGE_SIZE); if (len <= 0 || len >= IDMAP_NAMESZ) goto out; memcpy(ent.name, buf1, sizeof(ent.name)); /* expiry */ error = get_expiry(&buf, &ent.h.expiry_time); if (error) goto out; /* ID */ error = get_int(&buf, &ent.id); if (error == -EINVAL) goto out; if (error == -ENOENT) set_bit(CACHE_NEGATIVE, &ent.h.flags); error = -ENOMEM; res = nametoid_lookup(cd, &ent); if (res == NULL) goto out; res = nametoid_update(cd, &ent, res); if (res == NULL) goto out; cache_put(&res->h, cd); error = 0; out: kfree(buf1); return (error); } static struct ent * nametoid_lookup(struct cache_detail *cd, struct ent *item) { struct cache_head *ch = sunrpc_cache_lookup_rcu(cd, &item->h, nametoid_hash(item)); if (ch) return container_of(ch, struct ent, h); else return NULL; } static struct ent * nametoid_update(struct cache_detail *cd, struct ent *new, struct ent *old) { struct cache_head *ch = sunrpc_cache_update(cd, &new->h, &old->h, nametoid_hash(new)); if (ch) return container_of(ch, struct ent, h); else return NULL; } /* * Exported API */ int nfsd_idmap_init(struct net *net) { int rv; struct nfsd_net *nn = net_generic(net, nfsd_net_id); nn->idtoname_cache = cache_create_net(&idtoname_cache_template, net); if (IS_ERR(nn->idtoname_cache)) return PTR_ERR(nn->idtoname_cache); rv = cache_register_net(nn->idtoname_cache, net); if (rv) goto destroy_idtoname_cache; nn->nametoid_cache = cache_create_net(&nametoid_cache_template, net); if (IS_ERR(nn->nametoid_cache)) { rv = PTR_ERR(nn->nametoid_cache); goto unregister_idtoname_cache; } rv = cache_register_net(nn->nametoid_cache, net); if (rv) goto destroy_nametoid_cache; return 0; destroy_nametoid_cache: cache_destroy_net(nn->nametoid_cache, net); unregister_idtoname_cache: cache_unregister_net(nn->idtoname_cache, net); destroy_idtoname_cache: cache_destroy_net(nn->idtoname_cache, net); return rv; } void nfsd_idmap_shutdown(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); cache_unregister_net(nn->idtoname_cache, net); cache_unregister_net(nn->nametoid_cache, net); cache_destroy_net(nn->idtoname_cache, net); cache_destroy_net(nn->nametoid_cache, net); } static int idmap_lookup(struct svc_rqst *rqstp, struct ent *(*lookup_fn)(struct cache_detail *, struct ent *), struct ent *key, struct cache_detail *detail, struct ent **item) { int ret; *item = lookup_fn(detail, key); if (!*item) return -ENOMEM; retry: ret = cache_check(detail, &(*item)->h, &rqstp->rq_chandle); if (ret == -ETIMEDOUT) { struct ent *prev_item = *item; *item = lookup_fn(detail, key); if (*item != prev_item) goto retry; cache_put(&(*item)->h, detail); } return ret; } static char * rqst_authname(struct svc_rqst *rqstp) { struct auth_domain *clp; clp = rqstp->rq_gssclient ? rqstp->rq_gssclient : rqstp->rq_client; return clp->name; } static __be32 idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, u32 *id) { struct ent *item, key = { .type = type, }; int ret; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); if (namelen + 1 > sizeof(key.name)) return nfserr_badowner; memcpy(key.name, name, namelen); key.name[namelen] = '\0'; strscpy(key.authname, rqst_authname(rqstp), sizeof(key.authname)); ret = idmap_lookup(rqstp, nametoid_lookup, &key, nn->nametoid_cache, &item); if (ret == -ENOENT) return nfserr_badowner; if (ret) return nfserrno(ret); *id = item->id; cache_put(&item->h, nn->nametoid_cache); return 0; } static __be32 encode_ascii_id(struct xdr_stream *xdr, u32 id) { char buf[11]; int len; __be32 *p; len = sprintf(buf, "%u", id); p = xdr_reserve_space(xdr, len + 4); if (!p) return nfserr_resource; p = xdr_encode_opaque(p, buf, len); return 0; } static __be32 idmap_id_to_name(struct xdr_stream *xdr, struct svc_rqst *rqstp, int type, u32 id) { struct ent *item, key = { .id = id, .type = type, }; __be32 status = nfs_ok; __be32 *p; int ret; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); strscpy(key.authname, rqst_authname(rqstp), sizeof(key.authname)); ret = idmap_lookup(rqstp, idtoname_lookup, &key, nn->idtoname_cache, &item); if (ret == -ENOENT) return encode_ascii_id(xdr, id); if (ret) return nfserrno(ret); ret = strlen(item->name); WARN_ON_ONCE(ret > IDMAP_NAMESZ); p = xdr_reserve_space(xdr, ret + 4); if (unlikely(!p)) { status = nfserr_resource; goto out_put; } xdr_encode_opaque(p, item->name, ret); out_put: cache_put(&item->h, nn->idtoname_cache); return status; } static bool numeric_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, u32 *id) { int ret; char buf[11]; if (namelen + 1 > sizeof(buf)) /* too long to represent a 32-bit id: */ return false; /* Just to make sure it's null-terminated: */ memcpy(buf, name, namelen); buf[namelen] = '\0'; ret = kstrtouint(buf, 10, id); return ret == 0; } static __be32 do_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, u32 *id) { if (nfs4_disable_idmapping && rqstp->rq_cred.cr_flavor < RPC_AUTH_GSS) if (numeric_name_to_id(rqstp, type, name, namelen, id)) return 0; /* * otherwise, fall through and try idmapping, for * backwards compatibility with clients sending names: */ return idmap_name_to_id(rqstp, type, name, namelen, id); } static __be32 encode_name_from_id(struct xdr_stream *xdr, struct svc_rqst *rqstp, int type, u32 id) { if (nfs4_disable_idmapping && rqstp->rq_cred.cr_flavor < RPC_AUTH_GSS) return encode_ascii_id(xdr, id); return idmap_id_to_name(xdr, rqstp, type, id); } __be32 nfsd_map_name_to_uid(struct svc_rqst *rqstp, const char *name, size_t namelen, kuid_t *uid) { __be32 status; u32 id = -1; if (name == NULL || namelen == 0) return nfserr_inval; status = do_name_to_id(rqstp, IDMAP_TYPE_USER, name, namelen, &id); *uid = make_kuid(nfsd_user_namespace(rqstp), id); if (!uid_valid(*uid)) status = nfserr_badowner; return status; } __be32 nfsd_map_name_to_gid(struct svc_rqst *rqstp, const char *name, size_t namelen, kgid_t *gid) { __be32 status; u32 id = -1; if (name == NULL || namelen == 0) return nfserr_inval; status = do_name_to_id(rqstp, IDMAP_TYPE_GROUP, name, namelen, &id); *gid = make_kgid(nfsd_user_namespace(rqstp), id); if (!gid_valid(*gid)) status = nfserr_badowner; return status; } __be32 nfsd4_encode_user(struct xdr_stream *xdr, struct svc_rqst *rqstp, kuid_t uid) { u32 id = from_kuid_munged(nfsd_user_namespace(rqstp), uid); return encode_name_from_id(xdr, rqstp, IDMAP_TYPE_USER, id); } __be32 nfsd4_encode_group(struct xdr_stream *xdr, struct svc_rqst *rqstp, kgid_t gid) { u32 id = from_kgid_munged(nfsd_user_namespace(rqstp), gid); return encode_name_from_id(xdr, rqstp, IDMAP_TYPE_GROUP, id); }
479 483 481 483 483 480 7 481 482 2 2 2 2 2 2 2 7 7 7 7 2 9 2 7 8 9 9 482 482 482 481 14 483 481 480 482 482 483 483 482 483 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2013 Politecnico di Torino, Italy * TORSEC group -- https://security.polito.it * * Author: Roberto Sassu <roberto.sassu@polito.it> * * File: ima_template_lib.c * Library of supported template fields. */ #include "ima_template_lib.h" #include <linux/xattr.h> #include <linux/evm.h> static bool ima_template_hash_algo_allowed(u8 algo) { if (algo == HASH_ALGO_SHA1 || algo == HASH_ALGO_MD5) return true; return false; } enum data_formats { DATA_FMT_DIGEST = 0, DATA_FMT_DIGEST_WITH_ALGO, DATA_FMT_DIGEST_WITH_TYPE_AND_ALGO, DATA_FMT_STRING, DATA_FMT_HEX, DATA_FMT_UINT }; enum digest_type { DIGEST_TYPE_IMA, DIGEST_TYPE_VERITY, DIGEST_TYPE__LAST }; #define DIGEST_TYPE_NAME_LEN_MAX 7 /* including NUL */ static const char * const digest_type_name[DIGEST_TYPE__LAST] = { [DIGEST_TYPE_IMA] = "ima", [DIGEST_TYPE_VERITY] = "verity" }; static int ima_write_template_field_data(const void *data, const u32 datalen, enum data_formats datafmt, struct ima_field_data *field_data) { u8 *buf, *buf_ptr; u32 buflen = datalen; if (datafmt == DATA_FMT_STRING) buflen = datalen + 1; buf = kzalloc(buflen, GFP_KERNEL); if (!buf) return -ENOMEM; memcpy(buf, data, datalen); /* * Replace all space characters with underscore for event names and * strings. This avoid that, during the parsing of a measurements list, * filenames with spaces or that end with the suffix ' (deleted)' are * split into multiple template fields (the space is the delimitator * character for measurements lists in ASCII format). */ if (datafmt == DATA_FMT_STRING) { for (buf_ptr = buf; buf_ptr - buf < datalen; buf_ptr++) if (*buf_ptr == ' ') *buf_ptr = '_'; } field_data->data = buf; field_data->len = buflen; return 0; } static void ima_show_template_data_ascii(struct seq_file *m, enum ima_show_type show, enum data_formats datafmt, struct ima_field_data *field_data) { u8 *buf_ptr = field_data->data; u32 buflen = field_data->len; switch (datafmt) { case DATA_FMT_DIGEST_WITH_TYPE_AND_ALGO: case DATA_FMT_DIGEST_WITH_ALGO: buf_ptr = strrchr(field_data->data, ':'); if (buf_ptr != field_data->data) seq_printf(m, "%s", field_data->data); /* skip ':' and '\0' */ buf_ptr += 2; buflen -= buf_ptr - field_data->data; fallthrough; case DATA_FMT_DIGEST: case DATA_FMT_HEX: if (!buflen) break; ima_print_digest(m, buf_ptr, buflen); break; case DATA_FMT_STRING: seq_printf(m, "%s", buf_ptr); break; case DATA_FMT_UINT: switch (field_data->len) { case sizeof(u8): seq_printf(m, "%u", *(u8 *)buf_ptr); break; case sizeof(u16): if (ima_canonical_fmt) seq_printf(m, "%u", le16_to_cpu(*(__le16 *)buf_ptr)); else seq_printf(m, "%u", *(u16 *)buf_ptr); break; case sizeof(u32): if (ima_canonical_fmt) seq_printf(m, "%u", le32_to_cpu(*(__le32 *)buf_ptr)); else seq_printf(m, "%u", *(u32 *)buf_ptr); break; case sizeof(u64): if (ima_canonical_fmt) seq_printf(m, "%llu", le64_to_cpu(*(__le64 *)buf_ptr)); else seq_printf(m, "%llu", *(u64 *)buf_ptr); break; default: break; } break; default: break; } } static void ima_show_template_data_binary(struct seq_file *m, enum ima_show_type show, enum data_formats datafmt, struct ima_field_data *field_data) { u32 len = (show == IMA_SHOW_BINARY_OLD_STRING_FMT) ? strlen(field_data->data) : field_data->len; if (show != IMA_SHOW_BINARY_NO_FIELD_LEN) { u32 field_len = !ima_canonical_fmt ? len : (__force u32)cpu_to_le32(len); ima_putc(m, &field_len, sizeof(field_len)); } if (!len) return; ima_putc(m, field_data->data, len); } static void ima_show_template_field_data(struct seq_file *m, enum ima_show_type show, enum data_formats datafmt, struct ima_field_data *field_data) { switch (show) { case IMA_SHOW_ASCII: ima_show_template_data_ascii(m, show, datafmt, field_data); break; case IMA_SHOW_BINARY: case IMA_SHOW_BINARY_NO_FIELD_LEN: case IMA_SHOW_BINARY_OLD_STRING_FMT: ima_show_template_data_binary(m, show, datafmt, field_data); break; default: break; } } void ima_show_template_digest(struct seq_file *m, enum ima_show_type show, struct ima_field_data *field_data) { ima_show_template_field_data(m, show, DATA_FMT_DIGEST, field_data); } void ima_show_template_digest_ng(struct seq_file *m, enum ima_show_type show, struct ima_field_data *field_data) { ima_show_template_field_data(m, show, DATA_FMT_DIGEST_WITH_ALGO, field_data); } void ima_show_template_digest_ngv2(struct seq_file *m, enum ima_show_type show, struct ima_field_data *field_data) { ima_show_template_field_data(m, show, DATA_FMT_DIGEST_WITH_TYPE_AND_ALGO, field_data); } void ima_show_template_string(struct seq_file *m, enum ima_show_type show, struct ima_field_data *field_data) { ima_show_template_field_data(m, show, DATA_FMT_STRING, field_data); } void ima_show_template_sig(struct seq_file *m, enum ima_show_type show, struct ima_field_data *field_data) { ima_show_template_field_data(m, show, DATA_FMT_HEX, field_data); } void ima_show_template_buf(struct seq_file *m, enum ima_show_type show, struct ima_field_data *field_data) { ima_show_template_field_data(m, show, DATA_FMT_HEX, field_data); } void ima_show_template_uint(struct seq_file *m, enum ima_show_type show, struct ima_field_data *field_data) { ima_show_template_field_data(m, show, DATA_FMT_UINT, field_data); } /** * ima_parse_buf() - Parses lengths and data from an input buffer * @bufstartp: Buffer start address. * @bufendp: Buffer end address. * @bufcurp: Pointer to remaining (non-parsed) data. * @maxfields: Length of fields array. * @fields: Array containing lengths and pointers of parsed data. * @curfields: Number of array items containing parsed data. * @len_mask: Bitmap (if bit is set, data length should not be parsed). * @enforce_mask: Check if curfields == maxfields and/or bufcurp == bufendp. * @bufname: String identifier of the input buffer. * * Return: 0 on success, -EINVAL on error. */ int ima_parse_buf(void *bufstartp, void *bufendp, void **bufcurp, int maxfields, struct ima_field_data *fields, int *curfields, unsigned long *len_mask, int enforce_mask, char *bufname) { void *bufp = bufstartp; int i; for (i = 0; i < maxfields; i++) { if (len_mask == NULL || !test_bit(i, len_mask)) { if (bufp > (bufendp - sizeof(u32))) break; if (ima_canonical_fmt) fields[i].len = le32_to_cpu(*(__le32 *)bufp); else fields[i].len = *(u32 *)bufp; bufp += sizeof(u32); } if (bufp > (bufendp - fields[i].len)) break; fields[i].data = bufp; bufp += fields[i].len; } if ((enforce_mask & ENFORCE_FIELDS) && i != maxfields) { pr_err("%s: nr of fields mismatch: expected: %d, current: %d\n", bufname, maxfields, i); return -EINVAL; } if ((enforce_mask & ENFORCE_BUFEND) && bufp != bufendp) { pr_err("%s: buf end mismatch: expected: %p, current: %p\n", bufname, bufendp, bufp); return -EINVAL; } if (curfields) *curfields = i; if (bufcurp) *bufcurp = bufp; return 0; } static int ima_eventdigest_init_common(const u8 *digest, u32 digestsize, u8 digest_type, u8 hash_algo, struct ima_field_data *field_data) { /* * digest formats: * - DATA_FMT_DIGEST: digest * - DATA_FMT_DIGEST_WITH_ALGO: <hash algo> + ':' + '\0' + digest, * - DATA_FMT_DIGEST_WITH_TYPE_AND_ALGO: * <digest type> + ':' + <hash algo> + ':' + '\0' + digest, * * where 'DATA_FMT_DIGEST' is the original digest format ('d') * with a hash size limitation of 20 bytes, * where <digest type> is either "ima" or "verity", * where <hash algo> is the hash_algo_name[] string. */ u8 buffer[DIGEST_TYPE_NAME_LEN_MAX + CRYPTO_MAX_ALG_NAME + 2 + IMA_MAX_DIGEST_SIZE] = { 0 }; enum data_formats fmt = DATA_FMT_DIGEST; u32 offset = 0; if (digest_type < DIGEST_TYPE__LAST && hash_algo < HASH_ALGO__LAST) { fmt = DATA_FMT_DIGEST_WITH_TYPE_AND_ALGO; offset += 1 + sprintf(buffer, "%s:%s:", digest_type_name[digest_type], hash_algo_name[hash_algo]); } else if (hash_algo < HASH_ALGO__LAST) { fmt = DATA_FMT_DIGEST_WITH_ALGO; offset += 1 + sprintf(buffer, "%s:", hash_algo_name[hash_algo]); } if (digest) { memcpy(buffer + offset, digest, digestsize); } else { /* * If digest is NULL, the event being recorded is a violation. * Make room for the digest by increasing the offset by the * hash algorithm digest size. If the hash algorithm is not * specified increase the offset by IMA_DIGEST_SIZE which * fits SHA1 or MD5 */ if (hash_algo < HASH_ALGO__LAST) offset += hash_digest_size[hash_algo]; else offset += IMA_DIGEST_SIZE; } return ima_write_template_field_data(buffer, offset + digestsize, fmt, field_data); } /* * This function writes the digest of an event (with size limit). */ int ima_eventdigest_init(struct ima_event_data *event_data, struct ima_field_data *field_data) { struct ima_max_digest_data hash; struct ima_digest_data *hash_hdr = container_of(&hash.hdr, struct ima_digest_data, hdr); u8 *cur_digest = NULL; u32 cur_digestsize = 0; struct inode *inode; int result; memset(&hash, 0, sizeof(hash)); if (event_data->violation) /* recording a violation. */ goto out; if (ima_template_hash_algo_allowed(event_data->iint->ima_hash->algo)) { cur_digest = event_data->iint->ima_hash->digest; cur_digestsize = event_data->iint->ima_hash->length; goto out; } if ((const char *)event_data->filename == boot_aggregate_name) { if (ima_tpm_chip) { hash.hdr.algo = HASH_ALGO_SHA1; result = ima_calc_boot_aggregate(hash_hdr); /* algo can change depending on available PCR banks */ if (!result && hash.hdr.algo != HASH_ALGO_SHA1) result = -EINVAL; if (result < 0) memset(&hash, 0, sizeof(hash)); } cur_digest = hash_hdr->digest; cur_digestsize = hash_digest_size[HASH_ALGO_SHA1]; goto out; } if (!event_data->file) /* missing info to re-calculate the digest */ return -EINVAL; inode = file_inode(event_data->file); hash.hdr.algo = ima_template_hash_algo_allowed(ima_hash_algo) ? ima_hash_algo : HASH_ALGO_SHA1; result = ima_calc_file_hash(event_data->file, hash_hdr); if (result) { integrity_audit_msg(AUDIT_INTEGRITY_DATA, inode, event_data->filename, "collect_data", "failed", result, 0); return result; } cur_digest = hash_hdr->digest; cur_digestsize = hash.hdr.length; out: return ima_eventdigest_init_common(cur_digest, cur_digestsize, DIGEST_TYPE__LAST, HASH_ALGO__LAST, field_data); } /* * This function writes the digest of an event (without size limit). */ int ima_eventdigest_ng_init(struct ima_event_data *event_data, struct ima_field_data *field_data) { u8 *cur_digest = NULL, hash_algo = ima_hash_algo; u32 cur_digestsize = 0; if (event_data->violation) /* recording a violation. */ goto out; cur_digest = event_data->iint->ima_hash->digest; cur_digestsize = event_data->iint->ima_hash->length; hash_algo = event_data->iint->ima_hash->algo; out: return ima_eventdigest_init_common(cur_digest, cur_digestsize, DIGEST_TYPE__LAST, hash_algo, field_data); } /* * This function writes the digest of an event (without size limit), * prefixed with both the digest type and hash algorithm. */ int ima_eventdigest_ngv2_init(struct ima_event_data *event_data, struct ima_field_data *field_data) { u8 *cur_digest = NULL, hash_algo = ima_hash_algo; u32 cur_digestsize = 0; u8 digest_type = DIGEST_TYPE_IMA; if (event_data->violation) /* recording a violation. */ goto out; cur_digest = event_data->iint->ima_hash->digest; cur_digestsize = event_data->iint->ima_hash->length; hash_algo = event_data->iint->ima_hash->algo; if (event_data->iint->flags & IMA_VERITY_REQUIRED) digest_type = DIGEST_TYPE_VERITY; out: return ima_eventdigest_init_common(cur_digest, cur_digestsize, digest_type, hash_algo, field_data); } /* * This function writes the digest of the file which is expected to match the * digest contained in the file's appended signature. */ int ima_eventdigest_modsig_init(struct ima_event_data *event_data, struct ima_field_data *field_data) { enum hash_algo hash_algo; const u8 *cur_digest; u32 cur_digestsize; if (!event_data->modsig) return 0; if (event_data->violation) { /* Recording a violation. */ hash_algo = HASH_ALGO_SHA1; cur_digest = NULL; cur_digestsize = 0; } else { int rc; rc = ima_get_modsig_digest(event_data->modsig, &hash_algo, &cur_digest, &cur_digestsize); if (rc) return rc; else if (hash_algo == HASH_ALGO__LAST || cur_digestsize == 0) /* There was some error collecting the digest. */ return -EINVAL; } return ima_eventdigest_init_common(cur_digest, cur_digestsize, DIGEST_TYPE__LAST, hash_algo, field_data); } static int ima_eventname_init_common(struct ima_event_data *event_data, struct ima_field_data *field_data, bool size_limit) { const char *cur_filename = NULL; struct name_snapshot filename; u32 cur_filename_len = 0; bool snapshot = false; int ret; BUG_ON(event_data->filename == NULL && event_data->file == NULL); if (event_data->filename) { cur_filename = event_data->filename; cur_filename_len = strlen(event_data->filename); if (!size_limit || cur_filename_len <= IMA_EVENT_NAME_LEN_MAX) goto out; } if (event_data->file) { take_dentry_name_snapshot(&filename, event_data->file->f_path.dentry); snapshot = true; cur_filename = filename.name.name; cur_filename_len = strlen(cur_filename); } else /* * Truncate filename if the latter is too long and * the file descriptor is not available. */ cur_filename_len = IMA_EVENT_NAME_LEN_MAX; out: ret = ima_write_template_field_data(cur_filename, cur_filename_len, DATA_FMT_STRING, field_data); if (snapshot) release_dentry_name_snapshot(&filename); return ret; } /* * This function writes the name of an event (with size limit). */ int ima_eventname_init(struct ima_event_data *event_data, struct ima_field_data *field_data) { return ima_eventname_init_common(event_data, field_data, true); } /* * This function writes the name of an event (without size limit). */ int ima_eventname_ng_init(struct ima_event_data *event_data, struct ima_field_data *field_data) { return ima_eventname_init_common(event_data, field_data, false); } /* * ima_eventsig_init - include the file signature as part of the template data */ int ima_eventsig_init(struct ima_event_data *event_data, struct ima_field_data *field_data) { struct evm_ima_xattr_data *xattr_value = event_data->xattr_value; if (!xattr_value || (xattr_value->type != EVM_IMA_XATTR_DIGSIG && xattr_value->type != IMA_VERITY_DIGSIG)) return ima_eventevmsig_init(event_data, field_data); return ima_write_template_field_data(xattr_value, event_data->xattr_len, DATA_FMT_HEX, field_data); } /* * ima_eventbuf_init - include the buffer(kexec-cmldine) as part of the * template data. */ int ima_eventbuf_init(struct ima_event_data *event_data, struct ima_field_data *field_data) { if ((!event_data->buf) || (event_data->buf_len == 0)) return 0; return ima_write_template_field_data(event_data->buf, event_data->buf_len, DATA_FMT_HEX, field_data); } /* * ima_eventmodsig_init - include the appended file signature as part of the * template data */ int ima_eventmodsig_init(struct ima_event_data *event_data, struct ima_field_data *field_data) { const void *data; u32 data_len; int rc; if (!event_data->modsig) return 0; /* * modsig is a runtime structure containing pointers. Get its raw data * instead. */ rc = ima_get_raw_modsig(event_data->modsig, &data, &data_len); if (rc) return rc; return ima_write_template_field_data(data, data_len, DATA_FMT_HEX, field_data); } /* * ima_eventevmsig_init - include the EVM portable signature as part of the * template data */ int ima_eventevmsig_init(struct ima_event_data *event_data, struct ima_field_data *field_data) { struct evm_ima_xattr_data *xattr_data = NULL; int rc = 0; if (!event_data->file) return 0; rc = vfs_getxattr_alloc(&nop_mnt_idmap, file_dentry(event_data->file), XATTR_NAME_EVM, (char **)&xattr_data, 0, GFP_NOFS); if (rc <= 0 || xattr_data->type != EVM_XATTR_PORTABLE_DIGSIG) { rc = 0; goto out; } rc = ima_write_template_field_data((char *)xattr_data, rc, DATA_FMT_HEX, field_data); out: kfree(xattr_data); return rc; } static int ima_eventinodedac_init_common(struct ima_event_data *event_data, struct ima_field_data *field_data, bool get_uid) { unsigned int id; if (!event_data->file) return 0; if (get_uid) id = i_uid_read(file_inode(event_data->file)); else id = i_gid_read(file_inode(event_data->file)); if (ima_canonical_fmt) { if (sizeof(id) == sizeof(u16)) id = (__force u16)cpu_to_le16(id); else id = (__force u32)cpu_to_le32(id); } return ima_write_template_field_data((void *)&id, sizeof(id), DATA_FMT_UINT, field_data); } /* * ima_eventinodeuid_init - include the inode UID as part of the template * data */ int ima_eventinodeuid_init(struct ima_event_data *event_data, struct ima_field_data *field_data) { return ima_eventinodedac_init_common(event_data, field_data, true); } /* * ima_eventinodegid_init - include the inode GID as part of the template * data */ int ima_eventinodegid_init(struct ima_event_data *event_data, struct ima_field_data *field_data) { return ima_eventinodedac_init_common(event_data, field_data, false); } /* * ima_eventinodemode_init - include the inode mode as part of the template * data */ int ima_eventinodemode_init(struct ima_event_data *event_data, struct ima_field_data *field_data) { struct inode *inode; u16 mode; if (!event_data->file) return 0; inode = file_inode(event_data->file); mode = inode->i_mode; if (ima_canonical_fmt) mode = (__force u16)cpu_to_le16(mode); return ima_write_template_field_data((char *)&mode, sizeof(mode), DATA_FMT_UINT, field_data); } static int ima_eventinodexattrs_init_common(struct ima_event_data *event_data, struct ima_field_data *field_data, char type) { u8 *buffer = NULL; int rc; if (!event_data->file) return 0; rc = evm_read_protected_xattrs(file_dentry(event_data->file), NULL, 0, type, ima_canonical_fmt); if (rc < 0) return 0; buffer = kmalloc(rc, GFP_KERNEL); if (!buffer) return 0; rc = evm_read_protected_xattrs(file_dentry(event_data->file), buffer, rc, type, ima_canonical_fmt); if (rc < 0) { rc = 0; goto out; } rc = ima_write_template_field_data((char *)buffer, rc, DATA_FMT_HEX, field_data); out: kfree(buffer); return rc; } /* * ima_eventinodexattrnames_init - include a list of xattr names as part of the * template data */ int ima_eventinodexattrnames_init(struct ima_event_data *event_data, struct ima_field_data *field_data) { return ima_eventinodexattrs_init_common(event_data, field_data, 'n'); } /* * ima_eventinodexattrlengths_init - include a list of xattr lengths as part of * the template data */ int ima_eventinodexattrlengths_init(struct ima_event_data *event_data, struct ima_field_data *field_data) { return ima_eventinodexattrs_init_common(event_data, field_data, 'l'); } /* * ima_eventinodexattrvalues_init - include a list of xattr values as part of * the template data */ int ima_eventinodexattrvalues_init(struct ima_event_data *event_data, struct ima_field_data *field_data) { return ima_eventinodexattrs_init_common(event_data, field_data, 'v'); }
107 26 114 129 2287 2843 234 234 234 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 /* SPDX-License-Identifier: GPL-2.0 */ /* * include/linux/writeback.h */ #ifndef WRITEBACK_H #define WRITEBACK_H #include <linux/sched.h> #include <linux/workqueue.h> #include <linux/fs.h> #include <linux/flex_proportions.h> #include <linux/backing-dev-defs.h> #include <linux/blk_types.h> #include <linux/pagevec.h> struct bio; DECLARE_PER_CPU(int, dirty_throttle_leaks); /* * The global dirty threshold is normally equal to the global dirty limit, * except when the system suddenly allocates a lot of anonymous memory and * knocks down the global dirty threshold quickly, in which case the global * dirty limit will follow down slowly to prevent livelocking all dirtier tasks. */ #define DIRTY_SCOPE 8 struct backing_dev_info; /* * fs/fs-writeback.c */ enum writeback_sync_modes { WB_SYNC_NONE, /* Don't wait on anything */ WB_SYNC_ALL, /* Wait on every mapping */ }; /* * A control structure which tells the writeback code what to do. These are * always on the stack, and hence need no locking. They are always initialised * in a manner such that unspecified fields are set to zero. */ struct writeback_control { /* public fields that can be set and/or consumed by the caller: */ long nr_to_write; /* Write this many pages, and decrement this for each page written */ long pages_skipped; /* Pages which were not written */ /* * For a_ops->writepages(): if start or end are non-zero then this is * a hint that the filesystem need only write out the pages inside that * byterange. The byte at `end' is included in the writeout request. */ loff_t range_start; loff_t range_end; enum writeback_sync_modes sync_mode; unsigned for_kupdate:1; /* A kupdate writeback */ unsigned for_background:1; /* A background writeback */ unsigned tagged_writepages:1; /* tag-and-write to avoid livelock */ unsigned range_cyclic:1; /* range_start is cyclic */ unsigned for_sync:1; /* sync(2) WB_SYNC_ALL writeback */ unsigned unpinned_netfs_wb:1; /* Cleared I_PINNING_NETFS_WB */ /* * When writeback IOs are bounced through async layers, only the * initial synchronous phase should be accounted towards inode * cgroup ownership arbitration to avoid confusion. Later stages * can set the following flag to disable the accounting. */ unsigned no_cgroup_owner:1; /* internal fields used by the ->writepages implementation: */ struct folio_batch fbatch; pgoff_t index; int saved_err; #ifdef CONFIG_CGROUP_WRITEBACK struct bdi_writeback *wb; /* wb this writeback is issued under */ struct inode *inode; /* inode being written out */ /* foreign inode detection, see wbc_detach_inode() */ int wb_id; /* current wb id */ int wb_lcand_id; /* last foreign candidate wb id */ int wb_tcand_id; /* this foreign candidate wb id */ size_t wb_bytes; /* bytes written by current wb */ size_t wb_lcand_bytes; /* bytes written by last candidate */ size_t wb_tcand_bytes; /* bytes written by this candidate */ #endif }; static inline blk_opf_t wbc_to_write_flags(struct writeback_control *wbc) { blk_opf_t flags = 0; if (wbc->sync_mode == WB_SYNC_ALL) flags |= REQ_SYNC; else if (wbc->for_kupdate || wbc->for_background) flags |= REQ_BACKGROUND; return flags; } #ifdef CONFIG_CGROUP_WRITEBACK #define wbc_blkcg_css(wbc) \ ((wbc)->wb ? (wbc)->wb->blkcg_css : blkcg_root_css) #else #define wbc_blkcg_css(wbc) (blkcg_root_css) #endif /* CONFIG_CGROUP_WRITEBACK */ /* * A wb_domain represents a domain that wb's (bdi_writeback's) belong to * and are measured against each other in. There always is one global * domain, global_wb_domain, that every wb in the system is a member of. * This allows measuring the relative bandwidth of each wb to distribute * dirtyable memory accordingly. */ struct wb_domain { spinlock_t lock; /* * Scale the writeback cache size proportional to the relative * writeout speed. * * We do this by keeping a floating proportion between BDIs, based * on page writeback completions [end_page_writeback()]. Those * devices that write out pages fastest will get the larger share, * while the slower will get a smaller share. * * We use page writeout completions because we are interested in * getting rid of dirty pages. Having them written out is the * primary goal. * * We introduce a concept of time, a period over which we measure * these events, because demand can/will vary over time. The length * of this period itself is measured in page writeback completions. */ struct fprop_global completions; struct timer_list period_timer; /* timer for aging of completions */ unsigned long period_time; /* * The dirtyable memory and dirty threshold could be suddenly * knocked down by a large amount (eg. on the startup of KVM in a * swapless system). This may throw the system into deep dirty * exceeded state and throttle heavy/light dirtiers alike. To * retain good responsiveness, maintain global_dirty_limit for * tracking slowly down to the knocked down dirty threshold. * * Both fields are protected by ->lock. */ unsigned long dirty_limit_tstamp; unsigned long dirty_limit; }; /** * wb_domain_size_changed - memory available to a wb_domain has changed * @dom: wb_domain of interest * * This function should be called when the amount of memory available to * @dom has changed. It resets @dom's dirty limit parameters to prevent * the past values which don't match the current configuration from skewing * dirty throttling. Without this, when memory size of a wb_domain is * greatly reduced, the dirty throttling logic may allow too many pages to * be dirtied leading to consecutive unnecessary OOMs and may get stuck in * that situation. */ static inline void wb_domain_size_changed(struct wb_domain *dom) { spin_lock(&dom->lock); dom->dirty_limit_tstamp = jiffies; dom->dirty_limit = 0; spin_unlock(&dom->lock); } /* * fs/fs-writeback.c */ struct bdi_writeback; void writeback_inodes_sb(struct super_block *, enum wb_reason reason); void writeback_inodes_sb_nr(struct super_block *, unsigned long nr, enum wb_reason reason); void try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason); void sync_inodes_sb(struct super_block *); void wakeup_flusher_threads(enum wb_reason reason); void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi, enum wb_reason reason); void inode_wait_for_writeback(struct inode *inode); void inode_io_list_del(struct inode *inode); static inline xa_mark_t wbc_to_tag(struct writeback_control *wbc) { if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) return PAGECACHE_TAG_TOWRITE; return PAGECACHE_TAG_DIRTY; } #ifdef CONFIG_CGROUP_WRITEBACK #include <linux/cgroup.h> #include <linux/bio.h> void __inode_attach_wb(struct inode *inode, struct folio *folio); void wbc_detach_inode(struct writeback_control *wbc); void wbc_account_cgroup_owner(struct writeback_control *wbc, struct folio *folio, size_t bytes); int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, enum wb_reason reason, struct wb_completion *done); void cgroup_writeback_umount(struct super_block *sb); bool cleanup_offline_cgwb(struct bdi_writeback *wb); /** * inode_attach_wb - associate an inode with its wb * @inode: inode of interest * @folio: folio being dirtied (may be NULL) * * If @inode doesn't have its wb, associate it with the wb matching the * memcg of @folio or, if @folio is NULL, %current. May be called w/ or w/o * @inode->i_lock. */ static inline void inode_attach_wb(struct inode *inode, struct folio *folio) { if (!inode->i_wb) __inode_attach_wb(inode, folio); } /** * inode_detach_wb - disassociate an inode from its wb * @inode: inode of interest * * @inode is being freed. Detach from its wb. */ static inline void inode_detach_wb(struct inode *inode) { if (inode->i_wb) { WARN_ON_ONCE(!(inode_state_read_once(inode) & I_CLEAR)); wb_put(inode->i_wb); inode->i_wb = NULL; } } void wbc_attach_fdatawrite_inode(struct writeback_control *wbc, struct inode *inode); /** * wbc_init_bio - writeback specific initializtion of bio * @wbc: writeback_control for the writeback in progress * @bio: bio to be initialized * * @bio is a part of the writeback in progress controlled by @wbc. Perform * writeback specific initialization. This is used to apply the cgroup * writeback context. Must be called after the bio has been associated with * a device. */ static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio) { /* * pageout() path doesn't attach @wbc to the inode being written * out. This is intentional as we don't want the function to block * behind a slow cgroup. Ultimately, we want pageout() to kick off * regular writeback instead of writing things out itself. */ if (wbc->wb) bio_associate_blkg_from_css(bio, wbc->wb->blkcg_css); } void inode_switch_wbs_work_fn(struct work_struct *work); #else /* CONFIG_CGROUP_WRITEBACK */ static inline void inode_attach_wb(struct inode *inode, struct folio *folio) { } static inline void inode_detach_wb(struct inode *inode) { } static inline void wbc_attach_fdatawrite_inode(struct writeback_control *wbc, struct inode *inode) { } static inline void wbc_detach_inode(struct writeback_control *wbc) { } static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio) { } static inline void wbc_account_cgroup_owner(struct writeback_control *wbc, struct folio *folio, size_t bytes) { } static inline void cgroup_writeback_umount(struct super_block *sb) { } #endif /* CONFIG_CGROUP_WRITEBACK */ /* * mm/page-writeback.c */ /* consolidated parameters for balance_dirty_pages() and its subroutines */ struct dirty_throttle_control { #ifdef CONFIG_CGROUP_WRITEBACK struct wb_domain *dom; struct dirty_throttle_control *gdtc; /* only set in memcg dtc's */ #endif struct bdi_writeback *wb; struct fprop_local_percpu *wb_completions; unsigned long avail; /* dirtyable */ unsigned long dirty; /* file_dirty + write + nfs */ unsigned long thresh; /* dirty threshold */ unsigned long bg_thresh; /* dirty background threshold */ unsigned long limit; /* hard dirty limit */ unsigned long wb_dirty; /* per-wb counterparts */ unsigned long wb_thresh; unsigned long wb_bg_thresh; unsigned long pos_ratio; bool freerun; bool dirty_exceeded; }; void laptop_io_completion(struct backing_dev_info *info); void laptop_sync_completion(void); void laptop_mode_timer_fn(struct timer_list *t); bool node_dirty_ok(struct pglist_data *pgdat); int wb_domain_init(struct wb_domain *dom, gfp_t gfp); #ifdef CONFIG_CGROUP_WRITEBACK void wb_domain_exit(struct wb_domain *dom); #endif extern struct wb_domain global_wb_domain; /* These are exported to sysctl. */ extern unsigned int dirty_writeback_interval; extern unsigned int dirty_expire_interval; extern int laptop_mode; void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty); unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh); unsigned long cgwb_calc_thresh(struct bdi_writeback *wb); void wb_update_bandwidth(struct bdi_writeback *wb); /* Invoke balance dirty pages in async mode. */ #define BDP_ASYNC 0x0001 void balance_dirty_pages_ratelimited(struct address_space *mapping); int balance_dirty_pages_ratelimited_flags(struct address_space *mapping, unsigned int flags); bool wb_over_bg_thresh(struct bdi_writeback *wb); struct folio *writeback_iter(struct address_space *mapping, struct writeback_control *wbc, struct folio *folio, int *error); int do_writepages(struct address_space *mapping, struct writeback_control *wbc); void writeback_set_ratelimit(void); void tag_pages_for_writeback(struct address_space *mapping, pgoff_t start, pgoff_t end); bool filemap_dirty_folio(struct address_space *mapping, struct folio *folio); bool folio_redirty_for_writepage(struct writeback_control *, struct folio *); bool redirty_page_for_writepage(struct writeback_control *, struct page *); void sb_mark_inode_writeback(struct inode *inode); void sb_clear_inode_writeback(struct inode *inode); /* * 4MB minimal write chunk size */ #define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_SHIFT - 10)) #endif /* WRITEBACK_H */
31 56 90 64 64 16 64 54 42 64 64 63 40 63 6 63 17 63 2 64 16 64 64 14 8 3 14 14 12 12 12 14 4 10 10 15 26 26 26 11 13 26 26 6 22 2 20 20 5 6 14 20 15 20 10 15 15 4 15 10 10 15 17 2 2 84 91 90 87 56 56 88 88 63 64 84 26 26 56 56 55 250 251 251 113 114 108 100 50 230 3 70 69 70 431 242 432 404 36 430 119 430 46 428 112 9 113 113 113 112 112 111 3 111 57 57 45 7 10 108 103 105 108 82 107 108 104 102 23 103 103 57 46 10 45 72 67 78 112 11 9 9 102 110 66 66 66 58 11 11 47 97 9 28 27 1 26 25 7 6 24 23 22 21 23 28 153 153 2 153 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 1991, 1992 Linus Torvalds * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE * Copyright (C) 2016 - 2020 Christoph Hellwig */ #include <linux/init.h> #include <linux/mm.h> #include <linux/blkdev.h> #include <linux/blk-integrity.h> #include <linux/buffer_head.h> #include <linux/mpage.h> #include <linux/uio.h> #include <linux/namei.h> #include <linux/task_io_accounting_ops.h> #include <linux/falloc.h> #include <linux/suspend.h> #include <linux/fs.h> #include <linux/iomap.h> #include <linux/module.h> #include <linux/io_uring/cmd.h> #include "blk.h" static inline struct inode *bdev_file_inode(struct file *file) { return file->f_mapping->host; } static blk_opf_t dio_bio_write_op(struct kiocb *iocb) { blk_opf_t opf = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE; /* avoid the need for a I/O completion work item */ if (iocb_is_dsync(iocb)) opf |= REQ_FUA; return opf; } static bool blkdev_dio_invalid(struct block_device *bdev, struct kiocb *iocb, struct iov_iter *iter) { return (iocb->ki_pos | iov_iter_count(iter)) & (bdev_logical_block_size(bdev) - 1); } static inline int blkdev_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter, struct block_device *bdev) { return bio_iov_iter_get_pages(bio, iter, bdev_logical_block_size(bdev) - 1); } #define DIO_INLINE_BIO_VECS 4 static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, struct block_device *bdev, unsigned int nr_pages) { struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs; loff_t pos = iocb->ki_pos; bool should_dirty = false; struct bio bio; ssize_t ret; if (nr_pages <= DIO_INLINE_BIO_VECS) vecs = inline_vecs; else { vecs = kmalloc_array(nr_pages, sizeof(struct bio_vec), GFP_KERNEL); if (!vecs) return -ENOMEM; } if (iov_iter_rw(iter) == READ) { bio_init(&bio, bdev, vecs, nr_pages, REQ_OP_READ); if (user_backed_iter(iter)) should_dirty = true; } else { bio_init(&bio, bdev, vecs, nr_pages, dio_bio_write_op(iocb)); } bio.bi_iter.bi_sector = pos >> SECTOR_SHIFT; bio.bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint; bio.bi_write_stream = iocb->ki_write_stream; bio.bi_ioprio = iocb->ki_ioprio; if (iocb->ki_flags & IOCB_ATOMIC) bio.bi_opf |= REQ_ATOMIC; ret = blkdev_iov_iter_get_pages(&bio, iter, bdev); if (unlikely(ret)) goto out; ret = bio.bi_iter.bi_size; if (iov_iter_rw(iter) == WRITE) task_io_account_write(ret); if (iocb->ki_flags & IOCB_NOWAIT) bio.bi_opf |= REQ_NOWAIT; submit_bio_wait(&bio); bio_release_pages(&bio, should_dirty); if (unlikely(bio.bi_status)) ret = blk_status_to_errno(bio.bi_status); out: if (vecs != inline_vecs) kfree(vecs); bio_uninit(&bio); return ret; } enum { DIO_SHOULD_DIRTY = 1, DIO_IS_SYNC = 2, }; struct blkdev_dio { union { struct kiocb *iocb; struct task_struct *waiter; }; size_t size; atomic_t ref; unsigned int flags; struct bio bio ____cacheline_aligned_in_smp; }; static struct bio_set blkdev_dio_pool; static void blkdev_bio_end_io(struct bio *bio) { struct blkdev_dio *dio = bio->bi_private; bool should_dirty = dio->flags & DIO_SHOULD_DIRTY; bool is_sync = dio->flags & DIO_IS_SYNC; if (bio->bi_status && !dio->bio.bi_status) dio->bio.bi_status = bio->bi_status; if (bio_integrity(bio)) bio_integrity_unmap_user(bio); if (atomic_dec_and_test(&dio->ref)) { if (!is_sync) { struct kiocb *iocb = dio->iocb; ssize_t ret; WRITE_ONCE(iocb->private, NULL); if (likely(!dio->bio.bi_status)) { ret = dio->size; iocb->ki_pos += ret; } else { ret = blk_status_to_errno(dio->bio.bi_status); } dio->iocb->ki_complete(iocb, ret); bio_put(&dio->bio); } else { struct task_struct *waiter = dio->waiter; WRITE_ONCE(dio->waiter, NULL); blk_wake_io_task(waiter); } } if (should_dirty) { bio_check_pages_dirty(bio); } else { bio_release_pages(bio, false); bio_put(bio); } } static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, struct block_device *bdev, unsigned int nr_pages) { struct blk_plug plug; struct blkdev_dio *dio; struct bio *bio; bool is_read = (iov_iter_rw(iter) == READ), is_sync; blk_opf_t opf = is_read ? REQ_OP_READ : dio_bio_write_op(iocb); loff_t pos = iocb->ki_pos; int ret = 0; bio = bio_alloc_bioset(bdev, nr_pages, opf, GFP_KERNEL, &blkdev_dio_pool); dio = container_of(bio, struct blkdev_dio, bio); atomic_set(&dio->ref, 1); /* * Grab an extra reference to ensure the dio structure which is embedded * into the first bio stays around. */ bio_get(bio); is_sync = is_sync_kiocb(iocb); if (is_sync) { dio->flags = DIO_IS_SYNC; dio->waiter = current; } else { dio->flags = 0; dio->iocb = iocb; } dio->size = 0; if (is_read && user_backed_iter(iter)) dio->flags |= DIO_SHOULD_DIRTY; blk_start_plug(&plug); for (;;) { bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT; bio->bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint; bio->bi_write_stream = iocb->ki_write_stream; bio->bi_private = dio; bio->bi_end_io = blkdev_bio_end_io; bio->bi_ioprio = iocb->ki_ioprio; ret = blkdev_iov_iter_get_pages(bio, iter, bdev); if (unlikely(ret)) { bio->bi_status = BLK_STS_IOERR; bio_endio(bio); break; } if (iocb->ki_flags & IOCB_NOWAIT) { /* * This is nonblocking IO, and we need to allocate * another bio if we have data left to map. As we * cannot guarantee that one of the sub bios will not * fail getting issued FOR NOWAIT and as error results * are coalesced across all of them, be safe and ask for * a retry of this from blocking context. */ if (unlikely(iov_iter_count(iter))) { ret = -EAGAIN; goto fail; } bio->bi_opf |= REQ_NOWAIT; } if (iocb->ki_flags & IOCB_HAS_METADATA) { ret = bio_integrity_map_iter(bio, iocb->private); if (unlikely(ret)) goto fail; } if (is_read) { if (dio->flags & DIO_SHOULD_DIRTY) bio_set_pages_dirty(bio); } else { task_io_account_write(bio->bi_iter.bi_size); } dio->size += bio->bi_iter.bi_size; pos += bio->bi_iter.bi_size; nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS); if (!nr_pages) { submit_bio(bio); break; } atomic_inc(&dio->ref); submit_bio(bio); bio = bio_alloc(bdev, nr_pages, opf, GFP_KERNEL); } blk_finish_plug(&plug); if (!is_sync) return -EIOCBQUEUED; for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); if (!READ_ONCE(dio->waiter)) break; blk_io_schedule(); } __set_current_state(TASK_RUNNING); if (!ret) ret = blk_status_to_errno(dio->bio.bi_status); if (likely(!ret)) ret = dio->size; bio_put(&dio->bio); return ret; fail: bio_release_pages(bio, false); bio_clear_flag(bio, BIO_REFFED); bio_put(bio); blk_finish_plug(&plug); return ret; } static void blkdev_bio_end_io_async(struct bio *bio) { struct blkdev_dio *dio = container_of(bio, struct blkdev_dio, bio); struct kiocb *iocb = dio->iocb; ssize_t ret; WRITE_ONCE(iocb->private, NULL); if (likely(!bio->bi_status)) { ret = dio->size; iocb->ki_pos += ret; } else { ret = blk_status_to_errno(bio->bi_status); } if (bio_integrity(bio)) bio_integrity_unmap_user(bio); iocb->ki_complete(iocb, ret); if (dio->flags & DIO_SHOULD_DIRTY) { bio_check_pages_dirty(bio); } else { bio_release_pages(bio, false); bio_put(bio); } } static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb, struct iov_iter *iter, struct block_device *bdev, unsigned int nr_pages) { bool is_read = iov_iter_rw(iter) == READ; blk_opf_t opf = is_read ? REQ_OP_READ : dio_bio_write_op(iocb); struct blkdev_dio *dio; struct bio *bio; loff_t pos = iocb->ki_pos; int ret = 0; bio = bio_alloc_bioset(bdev, nr_pages, opf, GFP_KERNEL, &blkdev_dio_pool); dio = container_of(bio, struct blkdev_dio, bio); dio->flags = 0; dio->iocb = iocb; bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT; bio->bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint; bio->bi_write_stream = iocb->ki_write_stream; bio->bi_end_io = blkdev_bio_end_io_async; bio->bi_ioprio = iocb->ki_ioprio; if (iov_iter_is_bvec(iter)) { /* * Users don't rely on the iterator being in any particular * state for async I/O returning -EIOCBQUEUED, hence we can * avoid expensive iov_iter_advance(). Bypass * bio_iov_iter_get_pages() and set the bvec directly. */ bio_iov_bvec_set(bio, iter); } else { ret = blkdev_iov_iter_get_pages(bio, iter, bdev); if (unlikely(ret)) goto out_bio_put; } dio->size = bio->bi_iter.bi_size; if (is_read) { if (user_backed_iter(iter)) { dio->flags |= DIO_SHOULD_DIRTY; bio_set_pages_dirty(bio); } } else { task_io_account_write(bio->bi_iter.bi_size); } if (iocb->ki_flags & IOCB_HAS_METADATA) { ret = bio_integrity_map_iter(bio, iocb->private); WRITE_ONCE(iocb->private, NULL); if (unlikely(ret)) goto out_bio_put; } if (iocb->ki_flags & IOCB_ATOMIC) bio->bi_opf |= REQ_ATOMIC; if (iocb->ki_flags & IOCB_NOWAIT) bio->bi_opf |= REQ_NOWAIT; if (iocb->ki_flags & IOCB_HIPRI) { bio->bi_opf |= REQ_POLLED; submit_bio(bio); WRITE_ONCE(iocb->private, bio); } else { submit_bio(bio); } return -EIOCBQUEUED; out_bio_put: bio_put(bio); return ret; } static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host); unsigned int nr_pages; if (!iov_iter_count(iter)) return 0; if (blkdev_dio_invalid(bdev, iocb, iter)) return -EINVAL; if (iov_iter_rw(iter) == WRITE) { u16 max_write_streams = bdev_max_write_streams(bdev); if (iocb->ki_write_stream) { if (iocb->ki_write_stream > max_write_streams) return -EINVAL; } else if (max_write_streams) { enum rw_hint write_hint = file_inode(iocb->ki_filp)->i_write_hint; /* * Just use the write hint as write stream for block * device writes. This assumes no file system is * mounted that would use the streams differently. */ if (write_hint <= max_write_streams) iocb->ki_write_stream = write_hint; } } nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1); if (likely(nr_pages <= BIO_MAX_VECS && !(iocb->ki_flags & IOCB_HAS_METADATA))) { if (is_sync_kiocb(iocb)) return __blkdev_direct_IO_simple(iocb, iter, bdev, nr_pages); return __blkdev_direct_IO_async(iocb, iter, bdev, nr_pages); } else if (iocb->ki_flags & IOCB_ATOMIC) { return -EINVAL; } return __blkdev_direct_IO(iocb, iter, bdev, bio_max_segs(nr_pages)); } static int blkdev_iomap_begin(struct inode *inode, loff_t offset, loff_t length, unsigned int flags, struct iomap *iomap, struct iomap *srcmap) { struct block_device *bdev = I_BDEV(inode); loff_t isize = i_size_read(inode); if (offset >= isize) return -EIO; iomap->bdev = bdev; iomap->offset = ALIGN_DOWN(offset, bdev_logical_block_size(bdev)); iomap->type = IOMAP_MAPPED; iomap->addr = iomap->offset; iomap->length = isize - iomap->offset; iomap->flags |= IOMAP_F_BUFFER_HEAD; /* noop for !CONFIG_BUFFER_HEAD */ return 0; } static const struct iomap_ops blkdev_iomap_ops = { .iomap_begin = blkdev_iomap_begin, }; #ifdef CONFIG_BUFFER_HEAD static int blkdev_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh, int create) { bh->b_bdev = I_BDEV(inode); bh->b_blocknr = iblock; set_buffer_mapped(bh); return 0; } /* * We cannot call mpage_writepages() as it does not take the buffer lock. * We must use block_write_full_folio() directly which holds the buffer * lock. The buffer lock provides the synchronisation with writeback * that filesystems rely on when they use the blockdev's mapping. */ static int blkdev_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct folio *folio = NULL; struct blk_plug plug; int err; blk_start_plug(&plug); while ((folio = writeback_iter(mapping, wbc, folio, &err))) err = block_write_full_folio(folio, wbc, blkdev_get_block); blk_finish_plug(&plug); return err; } static int blkdev_read_folio(struct file *file, struct folio *folio) { return block_read_full_folio(folio, blkdev_get_block); } static void blkdev_readahead(struct readahead_control *rac) { mpage_readahead(rac, blkdev_get_block); } static int blkdev_write_begin(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { return block_write_begin(mapping, pos, len, foliop, blkdev_get_block); } static int blkdev_write_end(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) { int ret; ret = block_write_end(pos, len, copied, folio); folio_unlock(folio); folio_put(folio); return ret; } const struct address_space_operations def_blk_aops = { .dirty_folio = block_dirty_folio, .invalidate_folio = block_invalidate_folio, .read_folio = blkdev_read_folio, .readahead = blkdev_readahead, .writepages = blkdev_writepages, .write_begin = blkdev_write_begin, .write_end = blkdev_write_end, .migrate_folio = buffer_migrate_folio_norefs, .is_dirty_writeback = buffer_check_dirty_writeback, }; #else /* CONFIG_BUFFER_HEAD */ static int blkdev_read_folio(struct file *file, struct folio *folio) { iomap_bio_read_folio(folio, &blkdev_iomap_ops); return 0; } static void blkdev_readahead(struct readahead_control *rac) { iomap_bio_readahead(rac, &blkdev_iomap_ops); } static ssize_t blkdev_writeback_range(struct iomap_writepage_ctx *wpc, struct folio *folio, u64 offset, unsigned int len, u64 end_pos) { loff_t isize = i_size_read(wpc->inode); if (WARN_ON_ONCE(offset >= isize)) return -EIO; if (offset < wpc->iomap.offset || offset >= wpc->iomap.offset + wpc->iomap.length) { int error; error = blkdev_iomap_begin(wpc->inode, offset, isize - offset, IOMAP_WRITE, &wpc->iomap, NULL); if (error) return error; } return iomap_add_to_ioend(wpc, folio, offset, end_pos, len); } static const struct iomap_writeback_ops blkdev_writeback_ops = { .writeback_range = blkdev_writeback_range, .writeback_submit = iomap_ioend_writeback_submit, }; static int blkdev_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct iomap_writepage_ctx wpc = { .inode = mapping->host, .wbc = wbc, .ops = &blkdev_writeback_ops }; return iomap_writepages(&wpc); } const struct address_space_operations def_blk_aops = { .dirty_folio = filemap_dirty_folio, .release_folio = iomap_release_folio, .invalidate_folio = iomap_invalidate_folio, .read_folio = blkdev_read_folio, .readahead = blkdev_readahead, .writepages = blkdev_writepages, .is_partially_uptodate = iomap_is_partially_uptodate, .error_remove_folio = generic_error_remove_folio, .migrate_folio = filemap_migrate_folio, }; #endif /* CONFIG_BUFFER_HEAD */ /* * for a block special file file_inode(file)->i_size is zero * so we compute the size by hand (just as in block_read/write above) */ static loff_t blkdev_llseek(struct file *file, loff_t offset, int whence) { struct inode *bd_inode = bdev_file_inode(file); loff_t retval; inode_lock(bd_inode); retval = fixed_size_llseek(file, offset, whence, i_size_read(bd_inode)); inode_unlock(bd_inode); return retval; } static int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync) { struct block_device *bdev = I_BDEV(filp->f_mapping->host); int error; error = file_write_and_wait_range(filp, start, end); if (error) return error; /* * There is no need to serialise calls to blkdev_issue_flush with * i_mutex and doing so causes performance issues with concurrent * O_SYNC writers to a block device. */ error = blkdev_issue_flush(bdev); if (error == -EOPNOTSUPP) error = 0; return error; } /** * file_to_blk_mode - get block open flags from file flags * @file: file whose open flags should be converted * * Look at file open flags and generate corresponding block open flags from * them. The function works both for file just being open (e.g. during ->open * callback) and for file that is already open. This is actually non-trivial * (see comment in the function). */ blk_mode_t file_to_blk_mode(struct file *file) { blk_mode_t mode = 0; if (file->f_mode & FMODE_READ) mode |= BLK_OPEN_READ; if (file->f_mode & FMODE_WRITE) mode |= BLK_OPEN_WRITE; /* * do_dentry_open() clears O_EXCL from f_flags, use file->private_data * to determine whether the open was exclusive for already open files. */ if (file->private_data) mode |= BLK_OPEN_EXCL; else if (file->f_flags & O_EXCL) mode |= BLK_OPEN_EXCL; if (file->f_flags & O_NDELAY) mode |= BLK_OPEN_NDELAY; /* * If all bits in O_ACCMODE set (aka O_RDWR | O_WRONLY), the floppy * driver has historically allowed ioctls as if the file was opened for * writing, but does not allow and actual reads or writes. */ if ((file->f_flags & O_ACCMODE) == (O_RDWR | O_WRONLY)) mode |= BLK_OPEN_WRITE_IOCTL; return mode; } static int blkdev_open(struct inode *inode, struct file *filp) { struct block_device *bdev; blk_mode_t mode; int ret; mode = file_to_blk_mode(filp); /* Use the file as the holder. */ if (mode & BLK_OPEN_EXCL) filp->private_data = filp; ret = bdev_permission(inode->i_rdev, mode, filp->private_data); if (ret) return ret; bdev = blkdev_get_no_open(inode->i_rdev, true); if (!bdev) return -ENXIO; if (bdev_can_atomic_write(bdev)) filp->f_mode |= FMODE_CAN_ATOMIC_WRITE; if (blk_get_integrity(bdev->bd_disk)) filp->f_mode |= FMODE_HAS_METADATA; ret = bdev_open(bdev, mode, filp->private_data, NULL, filp); if (ret) blkdev_put_no_open(bdev); return ret; } static int blkdev_release(struct inode *inode, struct file *filp) { bdev_release(filp); return 0; } static ssize_t blkdev_direct_write(struct kiocb *iocb, struct iov_iter *from) { size_t count = iov_iter_count(from); ssize_t written; written = kiocb_invalidate_pages(iocb, count); if (written) { if (written == -EBUSY) return 0; return written; } written = blkdev_direct_IO(iocb, from); if (written > 0) { kiocb_invalidate_post_direct_write(iocb, count); iocb->ki_pos += written; count -= written; } if (written != -EIOCBQUEUED) iov_iter_revert(from, count - iov_iter_count(from)); return written; } static ssize_t blkdev_buffered_write(struct kiocb *iocb, struct iov_iter *from) { return iomap_file_buffered_write(iocb, from, &blkdev_iomap_ops, NULL, NULL); } /* * Write data to the block device. Only intended for the block device itself * and the raw driver which basically is a fake block device. * * Does not take i_mutex for the write and thus is not for general purpose * use. */ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct inode *bd_inode = bdev_file_inode(file); struct block_device *bdev = I_BDEV(bd_inode); bool atomic = iocb->ki_flags & IOCB_ATOMIC; loff_t size = bdev_nr_bytes(bdev); size_t shorted = 0; ssize_t ret; if (bdev_read_only(bdev)) return -EPERM; if (IS_SWAPFILE(bd_inode) && !is_hibernate_resume_dev(bd_inode->i_rdev)) return -ETXTBSY; if (!iov_iter_count(from)) return 0; if (iocb->ki_pos >= size) return -ENOSPC; if ((iocb->ki_flags & (IOCB_NOWAIT | IOCB_DIRECT)) == IOCB_NOWAIT) return -EOPNOTSUPP; if (atomic) { ret = generic_atomic_write_valid(iocb, from); if (ret) return ret; } size -= iocb->ki_pos; if (iov_iter_count(from) > size) { if (atomic) return -EINVAL; shorted = iov_iter_count(from) - size; iov_iter_truncate(from, size); } ret = file_update_time(file); if (ret) return ret; if (iocb->ki_flags & IOCB_DIRECT) { ret = blkdev_direct_write(iocb, from); if (ret >= 0 && iov_iter_count(from)) ret = direct_write_fallback(iocb, from, ret, blkdev_buffered_write(iocb, from)); } else { /* * Take i_rwsem and invalidate_lock to avoid racing with * set_blocksize changing i_blkbits/folio order and punching * out the pagecache. */ inode_lock_shared(bd_inode); ret = blkdev_buffered_write(iocb, from); inode_unlock_shared(bd_inode); } if (ret > 0) ret = generic_write_sync(iocb, ret); iov_iter_reexpand(from, iov_iter_count(from) + shorted); return ret; } static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct inode *bd_inode = bdev_file_inode(iocb->ki_filp); struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host); loff_t size = bdev_nr_bytes(bdev); loff_t pos = iocb->ki_pos; size_t shorted = 0; ssize_t ret = 0; size_t count; if (unlikely(pos + iov_iter_count(to) > size)) { if (pos >= size) return 0; size -= pos; shorted = iov_iter_count(to) - size; iov_iter_truncate(to, size); } count = iov_iter_count(to); if (!count) goto reexpand; /* skip atime */ if (iocb->ki_flags & IOCB_DIRECT) { ret = kiocb_write_and_wait(iocb, count); if (ret < 0) goto reexpand; file_accessed(iocb->ki_filp); ret = blkdev_direct_IO(iocb, to); if (ret > 0) { iocb->ki_pos += ret; count -= ret; } if (ret != -EIOCBQUEUED) iov_iter_revert(to, count - iov_iter_count(to)); if (ret < 0 || !count) goto reexpand; } /* * Take i_rwsem and invalidate_lock to avoid racing with set_blocksize * changing i_blkbits/folio order and punching out the pagecache. */ inode_lock_shared(bd_inode); ret = filemap_read(iocb, to, ret); inode_unlock_shared(bd_inode); reexpand: if (unlikely(shorted)) iov_iter_reexpand(to, iov_iter_count(to) + shorted); return ret; } #define BLKDEV_FALLOC_FL_SUPPORTED \ (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \ FALLOC_FL_ZERO_RANGE | FALLOC_FL_WRITE_ZEROES) static long blkdev_fallocate(struct file *file, int mode, loff_t start, loff_t len) { struct inode *inode = bdev_file_inode(file); struct block_device *bdev = I_BDEV(inode); loff_t end = start + len - 1; loff_t isize; unsigned int flags; int error; /* Fail if we don't recognize the flags. */ if (mode & ~BLKDEV_FALLOC_FL_SUPPORTED) return -EOPNOTSUPP; /* * Don't allow writing zeroes if the device does not enable the * unmap write zeroes operation. */ if ((mode & FALLOC_FL_WRITE_ZEROES) && !bdev_write_zeroes_unmap_sectors(bdev)) return -EOPNOTSUPP; /* Don't go off the end of the device. */ isize = bdev_nr_bytes(bdev); if (start >= isize) return -EINVAL; if (end >= isize) { if (mode & FALLOC_FL_KEEP_SIZE) { len = isize - start; end = start + len - 1; } else return -EINVAL; } /* * Don't allow IO that isn't aligned to logical block size. */ if ((start | len) & (bdev_logical_block_size(bdev) - 1)) return -EINVAL; inode_lock(inode); filemap_invalidate_lock(inode->i_mapping); switch (mode) { case FALLOC_FL_ZERO_RANGE: case FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE: flags = BLKDEV_ZERO_NOUNMAP; break; case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE: flags = BLKDEV_ZERO_NOFALLBACK; break; case FALLOC_FL_WRITE_ZEROES: flags = 0; break; default: error = -EOPNOTSUPP; goto fail; } /* * Invalidate the page cache, including dirty pages, for valid * de-allocate mode calls to fallocate(). */ error = truncate_bdev_range(bdev, file_to_blk_mode(file), start, end); if (error) goto fail; error = blkdev_issue_zeroout(bdev, start >> SECTOR_SHIFT, len >> SECTOR_SHIFT, GFP_KERNEL, flags); fail: filemap_invalidate_unlock(inode->i_mapping); inode_unlock(inode); return error; } static int blkdev_mmap_prepare(struct vm_area_desc *desc) { struct file *file = desc->file; if (bdev_read_only(I_BDEV(bdev_file_inode(file)))) return generic_file_readonly_mmap_prepare(desc); return generic_file_mmap_prepare(desc); } const struct file_operations def_blk_fops = { .open = blkdev_open, .release = blkdev_release, .llseek = blkdev_llseek, .read_iter = blkdev_read_iter, .write_iter = blkdev_write_iter, .iopoll = iocb_bio_iopoll, .mmap_prepare = blkdev_mmap_prepare, .fsync = blkdev_fsync, .unlocked_ioctl = blkdev_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = compat_blkdev_ioctl, #endif .splice_read = filemap_splice_read, .splice_write = iter_file_splice_write, .fallocate = blkdev_fallocate, .uring_cmd = blkdev_uring_cmd, .fop_flags = FOP_BUFFER_RASYNC, }; static __init int blkdev_init(void) { return bioset_init(&blkdev_dio_pool, 4, offsetof(struct blkdev_dio, bio), BIOSET_NEED_BVECS|BIOSET_PERCPU_CACHE); } module_init(blkdev_init);
36 36 56 48 48 48 56 56 56 56 56 56 56 56 56 56 36 36 36 28 56 55 56 56 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2010 Red Hat, Inc. * Copyright (c) 2016-2021 Christoph Hellwig. */ #include <linux/iomap.h> #include "trace.h" static inline void iomap_iter_reset_iomap(struct iomap_iter *iter) { if (iter->fbatch) { folio_batch_release(iter->fbatch); kfree(iter->fbatch); iter->fbatch = NULL; } iter->status = 0; memset(&iter->iomap, 0, sizeof(iter->iomap)); memset(&iter->srcmap, 0, sizeof(iter->srcmap)); } /* Advance the current iterator position and decrement the remaining length */ int iomap_iter_advance(struct iomap_iter *iter, u64 count) { if (WARN_ON_ONCE(count > iomap_length(iter))) return -EIO; iter->pos += count; iter->len -= count; return 0; } static inline void iomap_iter_done(struct iomap_iter *iter) { WARN_ON_ONCE(iter->iomap.offset > iter->pos); WARN_ON_ONCE(iter->iomap.length == 0); WARN_ON_ONCE(iter->iomap.offset + iter->iomap.length <= iter->pos); WARN_ON_ONCE(iter->iomap.flags & IOMAP_F_STALE); iter->iter_start_pos = iter->pos; trace_iomap_iter_dstmap(iter->inode, &iter->iomap); if (iter->srcmap.type != IOMAP_HOLE) trace_iomap_iter_srcmap(iter->inode, &iter->srcmap); } /** * iomap_iter - iterate over a ranges in a file * @iter: iteration structue * @ops: iomap ops provided by the file system * * Iterate over filesystem-provided space mappings for the provided file range. * * This function handles cleanup of resources acquired for iteration when the * filesystem indicates there are no more space mappings, which means that this * function must be called in a loop that continues as long it returns a * positive value. If 0 or a negative value is returned, the caller must not * return to the loop body. Within a loop body, there are two ways to break out * of the loop body: leave @iter.status unchanged, or set it to a negative * errno. */ int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops) { bool stale = iter->iomap.flags & IOMAP_F_STALE; ssize_t advanced; u64 olen; int ret; trace_iomap_iter(iter, ops, _RET_IP_); if (!iter->iomap.length) goto begin; /* * Calculate how far the iter was advanced and the original length bytes * for ->iomap_end(). */ advanced = iter->pos - iter->iter_start_pos; olen = iter->len + advanced; if (ops->iomap_end) { ret = ops->iomap_end(iter->inode, iter->iter_start_pos, iomap_length_trim(iter, iter->iter_start_pos, olen), advanced, iter->flags, &iter->iomap); if (ret < 0 && !advanced) return ret; } /* detect old return semantics where this would advance */ if (WARN_ON_ONCE(iter->status > 0)) iter->status = -EIO; /* * Use iter->len to determine whether to continue onto the next mapping. * Explicitly terminate on error status or if the current iter has not * advanced at all (i.e. no work was done for some reason) unless the * mapping has been marked stale and needs to be reprocessed. */ if (iter->status < 0) ret = iter->status; else if (iter->len == 0 || (!advanced && !stale)) ret = 0; else ret = 1; iomap_iter_reset_iomap(iter); if (ret <= 0) return ret; begin: ret = ops->iomap_begin(iter->inode, iter->pos, iter->len, iter->flags, &iter->iomap, &iter->srcmap); if (ret < 0) return ret; iomap_iter_done(iter); return 1; }
7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 // SPDX-License-Identifier: GPL-2.0-only // Copyright (c) 2020 Facebook Inc. #include <linux/debugfs.h> #include <linux/netdevice.h> #include <linux/slab.h> #include <net/udp_tunnel.h> #include "netdevsim.h" static int nsim_udp_tunnel_set_port(struct net_device *dev, unsigned int table, unsigned int entry, struct udp_tunnel_info *ti) { struct netdevsim *ns = netdev_priv(dev); int ret; ret = -ns->udp_ports.inject_error; ns->udp_ports.inject_error = 0; if (!ret) { if (ns->udp_ports.ports[table][entry]) { WARN(1, "entry already in use\n"); ret = -EBUSY; } else { ns->udp_ports.ports[table][entry] = be16_to_cpu(ti->port) << 16 | ti->type; } } netdev_info(dev, "set [%d, %d] type %d family %d port %d - %d\n", table, entry, ti->type, ti->sa_family, ntohs(ti->port), ret); return ret; } static int nsim_udp_tunnel_unset_port(struct net_device *dev, unsigned int table, unsigned int entry, struct udp_tunnel_info *ti) { struct netdevsim *ns = netdev_priv(dev); int ret; ret = -ns->udp_ports.inject_error; ns->udp_ports.inject_error = 0; if (!ret) { u32 val = be16_to_cpu(ti->port) << 16 | ti->type; if (val == ns->udp_ports.ports[table][entry]) { ns->udp_ports.ports[table][entry] = 0; } else { WARN(1, "entry not installed %x vs %x\n", val, ns->udp_ports.ports[table][entry]); ret = -ENOENT; } } netdev_info(dev, "unset [%d, %d] type %d family %d port %d - %d\n", table, entry, ti->type, ti->sa_family, ntohs(ti->port), ret); return ret; } static int nsim_udp_tunnel_sync_table(struct net_device *dev, unsigned int table) { struct netdevsim *ns = netdev_priv(dev); struct udp_tunnel_info ti; unsigned int i; int ret; ret = -ns->udp_ports.inject_error; ns->udp_ports.inject_error = 0; for (i = 0; i < NSIM_UDP_TUNNEL_N_PORTS; i++) { udp_tunnel_nic_get_port(dev, table, i, &ti); ns->udp_ports.ports[table][i] = be16_to_cpu(ti.port) << 16 | ti.type; } return ret; } static const struct udp_tunnel_nic_info nsim_udp_tunnel_info = { .set_port = nsim_udp_tunnel_set_port, .unset_port = nsim_udp_tunnel_unset_port, .sync_table = nsim_udp_tunnel_sync_table, .tables = { { .n_entries = NSIM_UDP_TUNNEL_N_PORTS, .tunnel_types = UDP_TUNNEL_TYPE_VXLAN, }, { .n_entries = NSIM_UDP_TUNNEL_N_PORTS, .tunnel_types = UDP_TUNNEL_TYPE_GENEVE | UDP_TUNNEL_TYPE_VXLAN_GPE, }, }, }; static ssize_t nsim_udp_tunnels_info_reset_write(struct file *file, const char __user *data, size_t count, loff_t *ppos) { struct net_device *dev = file->private_data; struct netdevsim *ns = netdev_priv(dev); if (dev->reg_state == NETREG_REGISTERED) { memset(ns->udp_ports.ports, 0, sizeof(ns->udp_ports.__ports)); udp_tunnel_nic_reset_ntf(dev); } return count; } static const struct file_operations nsim_udp_tunnels_info_reset_fops = { .open = simple_open, .write = nsim_udp_tunnels_info_reset_write, .llseek = generic_file_llseek, .owner = THIS_MODULE, }; int nsim_udp_tunnels_info_create(struct nsim_dev *nsim_dev, struct net_device *dev) { struct netdevsim *ns = netdev_priv(dev); struct udp_tunnel_nic_info *info; if (nsim_dev->udp_ports.shared && nsim_dev->udp_ports.open_only) { dev_err(&nsim_dev->nsim_bus_dev->dev, "shared can't be used in conjunction with open_only\n"); return -EINVAL; } if (!nsim_dev->udp_ports.shared) ns->udp_ports.ports = ns->udp_ports.__ports; else ns->udp_ports.ports = nsim_dev->udp_ports.__ports; ns->udp_ports.ddir = debugfs_create_dir("udp_ports", ns->nsim_dev_port->ddir); debugfs_create_u32("inject_error", 0600, ns->udp_ports.ddir, &ns->udp_ports.inject_error); ns->udp_ports.dfs_ports[0].array = ns->udp_ports.ports[0]; ns->udp_ports.dfs_ports[0].n_elements = NSIM_UDP_TUNNEL_N_PORTS; debugfs_create_u32_array("table0", 0400, ns->udp_ports.ddir, &ns->udp_ports.dfs_ports[0]); ns->udp_ports.dfs_ports[1].array = ns->udp_ports.ports[1]; ns->udp_ports.dfs_ports[1].n_elements = NSIM_UDP_TUNNEL_N_PORTS; debugfs_create_u32_array("table1", 0400, ns->udp_ports.ddir, &ns->udp_ports.dfs_ports[1]); debugfs_create_file("reset", 0200, ns->udp_ports.ddir, dev, &nsim_udp_tunnels_info_reset_fops); /* Note: it's not normal to allocate the info struct like this! * Drivers are expected to use a static const one, here we're testing. */ info = kmemdup(&nsim_udp_tunnel_info, sizeof(nsim_udp_tunnel_info), GFP_KERNEL); if (!info) return -ENOMEM; if (nsim_dev->udp_ports.sync_all) { info->set_port = NULL; info->unset_port = NULL; } else { info->sync_table = NULL; } if (nsim_dev->udp_ports.open_only) info->flags |= UDP_TUNNEL_NIC_INFO_OPEN_ONLY; if (nsim_dev->udp_ports.ipv4_only) info->flags |= UDP_TUNNEL_NIC_INFO_IPV4_ONLY; if (nsim_dev->udp_ports.shared) info->shared = &nsim_dev->udp_ports.utn_shared; if (nsim_dev->udp_ports.static_iana_vxlan) info->flags |= UDP_TUNNEL_NIC_INFO_STATIC_IANA_VXLAN; dev->udp_tunnel_nic_info = info; return 0; } void nsim_udp_tunnels_info_destroy(struct net_device *dev) { struct netdevsim *ns = netdev_priv(dev); debugfs_remove_recursive(ns->udp_ports.ddir); kfree(dev->udp_tunnel_nic_info); dev->udp_tunnel_nic_info = NULL; } void nsim_udp_tunnels_debugfs_create(struct nsim_dev *nsim_dev) { debugfs_create_bool("udp_ports_sync_all", 0600, nsim_dev->ddir, &nsim_dev->udp_ports.sync_all); debugfs_create_bool("udp_ports_open_only", 0600, nsim_dev->ddir, &nsim_dev->udp_ports.open_only); debugfs_create_bool("udp_ports_ipv4_only", 0600, nsim_dev->ddir, &nsim_dev->udp_ports.ipv4_only); debugfs_create_bool("udp_ports_shared", 0600, nsim_dev->ddir, &nsim_dev->udp_ports.shared); debugfs_create_bool("udp_ports_static_iana_vxlan", 0600, nsim_dev->ddir, &nsim_dev->udp_ports.static_iana_vxlan); }
3661 3664 3666 19 3664 165 97 108 242 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 // SPDX-License-Identifier: (GPL-2.0-only OR BSD-3-Clause) /* Copyright (C) 2016-2022 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. * * SipHash: a fast short-input PRF * https://131002.net/siphash/ * * This implementation is specifically for SipHash2-4 for a secure PRF * and HalfSipHash1-3/SipHash1-3 for an insecure PRF only suitable for * hashtables. */ #include <linux/siphash.h> #include <linux/unaligned.h> #if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64 #include <linux/dcache.h> #include <asm/word-at-a-time.h> #endif #define SIPROUND SIPHASH_PERMUTATION(v0, v1, v2, v3) #define PREAMBLE(len) \ u64 v0 = SIPHASH_CONST_0; \ u64 v1 = SIPHASH_CONST_1; \ u64 v2 = SIPHASH_CONST_2; \ u64 v3 = SIPHASH_CONST_3; \ u64 b = ((u64)(len)) << 56; \ v3 ^= key->key[1]; \ v2 ^= key->key[0]; \ v1 ^= key->key[1]; \ v0 ^= key->key[0]; #define POSTAMBLE \ v3 ^= b; \ SIPROUND; \ SIPROUND; \ v0 ^= b; \ v2 ^= 0xff; \ SIPROUND; \ SIPROUND; \ SIPROUND; \ SIPROUND; \ return (v0 ^ v1) ^ (v2 ^ v3); #ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS u64 __siphash_aligned(const void *data, size_t len, const siphash_key_t *key) { const u8 *end = data + len - (len % sizeof(u64)); const u8 left = len & (sizeof(u64) - 1); u64 m; PREAMBLE(len) for (; data != end; data += sizeof(u64)) { m = le64_to_cpup(data); v3 ^= m; SIPROUND; SIPROUND; v0 ^= m; } #if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64 if (left) b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) & bytemask_from_count(left))); #else switch (left) { case 7: b |= ((u64)end[6]) << 48; fallthrough; case 6: b |= ((u64)end[5]) << 40; fallthrough; case 5: b |= ((u64)end[4]) << 32; fallthrough; case 4: b |= le32_to_cpup(data); break; case 3: b |= ((u64)end[2]) << 16; fallthrough; case 2: b |= le16_to_cpup(data); break; case 1: b |= end[0]; } #endif POSTAMBLE } EXPORT_SYMBOL(__siphash_aligned); #endif u64 __siphash_unaligned(const void *data, size_t len, const siphash_key_t *key) { const u8 *end = data + len - (len % sizeof(u64)); const u8 left = len & (sizeof(u64) - 1); u64 m; PREAMBLE(len) for (; data != end; data += sizeof(u64)) { m = get_unaligned_le64(data); v3 ^= m; SIPROUND; SIPROUND; v0 ^= m; } #if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64 if (left) b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) & bytemask_from_count(left))); #else switch (left) { case 7: b |= ((u64)end[6]) << 48; fallthrough; case 6: b |= ((u64)end[5]) << 40; fallthrough; case 5: b |= ((u64)end[4]) << 32; fallthrough; case 4: b |= get_unaligned_le32(end); break; case 3: b |= ((u64)end[2]) << 16; fallthrough; case 2: b |= get_unaligned_le16(end); break; case 1: b |= end[0]; } #endif POSTAMBLE } EXPORT_SYMBOL(__siphash_unaligned); /** * siphash_1u64 - compute 64-bit siphash PRF value of a u64 * @first: first u64 * @key: the siphash key */ u64 siphash_1u64(const u64 first, const siphash_key_t *key) { PREAMBLE(8) v3 ^= first; SIPROUND; SIPROUND; v0 ^= first; POSTAMBLE } EXPORT_SYMBOL(siphash_1u64); /** * siphash_2u64 - compute 64-bit siphash PRF value of 2 u64 * @first: first u64 * @second: second u64 * @key: the siphash key */ u64 siphash_2u64(const u64 first, const u64 second, const siphash_key_t *key) { PREAMBLE(16) v3 ^= first; SIPROUND; SIPROUND; v0 ^= first; v3 ^= second; SIPROUND; SIPROUND; v0 ^= second; POSTAMBLE } EXPORT_SYMBOL(siphash_2u64); /** * siphash_3u64 - compute 64-bit siphash PRF value of 3 u64 * @first: first u64 * @second: second u64 * @third: third u64 * @key: the siphash key */ u64 siphash_3u64(const u64 first, const u64 second, const u64 third, const siphash_key_t *key) { PREAMBLE(24) v3 ^= first; SIPROUND; SIPROUND; v0 ^= first; v3 ^= second; SIPROUND; SIPROUND; v0 ^= second; v3 ^= third; SIPROUND; SIPROUND; v0 ^= third; POSTAMBLE } EXPORT_SYMBOL(siphash_3u64); /** * siphash_4u64 - compute 64-bit siphash PRF value of 4 u64 * @first: first u64 * @second: second u64 * @third: third u64 * @forth: forth u64 * @key: the siphash key */ u64 siphash_4u64(const u64 first, const u64 second, const u64 third, const u64 forth, const siphash_key_t *key) { PREAMBLE(32) v3 ^= first; SIPROUND; SIPROUND; v0 ^= first; v3 ^= second; SIPROUND; SIPROUND; v0 ^= second; v3 ^= third; SIPROUND; SIPROUND; v0 ^= third; v3 ^= forth; SIPROUND; SIPROUND; v0 ^= forth; POSTAMBLE } EXPORT_SYMBOL(siphash_4u64); u64 siphash_1u32(const u32 first, const siphash_key_t *key) { PREAMBLE(4) b |= first; POSTAMBLE } EXPORT_SYMBOL(siphash_1u32); u64 siphash_3u32(const u32 first, const u32 second, const u32 third, const siphash_key_t *key) { u64 combined = (u64)second << 32 | first; PREAMBLE(12) v3 ^= combined; SIPROUND; SIPROUND; v0 ^= combined; b |= third; POSTAMBLE } EXPORT_SYMBOL(siphash_3u32); #if BITS_PER_LONG == 64 /* Note that on 64-bit, we make HalfSipHash1-3 actually be SipHash1-3, for * performance reasons. On 32-bit, below, we actually implement HalfSipHash1-3. */ #define HSIPROUND SIPROUND #define HPREAMBLE(len) PREAMBLE(len) #define HPOSTAMBLE \ v3 ^= b; \ HSIPROUND; \ v0 ^= b; \ v2 ^= 0xff; \ HSIPROUND; \ HSIPROUND; \ HSIPROUND; \ return (v0 ^ v1) ^ (v2 ^ v3); #ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS u32 __hsiphash_aligned(const void *data, size_t len, const hsiphash_key_t *key) { const u8 *end = data + len - (len % sizeof(u64)); const u8 left = len & (sizeof(u64) - 1); u64 m; HPREAMBLE(len) for (; data != end; data += sizeof(u64)) { m = le64_to_cpup(data); v3 ^= m; HSIPROUND; v0 ^= m; } #if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64 if (left) b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) & bytemask_from_count(left))); #else switch (left) { case 7: b |= ((u64)end[6]) << 48; fallthrough; case 6: b |= ((u64)end[5]) << 40; fallthrough; case 5: b |= ((u64)end[4]) << 32; fallthrough; case 4: b |= le32_to_cpup(data); break; case 3: b |= ((u64)end[2]) << 16; fallthrough; case 2: b |= le16_to_cpup(data); break; case 1: b |= end[0]; } #endif HPOSTAMBLE } EXPORT_SYMBOL(__hsiphash_aligned); #endif u32 __hsiphash_unaligned(const void *data, size_t len, const hsiphash_key_t *key) { const u8 *end = data + len - (len % sizeof(u64)); const u8 left = len & (sizeof(u64) - 1); u64 m; HPREAMBLE(len) for (; data != end; data += sizeof(u64)) { m = get_unaligned_le64(data); v3 ^= m; HSIPROUND; v0 ^= m; } #if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64 if (left) b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) & bytemask_from_count(left))); #else switch (left) { case 7: b |= ((u64)end[6]) << 48; fallthrough; case 6: b |= ((u64)end[5]) << 40; fallthrough; case 5: b |= ((u64)end[4]) << 32; fallthrough; case 4: b |= get_unaligned_le32(end); break; case 3: b |= ((u64)end[2]) << 16; fallthrough; case 2: b |= get_unaligned_le16(end); break; case 1: b |= end[0]; } #endif HPOSTAMBLE } EXPORT_SYMBOL(__hsiphash_unaligned); /** * hsiphash_1u32 - compute 64-bit hsiphash PRF value of a u32 * @first: first u32 * @key: the hsiphash key */ u32 hsiphash_1u32(const u32 first, const hsiphash_key_t *key) { HPREAMBLE(4) b |= first; HPOSTAMBLE } EXPORT_SYMBOL(hsiphash_1u32); /** * hsiphash_2u32 - compute 32-bit hsiphash PRF value of 2 u32 * @first: first u32 * @second: second u32 * @key: the hsiphash key */ u32 hsiphash_2u32(const u32 first, const u32 second, const hsiphash_key_t *key) { u64 combined = (u64)second << 32 | first; HPREAMBLE(8) v3 ^= combined; HSIPROUND; v0 ^= combined; HPOSTAMBLE } EXPORT_SYMBOL(hsiphash_2u32); /** * hsiphash_3u32 - compute 32-bit hsiphash PRF value of 3 u32 * @first: first u32 * @second: second u32 * @third: third u32 * @key: the hsiphash key */ u32 hsiphash_3u32(const u32 first, const u32 second, const u32 third, const hsiphash_key_t *key) { u64 combined = (u64)second << 32 | first; HPREAMBLE(12) v3 ^= combined; HSIPROUND; v0 ^= combined; b |= third; HPOSTAMBLE } EXPORT_SYMBOL(hsiphash_3u32); /** * hsiphash_4u32 - compute 32-bit hsiphash PRF value of 4 u32 * @first: first u32 * @second: second u32 * @third: third u32 * @forth: forth u32 * @key: the hsiphash key */ u32 hsiphash_4u32(const u32 first, const u32 second, const u32 third, const u32 forth, const hsiphash_key_t *key) { u64 combined = (u64)second << 32 | first; HPREAMBLE(16) v3 ^= combined; HSIPROUND; v0 ^= combined; combined = (u64)forth << 32 | third; v3 ^= combined; HSIPROUND; v0 ^= combined; HPOSTAMBLE } EXPORT_SYMBOL(hsiphash_4u32); #else #define HSIPROUND HSIPHASH_PERMUTATION(v0, v1, v2, v3) #define HPREAMBLE(len) \ u32 v0 = HSIPHASH_CONST_0; \ u32 v1 = HSIPHASH_CONST_1; \ u32 v2 = HSIPHASH_CONST_2; \ u32 v3 = HSIPHASH_CONST_3; \ u32 b = ((u32)(len)) << 24; \ v3 ^= key->key[1]; \ v2 ^= key->key[0]; \ v1 ^= key->key[1]; \ v0 ^= key->key[0]; #define HPOSTAMBLE \ v3 ^= b; \ HSIPROUND; \ v0 ^= b; \ v2 ^= 0xff; \ HSIPROUND; \ HSIPROUND; \ HSIPROUND; \ return v1 ^ v3; #ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS u32 __hsiphash_aligned(const void *data, size_t len, const hsiphash_key_t *key) { const u8 *end = data + len - (len % sizeof(u32)); const u8 left = len & (sizeof(u32) - 1); u32 m; HPREAMBLE(len) for (; data != end; data += sizeof(u32)) { m = le32_to_cpup(data); v3 ^= m; HSIPROUND; v0 ^= m; } switch (left) { case 3: b |= ((u32)end[2]) << 16; fallthrough; case 2: b |= le16_to_cpup(data); break; case 1: b |= end[0]; } HPOSTAMBLE } EXPORT_SYMBOL(__hsiphash_aligned); #endif u32 __hsiphash_unaligned(const void *data, size_t len, const hsiphash_key_t *key) { const u8 *end = data + len - (len % sizeof(u32)); const u8 left = len & (sizeof(u32) - 1); u32 m; HPREAMBLE(len) for (; data != end; data += sizeof(u32)) { m = get_unaligned_le32(data); v3 ^= m; HSIPROUND; v0 ^= m; } switch (left) { case 3: b |= ((u32)end[2]) << 16; fallthrough; case 2: b |= get_unaligned_le16(end); break; case 1: b |= end[0]; } HPOSTAMBLE } EXPORT_SYMBOL(__hsiphash_unaligned); /** * hsiphash_1u32 - compute 32-bit hsiphash PRF value of a u32 * @first: first u32 * @key: the hsiphash key */ u32 hsiphash_1u32(const u32 first, const hsiphash_key_t *key) { HPREAMBLE(4) v3 ^= first; HSIPROUND; v0 ^= first; HPOSTAMBLE } EXPORT_SYMBOL(hsiphash_1u32); /** * hsiphash_2u32 - compute 32-bit hsiphash PRF value of 2 u32 * @first: first u32 * @second: second u32 * @key: the hsiphash key */ u32 hsiphash_2u32(const u32 first, const u32 second, const hsiphash_key_t *key) { HPREAMBLE(8) v3 ^= first; HSIPROUND; v0 ^= first; v3 ^= second; HSIPROUND; v0 ^= second; HPOSTAMBLE } EXPORT_SYMBOL(hsiphash_2u32); /** * hsiphash_3u32 - compute 32-bit hsiphash PRF value of 3 u32 * @first: first u32 * @second: second u32 * @third: third u32 * @key: the hsiphash key */ u32 hsiphash_3u32(const u32 first, const u32 second, const u32 third, const hsiphash_key_t *key) { HPREAMBLE(12) v3 ^= first; HSIPROUND; v0 ^= first; v3 ^= second; HSIPROUND; v0 ^= second; v3 ^= third; HSIPROUND; v0 ^= third; HPOSTAMBLE } EXPORT_SYMBOL(hsiphash_3u32); /** * hsiphash_4u32 - compute 32-bit hsiphash PRF value of 4 u32 * @first: first u32 * @second: second u32 * @third: third u32 * @forth: forth u32 * @key: the hsiphash key */ u32 hsiphash_4u32(const u32 first, const u32 second, const u32 third, const u32 forth, const hsiphash_key_t *key) { HPREAMBLE(16) v3 ^= first; HSIPROUND; v0 ^= first; v3 ^= second; HSIPROUND; v0 ^= second; v3 ^= third; HSIPROUND; v0 ^= third; v3 ^= forth; HSIPROUND; v0 ^= forth; HPOSTAMBLE } EXPORT_SYMBOL(hsiphash_4u32); #endif
5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 // SPDX-License-Identifier: GPL-2.0-only /* * scsi_sysfs.c * * SCSI sysfs interface routines. * * Created to pull SCSI mid layer sysfs routines into one file. */ #include <linux/module.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/blkdev.h> #include <linux/device.h> #include <linux/pm_runtime.h> #include <linux/bsg.h> #include <scsi/scsi.h> #include <scsi/scsi_device.h> #include <scsi/scsi_host.h> #include <scsi/scsi_tcq.h> #include <scsi/scsi_dh.h> #include <scsi/scsi_transport.h> #include <scsi/scsi_driver.h> #include <scsi/scsi_devinfo.h> #include "scsi_priv.h" #include "scsi_logging.h" static const struct device_type scsi_dev_type; static const struct { enum scsi_device_state value; char *name; } sdev_states[] = { { SDEV_CREATED, "created" }, { SDEV_RUNNING, "running" }, { SDEV_CANCEL, "cancel" }, { SDEV_DEL, "deleted" }, { SDEV_QUIESCE, "quiesce" }, { SDEV_OFFLINE, "offline" }, { SDEV_TRANSPORT_OFFLINE, "transport-offline" }, { SDEV_BLOCK, "blocked" }, { SDEV_CREATED_BLOCK, "created-blocked" }, }; const char *scsi_device_state_name(enum scsi_device_state state) { int i; char *name = NULL; for (i = 0; i < ARRAY_SIZE(sdev_states); i++) { if (sdev_states[i].value == state) { name = sdev_states[i].name; break; } } return name; } static const struct { enum scsi_host_state value; char *name; } shost_states[] = { { SHOST_CREATED, "created" }, { SHOST_RUNNING, "running" }, { SHOST_CANCEL, "cancel" }, { SHOST_DEL, "deleted" }, { SHOST_RECOVERY, "recovery" }, { SHOST_CANCEL_RECOVERY, "cancel/recovery" }, { SHOST_DEL_RECOVERY, "deleted/recovery", }, }; const char *scsi_host_state_name(enum scsi_host_state state) { int i; char *name = NULL; for (i = 0; i < ARRAY_SIZE(shost_states); i++) { if (shost_states[i].value == state) { name = shost_states[i].name; break; } } return name; } #ifdef CONFIG_SCSI_DH static const struct { unsigned char value; char *name; } sdev_access_states[] = { { SCSI_ACCESS_STATE_OPTIMAL, "active/optimized" }, { SCSI_ACCESS_STATE_ACTIVE, "active/non-optimized" }, { SCSI_ACCESS_STATE_STANDBY, "standby" }, { SCSI_ACCESS_STATE_UNAVAILABLE, "unavailable" }, { SCSI_ACCESS_STATE_LBA, "lba-dependent" }, { SCSI_ACCESS_STATE_OFFLINE, "offline" }, { SCSI_ACCESS_STATE_TRANSITIONING, "transitioning" }, }; static const char *scsi_access_state_name(unsigned char state) { int i; char *name = NULL; for (i = 0; i < ARRAY_SIZE(sdev_access_states); i++) { if (sdev_access_states[i].value == state) { name = sdev_access_states[i].name; break; } } return name; } #endif static int check_set(unsigned long long *val, char *src) { char *last; if (strcmp(src, "-") == 0) { *val = SCAN_WILD_CARD; } else { /* * Doesn't check for int overflow */ *val = simple_strtoull(src, &last, 0); if (*last != '\0') return 1; } return 0; } static int scsi_scan(struct Scsi_Host *shost, const char *str) { char s1[15], s2[15], s3[17], junk; unsigned long long channel, id, lun; int res; res = sscanf(str, "%10s %10s %16s %c", s1, s2, s3, &junk); if (res != 3) return -EINVAL; if (check_set(&channel, s1)) return -EINVAL; if (check_set(&id, s2)) return -EINVAL; if (check_set(&lun, s3)) return -EINVAL; if (shost->transportt->user_scan) res = shost->transportt->user_scan(shost, channel, id, lun); else res = scsi_scan_host_selected(shost, channel, id, lun, SCSI_SCAN_MANUAL); return res; } /* * shost_show_function: macro to create an attr function that can be used to * show a non-bit field. */ #define shost_show_function(name, field, format_string) \ static ssize_t \ show_##name (struct device *dev, struct device_attribute *attr, \ char *buf) \ { \ struct Scsi_Host *shost = class_to_shost(dev); \ return snprintf (buf, 20, format_string, shost->field); \ } /* * shost_rd_attr: macro to create a function and attribute variable for a * read only field. */ #define shost_rd_attr2(name, field, format_string) \ shost_show_function(name, field, format_string) \ static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL); #define shost_rd_attr(field, format_string) \ shost_rd_attr2(field, field, format_string) /* * Create the actual show/store functions and data structures. */ static ssize_t store_scan(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct Scsi_Host *shost = class_to_shost(dev); int res; res = scsi_scan(shost, buf); if (res == 0) res = count; return res; }; static DEVICE_ATTR(scan, S_IWUSR, NULL, store_scan); static ssize_t store_shost_state(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { int i; struct Scsi_Host *shost = class_to_shost(dev); enum scsi_host_state state = 0; for (i = 0; i < ARRAY_SIZE(shost_states); i++) { const int len = strlen(shost_states[i].name); if (strncmp(shost_states[i].name, buf, len) == 0 && buf[len] == '\n') { state = shost_states[i].value; break; } } if (!state) return -EINVAL; if (scsi_host_set_state(shost, state)) return -EINVAL; return count; } static ssize_t show_shost_state(struct device *dev, struct device_attribute *attr, char *buf) { struct Scsi_Host *shost = class_to_shost(dev); const char *name = scsi_host_state_name(shost->shost_state); if (!name) return -EINVAL; return snprintf(buf, 20, "%s\n", name); } /* DEVICE_ATTR(state) clashes with dev_attr_state for sdev */ static struct device_attribute dev_attr_hstate = __ATTR(state, S_IRUGO | S_IWUSR, show_shost_state, store_shost_state); static ssize_t show_shost_mode(unsigned int mode, char *buf) { ssize_t len = 0; if (mode & MODE_INITIATOR) len = sprintf(buf, "%s", "Initiator"); if (mode & MODE_TARGET) len += sprintf(buf + len, "%s%s", len ? ", " : "", "Target"); len += sprintf(buf + len, "\n"); return len; } static ssize_t show_shost_supported_mode(struct device *dev, struct device_attribute *attr, char *buf) { struct Scsi_Host *shost = class_to_shost(dev); unsigned int supported_mode = shost->hostt->supported_mode; if (supported_mode == MODE_UNKNOWN) /* by default this should be initiator */ supported_mode = MODE_INITIATOR; return show_shost_mode(supported_mode, buf); } static DEVICE_ATTR(supported_mode, S_IRUGO, show_shost_supported_mode, NULL); static ssize_t show_shost_active_mode(struct device *dev, struct device_attribute *attr, char *buf) { struct Scsi_Host *shost = class_to_shost(dev); if (shost->active_mode == MODE_UNKNOWN) return snprintf(buf, 20, "unknown\n"); else return show_shost_mode(shost->active_mode, buf); } static DEVICE_ATTR(active_mode, S_IRUGO, show_shost_active_mode, NULL); static int check_reset_type(const char *str) { if (sysfs_streq(str, "adapter")) return SCSI_ADAPTER_RESET; else if (sysfs_streq(str, "firmware")) return SCSI_FIRMWARE_RESET; else return 0; } static ssize_t store_host_reset(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct Scsi_Host *shost = class_to_shost(dev); const struct scsi_host_template *sht = shost->hostt; int ret = -EINVAL; int type; type = check_reset_type(buf); if (!type) goto exit_store_host_reset; if (sht->host_reset) ret = sht->host_reset(shost, type); else ret = -EOPNOTSUPP; exit_store_host_reset: if (ret == 0) ret = count; return ret; } static DEVICE_ATTR(host_reset, S_IWUSR, NULL, store_host_reset); static ssize_t show_shost_eh_deadline(struct device *dev, struct device_attribute *attr, char *buf) { struct Scsi_Host *shost = class_to_shost(dev); if (shost->eh_deadline == -1) return snprintf(buf, strlen("off") + 2, "off\n"); return sprintf(buf, "%u\n", shost->eh_deadline / HZ); } static ssize_t store_shost_eh_deadline(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct Scsi_Host *shost = class_to_shost(dev); int ret = -EINVAL; unsigned long deadline, flags; if (shost->transportt && (shost->transportt->eh_strategy_handler || !shost->hostt->eh_host_reset_handler)) return ret; if (!strncmp(buf, "off", strlen("off"))) deadline = -1; else { ret = kstrtoul(buf, 10, &deadline); if (ret) return ret; if (deadline * HZ > UINT_MAX) return -EINVAL; } spin_lock_irqsave(shost->host_lock, flags); if (scsi_host_in_recovery(shost)) ret = -EBUSY; else { if (deadline == -1) shost->eh_deadline = -1; else shost->eh_deadline = deadline * HZ; ret = count; } spin_unlock_irqrestore(shost->host_lock, flags); return ret; } static DEVICE_ATTR(eh_deadline, S_IRUGO | S_IWUSR, show_shost_eh_deadline, store_shost_eh_deadline); shost_rd_attr(unique_id, "%u\n"); shost_rd_attr(cmd_per_lun, "%hd\n"); shost_rd_attr(can_queue, "%d\n"); shost_rd_attr(sg_tablesize, "%hu\n"); shost_rd_attr(sg_prot_tablesize, "%hu\n"); shost_rd_attr(prot_capabilities, "%u\n"); shost_rd_attr(prot_guard_type, "%hd\n"); shost_rd_attr2(proc_name, hostt->proc_name, "%s\n"); static ssize_t show_host_busy(struct device *dev, struct device_attribute *attr, char *buf) { struct Scsi_Host *shost = class_to_shost(dev); return snprintf(buf, 20, "%d\n", scsi_host_busy(shost)); } static DEVICE_ATTR(host_busy, S_IRUGO, show_host_busy, NULL); static ssize_t show_use_blk_mq(struct device *dev, struct device_attribute *attr, char *buf) { return sprintf(buf, "1\n"); } static DEVICE_ATTR(use_blk_mq, S_IRUGO, show_use_blk_mq, NULL); static ssize_t show_nr_hw_queues(struct device *dev, struct device_attribute *attr, char *buf) { struct Scsi_Host *shost = class_to_shost(dev); struct blk_mq_tag_set *tag_set = &shost->tag_set; return snprintf(buf, 20, "%d\n", tag_set->nr_hw_queues); } static DEVICE_ATTR(nr_hw_queues, S_IRUGO, show_nr_hw_queues, NULL); static struct attribute *scsi_sysfs_shost_attrs[] = { &dev_attr_use_blk_mq.attr, &dev_attr_unique_id.attr, &dev_attr_host_busy.attr, &dev_attr_cmd_per_lun.attr, &dev_attr_can_queue.attr, &dev_attr_sg_tablesize.attr, &dev_attr_sg_prot_tablesize.attr, &dev_attr_proc_name.attr, &dev_attr_scan.attr, &dev_attr_hstate.attr, &dev_attr_supported_mode.attr, &dev_attr_active_mode.attr, &dev_attr_prot_capabilities.attr, &dev_attr_prot_guard_type.attr, &dev_attr_host_reset.attr, &dev_attr_eh_deadline.attr, &dev_attr_nr_hw_queues.attr, NULL }; static const struct attribute_group scsi_shost_attr_group = { .attrs = scsi_sysfs_shost_attrs, }; const struct attribute_group *scsi_shost_groups[] = { &scsi_shost_attr_group, NULL }; static void scsi_device_cls_release(struct device *class_dev) { struct scsi_device *sdev; sdev = class_to_sdev(class_dev); put_device(&sdev->sdev_gendev); } static void scsi_device_dev_release(struct device *dev) { struct scsi_device *sdev = to_scsi_device(dev); struct device *parent; struct list_head *this, *tmp; struct scsi_vpd *vpd_pg80 = NULL, *vpd_pg83 = NULL; struct scsi_vpd *vpd_pg0 = NULL, *vpd_pg89 = NULL; struct scsi_vpd *vpd_pgb0 = NULL, *vpd_pgb1 = NULL, *vpd_pgb2 = NULL; struct scsi_vpd *vpd_pgb7 = NULL; unsigned long flags; might_sleep(); scsi_dh_release_device(sdev); parent = sdev->sdev_gendev.parent; spin_lock_irqsave(sdev->host->host_lock, flags); list_del(&sdev->siblings); list_del(&sdev->same_target_siblings); list_del(&sdev->starved_entry); spin_unlock_irqrestore(sdev->host->host_lock, flags); cancel_work_sync(&sdev->event_work); list_for_each_safe(this, tmp, &sdev->event_list) { struct scsi_event *evt; evt = list_entry(this, struct scsi_event, node); list_del(&evt->node); kfree(evt); } blk_put_queue(sdev->request_queue); /* NULL queue means the device can't be used */ sdev->request_queue = NULL; sbitmap_free(&sdev->budget_map); mutex_lock(&sdev->inquiry_mutex); vpd_pg0 = rcu_replace_pointer(sdev->vpd_pg0, vpd_pg0, lockdep_is_held(&sdev->inquiry_mutex)); vpd_pg80 = rcu_replace_pointer(sdev->vpd_pg80, vpd_pg80, lockdep_is_held(&sdev->inquiry_mutex)); vpd_pg83 = rcu_replace_pointer(sdev->vpd_pg83, vpd_pg83, lockdep_is_held(&sdev->inquiry_mutex)); vpd_pg89 = rcu_replace_pointer(sdev->vpd_pg89, vpd_pg89, lockdep_is_held(&sdev->inquiry_mutex)); vpd_pgb0 = rcu_replace_pointer(sdev->vpd_pgb0, vpd_pgb0, lockdep_is_held(&sdev->inquiry_mutex)); vpd_pgb1 = rcu_replace_pointer(sdev->vpd_pgb1, vpd_pgb1, lockdep_is_held(&sdev->inquiry_mutex)); vpd_pgb2 = rcu_replace_pointer(sdev->vpd_pgb2, vpd_pgb2, lockdep_is_held(&sdev->inquiry_mutex)); vpd_pgb7 = rcu_replace_pointer(sdev->vpd_pgb7, vpd_pgb7, lockdep_is_held(&sdev->inquiry_mutex)); mutex_unlock(&sdev->inquiry_mutex); if (vpd_pg0) kfree_rcu(vpd_pg0, rcu); if (vpd_pg83) kfree_rcu(vpd_pg83, rcu); if (vpd_pg80) kfree_rcu(vpd_pg80, rcu); if (vpd_pg89) kfree_rcu(vpd_pg89, rcu); if (vpd_pgb0) kfree_rcu(vpd_pgb0, rcu); if (vpd_pgb1) kfree_rcu(vpd_pgb1, rcu); if (vpd_pgb2) kfree_rcu(vpd_pgb2, rcu); if (vpd_pgb7) kfree_rcu(vpd_pgb7, rcu); kfree(sdev->inquiry); kfree(sdev); if (parent) put_device(parent); } static struct class sdev_class = { .name = "scsi_device", .dev_release = scsi_device_cls_release, }; /* all probing is done in the individual ->probe routines */ static int scsi_bus_match(struct device *dev, const struct device_driver *gendrv) { struct scsi_device *sdp; if (dev->type != &scsi_dev_type) return 0; sdp = to_scsi_device(dev); if (sdp->no_uld_attach) return 0; return (sdp->inq_periph_qual == SCSI_INQ_PQ_CON)? 1: 0; } static int scsi_bus_uevent(const struct device *dev, struct kobj_uevent_env *env) { const struct scsi_device *sdev; if (dev->type != &scsi_dev_type) return 0; sdev = to_scsi_device(dev); add_uevent_var(env, "MODALIAS=" SCSI_DEVICE_MODALIAS_FMT, sdev->type); return 0; } const struct bus_type scsi_bus_type = { .name = "scsi", .match = scsi_bus_match, .uevent = scsi_bus_uevent, #ifdef CONFIG_PM .pm = &scsi_bus_pm_ops, #endif }; int scsi_sysfs_register(void) { int error; error = bus_register(&scsi_bus_type); if (!error) { error = class_register(&sdev_class); if (error) bus_unregister(&scsi_bus_type); } return error; } void scsi_sysfs_unregister(void) { class_unregister(&sdev_class); bus_unregister(&scsi_bus_type); } /* * sdev_show_function: macro to create an attr function that can be used to * show a non-bit field. */ #define sdev_show_function(field, format_string) \ static ssize_t \ sdev_show_##field (struct device *dev, struct device_attribute *attr, \ char *buf) \ { \ struct scsi_device *sdev; \ sdev = to_scsi_device(dev); \ return snprintf (buf, 20, format_string, sdev->field); \ } \ /* * sdev_rd_attr: macro to create a function and attribute variable for a * read only field. */ #define sdev_rd_attr(field, format_string) \ sdev_show_function(field, format_string) \ static DEVICE_ATTR(field, S_IRUGO, sdev_show_##field, NULL); /* * Create the actual show/store functions and data structures. */ sdev_rd_attr (type, "%d\n"); sdev_rd_attr (scsi_level, "%d\n"); sdev_rd_attr (vendor, "%.8s\n"); sdev_rd_attr (model, "%.16s\n"); sdev_rd_attr (rev, "%.4s\n"); sdev_rd_attr (cdl_supported, "%d\n"); static ssize_t sdev_show_device_busy(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_device *sdev = to_scsi_device(dev); return snprintf(buf, 20, "%d\n", scsi_device_busy(sdev)); } static DEVICE_ATTR(device_busy, S_IRUGO, sdev_show_device_busy, NULL); static ssize_t sdev_show_device_blocked(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_device *sdev = to_scsi_device(dev); return snprintf(buf, 20, "%d\n", atomic_read(&sdev->device_blocked)); } static DEVICE_ATTR(device_blocked, S_IRUGO, sdev_show_device_blocked, NULL); /* * TODO: can we make these symlinks to the block layer ones? */ static ssize_t sdev_show_timeout (struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_device *sdev; sdev = to_scsi_device(dev); return snprintf(buf, 20, "%d\n", sdev->request_queue->rq_timeout / HZ); } static ssize_t sdev_store_timeout (struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct scsi_device *sdev = to_scsi_device(dev); int ret, timeout; ret = kstrtoint(buf, 0, &timeout); if (ret) return ret; if (timeout <= 0) return -EINVAL; blk_queue_rq_timeout(sdev->request_queue, timeout * HZ); return count; } static DEVICE_ATTR(timeout, S_IRUGO | S_IWUSR, sdev_show_timeout, sdev_store_timeout); static ssize_t sdev_show_eh_timeout(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_device *sdev; sdev = to_scsi_device(dev); return snprintf(buf, 20, "%u\n", sdev->eh_timeout / HZ); } static ssize_t sdev_store_eh_timeout(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct scsi_device *sdev; unsigned int eh_timeout; int err; if (!capable(CAP_SYS_ADMIN)) return -EACCES; sdev = to_scsi_device(dev); err = kstrtouint(buf, 10, &eh_timeout); if (err) return err; sdev->eh_timeout = eh_timeout * HZ; return count; } static DEVICE_ATTR(eh_timeout, S_IRUGO | S_IWUSR, sdev_show_eh_timeout, sdev_store_eh_timeout); static ssize_t store_rescan_field (struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { scsi_rescan_device(to_scsi_device(dev)); return count; } static DEVICE_ATTR(rescan, S_IWUSR, NULL, store_rescan_field); static ssize_t sdev_store_delete(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct kernfs_node *kn; struct scsi_device *sdev = to_scsi_device(dev); /* * We need to try to get module, avoiding the module been removed * during delete. */ if (scsi_device_get(sdev)) return -ENODEV; kn = sysfs_break_active_protection(&dev->kobj, &attr->attr); WARN_ON_ONCE(!kn); /* * Concurrent writes into the "delete" sysfs attribute may trigger * concurrent calls to device_remove_file() and scsi_remove_device(). * device_remove_file() handles concurrent removal calls by * serializing these and by ignoring the second and later removal * attempts. Concurrent calls of scsi_remove_device() are * serialized. The second and later calls of scsi_remove_device() are * ignored because the first call of that function changes the device * state into SDEV_DEL. */ device_remove_file(dev, attr); scsi_remove_device(sdev); if (kn) sysfs_unbreak_active_protection(kn); scsi_device_put(sdev); return count; }; static DEVICE_ATTR(delete, S_IWUSR, NULL, sdev_store_delete); static ssize_t store_state_field(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { int i, ret; struct scsi_device *sdev = to_scsi_device(dev); enum scsi_device_state state = 0; bool rescan_dev = false; for (i = 0; i < ARRAY_SIZE(sdev_states); i++) { const int len = strlen(sdev_states[i].name); if (strncmp(sdev_states[i].name, buf, len) == 0 && buf[len] == '\n') { state = sdev_states[i].value; break; } } switch (state) { case SDEV_RUNNING: case SDEV_OFFLINE: break; default: return -EINVAL; } mutex_lock(&sdev->state_mutex); switch (sdev->sdev_state) { case SDEV_RUNNING: case SDEV_OFFLINE: break; default: mutex_unlock(&sdev->state_mutex); return -EINVAL; } if (sdev->sdev_state == SDEV_RUNNING && state == SDEV_RUNNING) { ret = 0; } else { ret = scsi_device_set_state(sdev, state); if (ret == 0 && state == SDEV_RUNNING) rescan_dev = true; } mutex_unlock(&sdev->state_mutex); if (rescan_dev) { /* * If the device state changes to SDEV_RUNNING, we need to * run the queue to avoid I/O hang, and rescan the device * to revalidate it. Running the queue first is necessary * because another thread may be waiting inside * blk_mq_freeze_queue_wait() and because that call may be * waiting for pending I/O to finish. */ blk_mq_run_hw_queues(sdev->request_queue, true); scsi_rescan_device(sdev); } return ret == 0 ? count : -EINVAL; } static ssize_t show_state_field(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_device *sdev = to_scsi_device(dev); const char *name = scsi_device_state_name(sdev->sdev_state); if (!name) return -EINVAL; return snprintf(buf, 20, "%s\n", name); } static DEVICE_ATTR(state, S_IRUGO | S_IWUSR, show_state_field, store_state_field); static ssize_t show_queue_type_field(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_device *sdev = to_scsi_device(dev); const char *name = "none"; if (sdev->simple_tags) name = "simple"; return snprintf(buf, 20, "%s\n", name); } static ssize_t store_queue_type_field(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct scsi_device *sdev = to_scsi_device(dev); if (!sdev->tagged_supported) return -EINVAL; sdev_printk(KERN_INFO, sdev, "ignoring write to deprecated queue_type attribute"); return count; } static DEVICE_ATTR(queue_type, S_IRUGO | S_IWUSR, show_queue_type_field, store_queue_type_field); #define sdev_vpd_pg_attr(_page) \ static ssize_t \ show_vpd_##_page(struct file *filp, struct kobject *kobj, \ const struct bin_attribute *bin_attr, \ char *buf, loff_t off, size_t count) \ { \ struct device *dev = kobj_to_dev(kobj); \ struct scsi_device *sdev = to_scsi_device(dev); \ struct scsi_vpd *vpd_page; \ int ret = -EINVAL; \ \ rcu_read_lock(); \ vpd_page = rcu_dereference(sdev->vpd_##_page); \ if (vpd_page) \ ret = memory_read_from_buffer(buf, count, &off, \ vpd_page->data, vpd_page->len); \ rcu_read_unlock(); \ return ret; \ } \ static const struct bin_attribute dev_attr_vpd_##_page = { \ .attr = {.name = __stringify(vpd_##_page), .mode = S_IRUGO }, \ .size = 0, \ .read = show_vpd_##_page, \ }; sdev_vpd_pg_attr(pg83); sdev_vpd_pg_attr(pg80); sdev_vpd_pg_attr(pg89); sdev_vpd_pg_attr(pgb0); sdev_vpd_pg_attr(pgb1); sdev_vpd_pg_attr(pgb2); sdev_vpd_pg_attr(pgb7); sdev_vpd_pg_attr(pg0); static ssize_t show_inquiry(struct file *filep, struct kobject *kobj, const struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { struct device *dev = kobj_to_dev(kobj); struct scsi_device *sdev = to_scsi_device(dev); if (!sdev->inquiry) return -EINVAL; return memory_read_from_buffer(buf, count, &off, sdev->inquiry, sdev->inquiry_len); } static const struct bin_attribute dev_attr_inquiry = { .attr = { .name = "inquiry", .mode = S_IRUGO, }, .size = 0, .read = show_inquiry, }; static ssize_t show_iostat_counterbits(struct device *dev, struct device_attribute *attr, char *buf) { return snprintf(buf, 20, "%d\n", (int)sizeof(atomic_t) * 8); } static DEVICE_ATTR(iocounterbits, S_IRUGO, show_iostat_counterbits, NULL); #define show_sdev_iostat(field) \ static ssize_t \ show_iostat_##field(struct device *dev, struct device_attribute *attr, \ char *buf) \ { \ struct scsi_device *sdev = to_scsi_device(dev); \ unsigned long long count = atomic_read(&sdev->field); \ return snprintf(buf, 20, "0x%llx\n", count); \ } \ static DEVICE_ATTR(field, S_IRUGO, show_iostat_##field, NULL) show_sdev_iostat(iorequest_cnt); show_sdev_iostat(iodone_cnt); show_sdev_iostat(ioerr_cnt); show_sdev_iostat(iotmo_cnt); static ssize_t sdev_show_modalias(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_device *sdev; sdev = to_scsi_device(dev); return snprintf (buf, 20, SCSI_DEVICE_MODALIAS_FMT "\n", sdev->type); } static DEVICE_ATTR(modalias, S_IRUGO, sdev_show_modalias, NULL); #define DECLARE_EVT_SHOW(name, Cap_name) \ static ssize_t \ sdev_show_evt_##name(struct device *dev, struct device_attribute *attr, \ char *buf) \ { \ struct scsi_device *sdev = to_scsi_device(dev); \ int val = test_bit(SDEV_EVT_##Cap_name, sdev->supported_events);\ return snprintf(buf, 20, "%d\n", val); \ } #define DECLARE_EVT_STORE(name, Cap_name) \ static ssize_t \ sdev_store_evt_##name(struct device *dev, struct device_attribute *attr,\ const char *buf, size_t count) \ { \ struct scsi_device *sdev = to_scsi_device(dev); \ int val = simple_strtoul(buf, NULL, 0); \ if (val == 0) \ clear_bit(SDEV_EVT_##Cap_name, sdev->supported_events); \ else if (val == 1) \ set_bit(SDEV_EVT_##Cap_name, sdev->supported_events); \ else \ return -EINVAL; \ return count; \ } #define DECLARE_EVT(name, Cap_name) \ DECLARE_EVT_SHOW(name, Cap_name) \ DECLARE_EVT_STORE(name, Cap_name) \ static DEVICE_ATTR(evt_##name, S_IRUGO, sdev_show_evt_##name, \ sdev_store_evt_##name); #define REF_EVT(name) &dev_attr_evt_##name.attr DECLARE_EVT(media_change, MEDIA_CHANGE) DECLARE_EVT(inquiry_change_reported, INQUIRY_CHANGE_REPORTED) DECLARE_EVT(capacity_change_reported, CAPACITY_CHANGE_REPORTED) DECLARE_EVT(soft_threshold_reached, SOFT_THRESHOLD_REACHED_REPORTED) DECLARE_EVT(mode_parameter_change_reported, MODE_PARAMETER_CHANGE_REPORTED) DECLARE_EVT(lun_change_reported, LUN_CHANGE_REPORTED) static ssize_t sdev_store_queue_depth(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { int depth, retval; struct scsi_device *sdev = to_scsi_device(dev); const struct scsi_host_template *sht = sdev->host->hostt; if (!sht->change_queue_depth) return -EINVAL; depth = simple_strtoul(buf, NULL, 0); if (depth < 1 || depth > sdev->host->can_queue) return -EINVAL; retval = sht->change_queue_depth(sdev, depth); if (retval < 0) return retval; sdev->max_queue_depth = sdev->queue_depth; return count; } sdev_show_function(queue_depth, "%d\n"); static DEVICE_ATTR(queue_depth, S_IRUGO | S_IWUSR, sdev_show_queue_depth, sdev_store_queue_depth); static ssize_t sdev_show_wwid(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_device *sdev = to_scsi_device(dev); ssize_t count; count = scsi_vpd_lun_id(sdev, buf, PAGE_SIZE); if (count > 0) { buf[count] = '\n'; count++; } return count; } static DEVICE_ATTR(wwid, S_IRUGO, sdev_show_wwid, NULL); #define BLIST_FLAG_NAME(name) \ [const_ilog2((__force __u64)BLIST_##name)] = #name static const char *const sdev_bflags_name[] = { #include "scsi_devinfo_tbl.c" }; #undef BLIST_FLAG_NAME static ssize_t sdev_show_blacklist(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_device *sdev = to_scsi_device(dev); int i; ssize_t len = 0; for (i = 0; i < sizeof(sdev->sdev_bflags) * BITS_PER_BYTE; i++) { const char *name = NULL; if (!(sdev->sdev_bflags & (__force blist_flags_t)BIT(i))) continue; if (i < ARRAY_SIZE(sdev_bflags_name) && sdev_bflags_name[i]) name = sdev_bflags_name[i]; if (name) len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? " " : "", name); else len += scnprintf(buf + len, PAGE_SIZE - len, "%sINVALID_BIT(%d)", len ? " " : "", i); } if (len) len += scnprintf(buf + len, PAGE_SIZE - len, "\n"); return len; } static DEVICE_ATTR(blacklist, S_IRUGO, sdev_show_blacklist, NULL); #ifdef CONFIG_SCSI_DH static ssize_t sdev_show_dh_state(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_device *sdev = to_scsi_device(dev); if (!sdev->handler) return snprintf(buf, 20, "detached\n"); return snprintf(buf, 20, "%s\n", sdev->handler->name); } static ssize_t sdev_store_dh_state(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct scsi_device *sdev = to_scsi_device(dev); int err = -EINVAL; if (sdev->sdev_state == SDEV_CANCEL || sdev->sdev_state == SDEV_DEL) return -ENODEV; if (!sdev->handler) { /* * Attach to a device handler */ err = scsi_dh_attach(sdev->request_queue, buf); } else if (!strncmp(buf, "activate", 8)) { /* * Activate a device handler */ if (sdev->handler->activate) err = sdev->handler->activate(sdev, NULL, NULL); else err = 0; } else if (!strncmp(buf, "detach", 6)) { /* * Detach from a device handler */ sdev_printk(KERN_WARNING, sdev, "can't detach handler %s.\n", sdev->handler->name); err = -EINVAL; } return err < 0 ? err : count; } static DEVICE_ATTR(dh_state, S_IRUGO | S_IWUSR, sdev_show_dh_state, sdev_store_dh_state); static ssize_t sdev_show_access_state(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_device *sdev = to_scsi_device(dev); unsigned char access_state; const char *access_state_name; if (!sdev->handler) return -EINVAL; access_state = (sdev->access_state & SCSI_ACCESS_STATE_MASK); access_state_name = scsi_access_state_name(access_state); return sprintf(buf, "%s\n", access_state_name ? access_state_name : "unknown"); } static DEVICE_ATTR(access_state, S_IRUGO, sdev_show_access_state, NULL); static ssize_t sdev_show_preferred_path(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_device *sdev = to_scsi_device(dev); if (!sdev->handler) return -EINVAL; if (sdev->access_state & SCSI_ACCESS_STATE_PREFERRED) return sprintf(buf, "1\n"); else return sprintf(buf, "0\n"); } static DEVICE_ATTR(preferred_path, S_IRUGO, sdev_show_preferred_path, NULL); #endif static ssize_t sdev_show_queue_ramp_up_period(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_device *sdev; sdev = to_scsi_device(dev); return snprintf(buf, 20, "%u\n", jiffies_to_msecs(sdev->queue_ramp_up_period)); } static ssize_t sdev_store_queue_ramp_up_period(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct scsi_device *sdev = to_scsi_device(dev); unsigned int period; if (kstrtouint(buf, 10, &period)) return -EINVAL; sdev->queue_ramp_up_period = msecs_to_jiffies(period); return count; } static DEVICE_ATTR(queue_ramp_up_period, S_IRUGO | S_IWUSR, sdev_show_queue_ramp_up_period, sdev_store_queue_ramp_up_period); static ssize_t sdev_show_cdl_enable(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_device *sdev = to_scsi_device(dev); return sysfs_emit(buf, "%d\n", (int)sdev->cdl_enable); } static ssize_t sdev_store_cdl_enable(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { int ret; bool v; if (kstrtobool(buf, &v)) return -EINVAL; ret = scsi_cdl_enable(to_scsi_device(dev), v); if (ret) return ret; return count; } static DEVICE_ATTR(cdl_enable, S_IRUGO | S_IWUSR, sdev_show_cdl_enable, sdev_store_cdl_enable); static umode_t scsi_sdev_attr_is_visible(struct kobject *kobj, struct attribute *attr, int i) { struct device *dev = kobj_to_dev(kobj); struct scsi_device *sdev = to_scsi_device(dev); if (attr == &dev_attr_queue_depth.attr && !sdev->host->hostt->change_queue_depth) return S_IRUGO; if (attr == &dev_attr_queue_ramp_up_period.attr && !sdev->host->hostt->change_queue_depth) return 0; return attr->mode; } static umode_t scsi_sdev_bin_attr_is_visible(struct kobject *kobj, const struct bin_attribute *attr, int i) { struct device *dev = kobj_to_dev(kobj); struct scsi_device *sdev = to_scsi_device(dev); if (attr == &dev_attr_vpd_pg0 && !sdev->vpd_pg0) return 0; if (attr == &dev_attr_vpd_pg80 && !sdev->vpd_pg80) return 0; if (attr == &dev_attr_vpd_pg83 && !sdev->vpd_pg83) return 0; if (attr == &dev_attr_vpd_pg89 && !sdev->vpd_pg89) return 0; if (attr == &dev_attr_vpd_pgb0 && !sdev->vpd_pgb0) return 0; if (attr == &dev_attr_vpd_pgb1 && !sdev->vpd_pgb1) return 0; if (attr == &dev_attr_vpd_pgb2 && !sdev->vpd_pgb2) return 0; if (attr == &dev_attr_vpd_pgb7 && !sdev->vpd_pgb7) return 0; return S_IRUGO; } /* Default template for device attributes. May NOT be modified */ static struct attribute *scsi_sdev_attrs[] = { &dev_attr_device_blocked.attr, &dev_attr_type.attr, &dev_attr_scsi_level.attr, &dev_attr_device_busy.attr, &dev_attr_vendor.attr, &dev_attr_model.attr, &dev_attr_rev.attr, &dev_attr_rescan.attr, &dev_attr_delete.attr, &dev_attr_state.attr, &dev_attr_timeout.attr, &dev_attr_eh_timeout.attr, &dev_attr_iocounterbits.attr, &dev_attr_iorequest_cnt.attr, &dev_attr_iodone_cnt.attr, &dev_attr_ioerr_cnt.attr, &dev_attr_iotmo_cnt.attr, &dev_attr_modalias.attr, &dev_attr_queue_depth.attr, &dev_attr_queue_type.attr, &dev_attr_wwid.attr, &dev_attr_blacklist.attr, #ifdef CONFIG_SCSI_DH &dev_attr_dh_state.attr, &dev_attr_access_state.attr, &dev_attr_preferred_path.attr, #endif &dev_attr_queue_ramp_up_period.attr, &dev_attr_cdl_supported.attr, &dev_attr_cdl_enable.attr, REF_EVT(media_change), REF_EVT(inquiry_change_reported), REF_EVT(capacity_change_reported), REF_EVT(soft_threshold_reached), REF_EVT(mode_parameter_change_reported), REF_EVT(lun_change_reported), NULL }; static const struct bin_attribute *const scsi_sdev_bin_attrs[] = { &dev_attr_vpd_pg0, &dev_attr_vpd_pg83, &dev_attr_vpd_pg80, &dev_attr_vpd_pg89, &dev_attr_vpd_pgb0, &dev_attr_vpd_pgb1, &dev_attr_vpd_pgb2, &dev_attr_vpd_pgb7, &dev_attr_inquiry, NULL }; static struct attribute_group scsi_sdev_attr_group = { .attrs = scsi_sdev_attrs, .bin_attrs = scsi_sdev_bin_attrs, .is_visible = scsi_sdev_attr_is_visible, .is_bin_visible = scsi_sdev_bin_attr_is_visible, }; static const struct attribute_group *scsi_sdev_attr_groups[] = { &scsi_sdev_attr_group, NULL }; static int scsi_target_add(struct scsi_target *starget) { int error; if (starget->state != STARGET_CREATED) return 0; error = device_add(&starget->dev); if (error) { dev_err(&starget->dev, "target device_add failed, error %d\n", error); return error; } transport_add_device(&starget->dev); starget->state = STARGET_RUNNING; pm_runtime_set_active(&starget->dev); pm_runtime_enable(&starget->dev); device_enable_async_suspend(&starget->dev); return 0; } /** * scsi_sysfs_add_sdev - add scsi device to sysfs * @sdev: scsi_device to add * * Return value: * 0 on Success / non-zero on Failure **/ int scsi_sysfs_add_sdev(struct scsi_device *sdev) { int error; struct scsi_target *starget = sdev->sdev_target; if (WARN_ON_ONCE(scsi_device_is_pseudo_dev(sdev))) return -EINVAL; error = scsi_target_add(starget); if (error) return error; transport_configure_device(&starget->dev); device_enable_async_suspend(&sdev->sdev_gendev); scsi_autopm_get_target(starget); pm_runtime_set_active(&sdev->sdev_gendev); if (!sdev->rpm_autosuspend) pm_runtime_forbid(&sdev->sdev_gendev); pm_runtime_enable(&sdev->sdev_gendev); scsi_autopm_put_target(starget); scsi_autopm_get_device(sdev); scsi_dh_add_device(sdev); error = device_add(&sdev->sdev_gendev); if (error) { sdev_printk(KERN_INFO, sdev, "failed to add device: %d\n", error); return error; } device_enable_async_suspend(&sdev->sdev_dev); error = device_add(&sdev->sdev_dev); if (error) { sdev_printk(KERN_INFO, sdev, "failed to add class device: %d\n", error); device_del(&sdev->sdev_gendev); return error; } transport_add_device(&sdev->sdev_gendev); sdev->is_visible = 1; if (IS_ENABLED(CONFIG_BLK_DEV_BSG)) { sdev->bsg_dev = scsi_bsg_register_queue(sdev); if (IS_ERR(sdev->bsg_dev)) { error = PTR_ERR(sdev->bsg_dev); sdev_printk(KERN_INFO, sdev, "Failed to register bsg queue, errno=%d\n", error); sdev->bsg_dev = NULL; } } scsi_autopm_put_device(sdev); return error; } void __scsi_remove_device(struct scsi_device *sdev) { struct device *dev = &sdev->sdev_gendev; int res; /* * This cleanup path is not reentrant and while it is impossible * to get a new reference with scsi_device_get() someone can still * hold a previously acquired one. */ if (sdev->sdev_state == SDEV_DEL) return; if (sdev->is_visible) { /* * If scsi_internal_target_block() is running concurrently, * wait until it has finished before changing the device state. */ mutex_lock(&sdev->state_mutex); /* * If blocked, we go straight to DEL and restart the queue so * any commands issued during driver shutdown (like sync * cache) are errored immediately. */ res = scsi_device_set_state(sdev, SDEV_CANCEL); if (res != 0) { res = scsi_device_set_state(sdev, SDEV_DEL); if (res == 0) scsi_start_queue(sdev); } mutex_unlock(&sdev->state_mutex); if (res != 0) return; if (IS_ENABLED(CONFIG_BLK_DEV_BSG) && sdev->bsg_dev) bsg_unregister_queue(sdev->bsg_dev); device_unregister(&sdev->sdev_dev); transport_remove_device(dev); device_del(dev); } else put_device(&sdev->sdev_dev); /* * Stop accepting new requests and wait until all queuecommand() and * scsi_run_queue() invocations have finished before tearing down the * device. */ mutex_lock(&sdev->state_mutex); scsi_device_set_state(sdev, SDEV_DEL); mutex_unlock(&sdev->state_mutex); blk_mq_destroy_queue(sdev->request_queue); kref_put(&sdev->host->tagset_refcnt, scsi_mq_free_tags); cancel_work_sync(&sdev->requeue_work); if (!scsi_device_is_pseudo_dev(sdev) && sdev->host->hostt->sdev_destroy) sdev->host->hostt->sdev_destroy(sdev); transport_destroy_device(dev); /* * Paired with the kref_get() in scsi_sysfs_initialize(). We have * removed sysfs visibility from the device, so make the target * invisible if this was the last device underneath it. */ scsi_target_reap(scsi_target(sdev)); put_device(dev); } /** * scsi_remove_device - unregister a device from the scsi bus * @sdev: scsi_device to unregister **/ void scsi_remove_device(struct scsi_device *sdev) { struct Scsi_Host *shost = sdev->host; mutex_lock(&shost->scan_mutex); __scsi_remove_device(sdev); mutex_unlock(&shost->scan_mutex); } EXPORT_SYMBOL(scsi_remove_device); static void __scsi_remove_target(struct scsi_target *starget) { struct Scsi_Host *shost = dev_to_shost(starget->dev.parent); unsigned long flags; struct scsi_device *sdev; spin_lock_irqsave(shost->host_lock, flags); restart: list_for_each_entry(sdev, &shost->__devices, siblings) { /* * We cannot call scsi_device_get() here, as * we might've been called from rmmod() causing * scsi_device_get() to fail the module_is_live() * check. */ if (sdev->channel != starget->channel || sdev->id != starget->id) continue; if (sdev->sdev_state == SDEV_DEL || sdev->sdev_state == SDEV_CANCEL || !get_device(&sdev->sdev_gendev)) continue; spin_unlock_irqrestore(shost->host_lock, flags); scsi_remove_device(sdev); put_device(&sdev->sdev_gendev); spin_lock_irqsave(shost->host_lock, flags); goto restart; } spin_unlock_irqrestore(shost->host_lock, flags); } /** * scsi_remove_target - try to remove a target and all its devices * @dev: generic starget or parent of generic stargets to be removed * * Note: This is slightly racy. It is possible that if the user * requests the addition of another device then the target won't be * removed. */ void scsi_remove_target(struct device *dev) { struct Scsi_Host *shost = dev_to_shost(dev->parent); struct scsi_target *starget; unsigned long flags; restart: spin_lock_irqsave(shost->host_lock, flags); list_for_each_entry(starget, &shost->__targets, siblings) { if (starget->state == STARGET_DEL || starget->state == STARGET_REMOVE || starget->state == STARGET_CREATED_REMOVE) continue; if (starget->dev.parent == dev || &starget->dev == dev) { kref_get(&starget->reap_ref); if (starget->state == STARGET_CREATED) starget->state = STARGET_CREATED_REMOVE; else starget->state = STARGET_REMOVE; spin_unlock_irqrestore(shost->host_lock, flags); __scsi_remove_target(starget); scsi_target_reap(starget); goto restart; } } spin_unlock_irqrestore(shost->host_lock, flags); } EXPORT_SYMBOL(scsi_remove_target); int __scsi_register_driver(struct device_driver *drv, struct module *owner) { drv->bus = &scsi_bus_type; drv->owner = owner; return driver_register(drv); } EXPORT_SYMBOL(__scsi_register_driver); int scsi_register_interface(struct class_interface *intf) { intf->class = &sdev_class; return class_interface_register(intf); } EXPORT_SYMBOL(scsi_register_interface); /** * scsi_sysfs_add_host - add scsi host to subsystem * @shost: scsi host struct to add to subsystem **/ int scsi_sysfs_add_host(struct Scsi_Host *shost) { transport_register_device(&shost->shost_gendev); transport_configure_device(&shost->shost_gendev); return 0; } static const struct device_type scsi_dev_type = { .name = "scsi_device", .release = scsi_device_dev_release, .groups = scsi_sdev_attr_groups, }; void scsi_sysfs_device_initialize(struct scsi_device *sdev) { unsigned long flags; struct Scsi_Host *shost = sdev->host; const struct scsi_host_template *hostt = shost->hostt; struct scsi_target *starget = sdev->sdev_target; device_initialize(&sdev->sdev_gendev); sdev->sdev_gendev.bus = &scsi_bus_type; sdev->sdev_gendev.type = &scsi_dev_type; scsi_enable_async_suspend(&sdev->sdev_gendev); dev_set_name(&sdev->sdev_gendev, "%d:%d:%d:%llu", sdev->host->host_no, sdev->channel, sdev->id, sdev->lun); sdev->sdev_gendev.groups = hostt->sdev_groups; device_initialize(&sdev->sdev_dev); sdev->sdev_dev.parent = get_device(&sdev->sdev_gendev); sdev->sdev_dev.class = &sdev_class; dev_set_name(&sdev->sdev_dev, "%d:%d:%d:%llu", sdev->host->host_no, sdev->channel, sdev->id, sdev->lun); /* * Get a default scsi_level from the target (derived from sibling * devices). This is the best we can do for guessing how to set * sdev->lun_in_cdb for the initial INQUIRY command. For LUN 0 the * setting doesn't matter, because all the bits are zero anyway. * But it does matter for higher LUNs. */ sdev->scsi_level = starget->scsi_level; if (sdev->scsi_level <= SCSI_2 && sdev->scsi_level != SCSI_UNKNOWN && !shost->no_scsi2_lun_in_cdb) sdev->lun_in_cdb = 1; transport_setup_device(&sdev->sdev_gendev); spin_lock_irqsave(shost->host_lock, flags); list_add_tail(&sdev->same_target_siblings, &starget->devices); list_add_tail(&sdev->siblings, &shost->__devices); spin_unlock_irqrestore(shost->host_lock, flags); /* * device can now only be removed via __scsi_remove_device() so hold * the target. Target will be held in CREATED state until something * beneath it becomes visible (in which case it moves to RUNNING) */ kref_get(&starget->reap_ref); } int scsi_is_sdev_device(const struct device *dev) { return dev->type == &scsi_dev_type; } EXPORT_SYMBOL(scsi_is_sdev_device); /* A blank transport template that is used in drivers that don't * yet implement Transport Attributes */ struct scsi_transport_template blank_transport_template = { { { {NULL, }, }, }, };
20 11 21 6 11 11 15 25 16 15 25 25 25 7 7 7 7 7 7 23 23 23 23 23 7 7 21 7 7 9 9 9 7 7 7 7 7 7 7 7 7 7 7 2 2 2 21 21 21 21 21 21 20 21 21 8 8 7 4 3 1 3 20 20 20 20 20 20 20 20 42 35 35 35 42 25 24 23 23 23 1 22 22 1 21 21 21 21 21 21 20 19 16 16 16 15 15 14 2 13 15 15 15 1 15 15 15 15 15 14 1 1 1 1 5 7 7 7 7 7 6 5 5 5 5 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 // SPDX-License-Identifier: GPL-2.0-or-later /* * Userspace interface * Linux ethernet bridge * * Authors: * Lennert Buytenhek <buytenh@gnu.org> */ #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/netpoll.h> #include <linux/ethtool.h> #include <linux/if_arp.h> #include <linux/module.h> #include <linux/init.h> #include <linux/rtnetlink.h> #include <linux/if_ether.h> #include <linux/slab.h> #include <net/dsa.h> #include <net/sock.h> #include <linux/if_vlan.h> #include <net/switchdev.h> #include <net/net_namespace.h> #include "br_private.h" /* * Determine initial path cost based on speed. * using recommendations from 802.1d standard * * Since driver might sleep need to not be holding any locks. */ static int port_cost(struct net_device *dev) { struct ethtool_link_ksettings ecmd; if (!__ethtool_get_link_ksettings(dev, &ecmd)) { switch (ecmd.base.speed) { case SPEED_10000: return 2; case SPEED_5000: return 3; case SPEED_2500: return 4; case SPEED_1000: return 5; case SPEED_100: return 19; case SPEED_10: return 100; case SPEED_UNKNOWN: return 100; default: if (ecmd.base.speed > SPEED_10000) return 1; } } /* Old silly heuristics based on name */ if (!strncmp(dev->name, "lec", 3)) return 7; if (!strncmp(dev->name, "plip", 4)) return 2500; return 100; /* assume old 10Mbps */ } /* Check for port carrier transitions. */ void br_port_carrier_check(struct net_bridge_port *p, bool *notified) { struct net_device *dev = p->dev; struct net_bridge *br = p->br; if (!(p->flags & BR_ADMIN_COST) && netif_running(dev) && netif_oper_up(dev)) p->path_cost = port_cost(dev); *notified = false; if (!netif_running(br->dev)) return; spin_lock_bh(&br->lock); if (netif_running(dev) && netif_oper_up(dev)) { if (p->state == BR_STATE_DISABLED) { br_stp_enable_port(p); *notified = true; } } else { if (p->state != BR_STATE_DISABLED) { br_stp_disable_port(p); *notified = true; } } spin_unlock_bh(&br->lock); } static void br_port_set_promisc(struct net_bridge_port *p) { int err = 0; if (br_promisc_port(p)) return; err = dev_set_promiscuity(p->dev, 1); if (err) return; br_fdb_unsync_static(p->br, p); p->flags |= BR_PROMISC; } static void br_port_clear_promisc(struct net_bridge_port *p) { int err; /* Check if the port is already non-promisc or if it doesn't * support UNICAST filtering. Without unicast filtering support * we'll end up re-enabling promisc mode anyway, so just check for * it here. */ if (!br_promisc_port(p) || !(p->dev->priv_flags & IFF_UNICAST_FLT)) return; /* Since we'll be clearing the promisc mode, program the port * first so that we don't have interruption in traffic. */ err = br_fdb_sync_static(p->br, p); if (err) return; dev_set_promiscuity(p->dev, -1); p->flags &= ~BR_PROMISC; } /* When a port is added or removed or when certain port flags * change, this function is called to automatically manage * promiscuity setting of all the bridge ports. We are always called * under RTNL so can skip using rcu primitives. */ void br_manage_promisc(struct net_bridge *br) { struct net_bridge_port *p; bool set_all = false; /* If vlan filtering is disabled or bridge interface is placed * into promiscuous mode, place all ports in promiscuous mode. */ if ((br->dev->flags & IFF_PROMISC) || !br_vlan_enabled(br->dev)) set_all = true; list_for_each_entry(p, &br->port_list, list) { if (set_all) { br_port_set_promisc(p); } else { /* If the number of auto-ports is <= 1, then all other * ports will have their output configuration * statically specified through fdbs. Since ingress * on the auto-port becomes forwarding/egress to other * ports and egress configuration is statically known, * we can say that ingress configuration of the * auto-port is also statically known. * This lets us disable promiscuous mode and write * this config to hw. */ if ((p->dev->priv_flags & IFF_UNICAST_FLT) && (br->auto_cnt == 0 || (br->auto_cnt == 1 && br_auto_port(p)))) br_port_clear_promisc(p); else br_port_set_promisc(p); } } } int nbp_backup_change(struct net_bridge_port *p, struct net_device *backup_dev) { struct net_bridge_port *old_backup = rtnl_dereference(p->backup_port); struct net_bridge_port *backup_p = NULL; ASSERT_RTNL(); if (backup_dev) { if (!netif_is_bridge_port(backup_dev)) return -ENOENT; backup_p = br_port_get_rtnl(backup_dev); if (backup_p->br != p->br) return -EINVAL; } if (p == backup_p) return -EINVAL; if (old_backup == backup_p) return 0; /* if the backup link is already set, clear it */ if (old_backup) old_backup->backup_redirected_cnt--; if (backup_p) backup_p->backup_redirected_cnt++; rcu_assign_pointer(p->backup_port, backup_p); return 0; } static void nbp_backup_clear(struct net_bridge_port *p) { nbp_backup_change(p, NULL); if (p->backup_redirected_cnt) { struct net_bridge_port *cur_p; list_for_each_entry(cur_p, &p->br->port_list, list) { struct net_bridge_port *backup_p; backup_p = rtnl_dereference(cur_p->backup_port); if (backup_p == p) nbp_backup_change(cur_p, NULL); } } WARN_ON(rcu_access_pointer(p->backup_port) || p->backup_redirected_cnt); } static void nbp_update_port_count(struct net_bridge *br) { struct net_bridge_port *p; u32 cnt = 0; list_for_each_entry(p, &br->port_list, list) { if (br_auto_port(p)) cnt++; } if (br->auto_cnt != cnt) { br->auto_cnt = cnt; br_manage_promisc(br); } } static void nbp_delete_promisc(struct net_bridge_port *p) { /* If port is currently promiscuous, unset promiscuity. * Otherwise, it is a static port so remove all addresses * from it. */ dev_set_allmulti(p->dev, -1); if (br_promisc_port(p)) dev_set_promiscuity(p->dev, -1); else br_fdb_unsync_static(p->br, p); } static void release_nbp(struct kobject *kobj) { struct net_bridge_port *p = container_of(kobj, struct net_bridge_port, kobj); kfree(p); } static void brport_get_ownership(const struct kobject *kobj, kuid_t *uid, kgid_t *gid) { struct net_bridge_port *p = kobj_to_brport(kobj); net_ns_get_ownership(dev_net(p->dev), uid, gid); } static const struct kobj_type brport_ktype = { #ifdef CONFIG_SYSFS .sysfs_ops = &brport_sysfs_ops, #endif .release = release_nbp, .get_ownership = brport_get_ownership, }; static void destroy_nbp(struct net_bridge_port *p) { struct net_device *dev = p->dev; p->br = NULL; p->dev = NULL; netdev_put(dev, &p->dev_tracker); kobject_put(&p->kobj); } static void destroy_nbp_rcu(struct rcu_head *head) { struct net_bridge_port *p = container_of(head, struct net_bridge_port, rcu); destroy_nbp(p); } static unsigned get_max_headroom(struct net_bridge *br) { unsigned max_headroom = 0; struct net_bridge_port *p; list_for_each_entry(p, &br->port_list, list) { unsigned dev_headroom = netdev_get_fwd_headroom(p->dev); if (dev_headroom > max_headroom) max_headroom = dev_headroom; } return max_headroom; } static void update_headroom(struct net_bridge *br, int new_hr) { struct net_bridge_port *p; list_for_each_entry(p, &br->port_list, list) netdev_set_rx_headroom(p->dev, new_hr); br->dev->needed_headroom = new_hr; } /* Delete port(interface) from bridge is done in two steps. * via RCU. First step, marks device as down. That deletes * all the timers and stops new packets from flowing through. * * Final cleanup doesn't occur until after all CPU's finished * processing packets. * * Protected from multiple admin operations by RTNL mutex */ static void del_nbp(struct net_bridge_port *p) { struct net_bridge *br = p->br; struct net_device *dev = p->dev; sysfs_remove_link(br->ifobj, p->dev->name); nbp_delete_promisc(p); spin_lock_bh(&br->lock); br_stp_disable_port(p); spin_unlock_bh(&br->lock); br_mrp_port_del(br, p); br_cfm_port_del(br, p); br_ifinfo_notify(RTM_DELLINK, NULL, p); list_del_rcu(&p->list); if (netdev_get_fwd_headroom(dev) == br->dev->needed_headroom) update_headroom(br, get_max_headroom(br)); netdev_reset_rx_headroom(dev); nbp_vlan_flush(p); br_fdb_delete_by_port(br, p, 0, 1); switchdev_deferred_process(); nbp_backup_clear(p); nbp_update_port_count(br); netdev_upper_dev_unlink(dev, br->dev); dev->priv_flags &= ~IFF_BRIDGE_PORT; netdev_rx_handler_unregister(dev); br_multicast_del_port(p); kobject_uevent(&p->kobj, KOBJ_REMOVE); kobject_del(&p->kobj); br_netpoll_disable(p); call_rcu(&p->rcu, destroy_nbp_rcu); } /* Delete bridge device */ void br_dev_delete(struct net_device *dev, struct list_head *head) { struct net_bridge *br = netdev_priv(dev); struct net_bridge_port *p, *n; list_for_each_entry_safe(p, n, &br->port_list, list) { del_nbp(p); } br_mst_uninit(br); br_recalculate_neigh_suppress_enabled(br); br_fdb_delete_by_port(br, NULL, 0, 1); cancel_delayed_work_sync(&br->gc_work); br_sysfs_delbr(br->dev); unregister_netdevice_queue(br->dev, head); } /* find an available port number */ static int find_portno(struct net_bridge *br) { int index; struct net_bridge_port *p; unsigned long *inuse; inuse = bitmap_zalloc(BR_MAX_PORTS, GFP_KERNEL); if (!inuse) return -ENOMEM; __set_bit(0, inuse); /* zero is reserved */ list_for_each_entry(p, &br->port_list, list) __set_bit(p->port_no, inuse); index = find_first_zero_bit(inuse, BR_MAX_PORTS); bitmap_free(inuse); return (index >= BR_MAX_PORTS) ? -EXFULL : index; } /* called with RTNL but without bridge lock */ static struct net_bridge_port *new_nbp(struct net_bridge *br, struct net_device *dev) { struct net_bridge_port *p; int index, err; index = find_portno(br); if (index < 0) return ERR_PTR(index); p = kzalloc(sizeof(*p), GFP_KERNEL); if (p == NULL) return ERR_PTR(-ENOMEM); p->br = br; netdev_hold(dev, &p->dev_tracker, GFP_KERNEL); p->dev = dev; p->path_cost = port_cost(dev); p->priority = 0x8000 >> BR_PORT_BITS; p->port_no = index; p->flags = BR_LEARNING | BR_FLOOD | BR_MCAST_FLOOD | BR_BCAST_FLOOD; br_init_port(p); br_set_state(p, BR_STATE_DISABLED); br_stp_port_timer_init(p); err = br_multicast_add_port(p); if (err) { netdev_put(dev, &p->dev_tracker); kfree(p); p = ERR_PTR(err); } return p; } int br_add_bridge(struct net *net, const char *name) { struct net_device *dev; int res; dev = alloc_netdev(sizeof(struct net_bridge), name, NET_NAME_UNKNOWN, br_dev_setup); if (!dev) return -ENOMEM; dev_net_set(dev, net); dev->rtnl_link_ops = &br_link_ops; res = register_netdevice(dev); if (res) free_netdev(dev); return res; } int br_del_bridge(struct net *net, const char *name) { struct net_device *dev; int ret = 0; dev = __dev_get_by_name(net, name); if (dev == NULL) ret = -ENXIO; /* Could not find device */ else if (!netif_is_bridge_master(dev)) { /* Attempt to delete non bridge device! */ ret = -EPERM; } else if (dev->flags & IFF_UP) { /* Not shutdown yet. */ ret = -EBUSY; } else br_dev_delete(dev, NULL); return ret; } /* MTU of the bridge pseudo-device: ETH_DATA_LEN or the minimum of the ports */ static int br_mtu_min(const struct net_bridge *br) { const struct net_bridge_port *p; int ret_mtu = 0; list_for_each_entry(p, &br->port_list, list) if (!ret_mtu || ret_mtu > p->dev->mtu) ret_mtu = p->dev->mtu; return ret_mtu ? ret_mtu : ETH_DATA_LEN; } void br_mtu_auto_adjust(struct net_bridge *br) { ASSERT_RTNL(); /* if the bridge MTU was manually configured don't mess with it */ if (br_opt_get(br, BROPT_MTU_SET_BY_USER)) return; /* change to the minimum MTU and clear the flag which was set by * the bridge ndo_change_mtu callback */ dev_set_mtu(br->dev, br_mtu_min(br)); br_opt_toggle(br, BROPT_MTU_SET_BY_USER, false); } /* * Recomputes features using slave's features */ netdev_features_t br_features_recompute(struct net_bridge *br, netdev_features_t features) { struct net_bridge_port *p; netdev_features_t mask; if (list_empty(&br->port_list)) return features; mask = features; features &= ~NETIF_F_ONE_FOR_ALL; list_for_each_entry(p, &br->port_list, list) { features = netdev_increment_features(features, p->dev->features, mask); } features = netdev_add_tso_features(features, mask); return features; } /* called with RTNL */ int br_add_if(struct net_bridge *br, struct net_device *dev, struct netlink_ext_ack *extack) { struct net_bridge_port *p; int err = 0; unsigned br_hr, dev_hr; bool changed_addr, fdb_synced = false; /* Don't allow bridging non-ethernet like devices. */ if ((dev->flags & IFF_LOOPBACK) || dev->type != ARPHRD_ETHER || dev->addr_len != ETH_ALEN || !is_valid_ether_addr(dev->dev_addr)) return -EINVAL; /* No bridging of bridges */ if (dev->netdev_ops->ndo_start_xmit == br_dev_xmit) { NL_SET_ERR_MSG(extack, "Can not enslave a bridge to a bridge"); return -ELOOP; } /* Device has master upper dev */ if (netdev_master_upper_dev_get(dev)) return -EBUSY; /* No bridging devices that dislike that (e.g. wireless) */ if (dev->priv_flags & IFF_DONT_BRIDGE) { NL_SET_ERR_MSG(extack, "Device does not allow enslaving to a bridge"); return -EOPNOTSUPP; } p = new_nbp(br, dev); if (IS_ERR(p)) return PTR_ERR(p); call_netdevice_notifiers(NETDEV_JOIN, dev); err = dev_set_allmulti(dev, 1); if (err) { br_multicast_del_port(p); netdev_put(dev, &p->dev_tracker); kfree(p); /* kobject not yet init'd, manually free */ goto err1; } err = kobject_init_and_add(&p->kobj, &brport_ktype, &(dev->dev.kobj), SYSFS_BRIDGE_PORT_ATTR); if (err) goto err2; err = br_sysfs_addif(p); if (err) goto err2; err = br_netpoll_enable(p); if (err) goto err3; err = netdev_rx_handler_register(dev, br_get_rx_handler(dev), p); if (err) goto err4; dev->priv_flags |= IFF_BRIDGE_PORT; err = netdev_master_upper_dev_link(dev, br->dev, NULL, NULL, extack); if (err) goto err5; dev_disable_lro(dev); list_add_rcu(&p->list, &br->port_list); nbp_update_port_count(br); if (!br_promisc_port(p) && (p->dev->priv_flags & IFF_UNICAST_FLT)) { /* When updating the port count we also update all ports' * promiscuous mode. * A port leaving promiscuous mode normally gets the bridge's * fdb synced to the unicast filter (if supported), however, * `br_port_clear_promisc` does not distinguish between * non-promiscuous ports and *new* ports, so we need to * sync explicitly here. */ fdb_synced = br_fdb_sync_static(br, p) == 0; if (!fdb_synced) netdev_err(dev, "failed to sync bridge static fdb addresses to this port\n"); } br_hr = br->dev->needed_headroom; dev_hr = netdev_get_fwd_headroom(dev); if (br_hr < dev_hr) update_headroom(br, dev_hr); else netdev_set_rx_headroom(dev, br_hr); if (br_fdb_add_local(br, p, dev->dev_addr, 0)) netdev_err(dev, "failed insert local address bridge forwarding table\n"); if (br->dev->addr_assign_type != NET_ADDR_SET) { /* Ask for permission to use this MAC address now, even if we * don't end up choosing it below. */ err = netif_pre_changeaddr_notify(br->dev, dev->dev_addr, extack); if (err) goto err6; } err = nbp_vlan_init(p, extack); if (err) { netdev_err(dev, "failed to initialize vlan filtering on this port\n"); goto err6; } spin_lock_bh(&br->lock); changed_addr = br_stp_recalculate_bridge_id(br); if (netif_running(dev) && netif_oper_up(dev) && (br->dev->flags & IFF_UP)) br_stp_enable_port(p); spin_unlock_bh(&br->lock); br_ifinfo_notify(RTM_NEWLINK, NULL, p); if (changed_addr) call_netdevice_notifiers(NETDEV_CHANGEADDR, br->dev); br_mtu_auto_adjust(br); netdev_compute_master_upper_features(br->dev, false); kobject_uevent(&p->kobj, KOBJ_ADD); return 0; err6: if (fdb_synced) br_fdb_unsync_static(br, p); list_del_rcu(&p->list); br_fdb_delete_by_port(br, p, 0, 1); nbp_update_port_count(br); netdev_upper_dev_unlink(dev, br->dev); err5: dev->priv_flags &= ~IFF_BRIDGE_PORT; netdev_rx_handler_unregister(dev); err4: br_netpoll_disable(p); err3: sysfs_remove_link(br->ifobj, p->dev->name); err2: br_multicast_del_port(p); netdev_put(dev, &p->dev_tracker); kobject_put(&p->kobj); dev_set_allmulti(dev, -1); err1: return err; } /* called with RTNL */ int br_del_if(struct net_bridge *br, struct net_device *dev) { struct net_bridge_port *p; bool changed_addr; p = br_port_get_rtnl(dev); if (!p || p->br != br) return -EINVAL; /* Since more than one interface can be attached to a bridge, * there still maybe an alternate path for netconsole to use; * therefore there is no reason for a NETDEV_RELEASE event. */ del_nbp(p); br_mtu_auto_adjust(br); spin_lock_bh(&br->lock); changed_addr = br_stp_recalculate_bridge_id(br); spin_unlock_bh(&br->lock); if (changed_addr) call_netdevice_notifiers(NETDEV_CHANGEADDR, br->dev); netdev_compute_master_upper_features(br->dev, false); return 0; } void br_port_flags_change(struct net_bridge_port *p, unsigned long mask) { struct net_bridge *br = p->br; if (mask & BR_AUTO_MASK) nbp_update_port_count(br); if (mask & (BR_NEIGH_SUPPRESS | BR_NEIGH_VLAN_SUPPRESS)) br_recalculate_neigh_suppress_enabled(br); } bool br_port_flag_is_set(const struct net_device *dev, unsigned long flag) { struct net_bridge_port *p; p = br_port_get_rtnl_rcu(dev); if (!p) return false; return p->flags & flag; } EXPORT_SYMBOL_GPL(br_port_flag_is_set);
5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 // SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 /****************************************************************************** * * Module Name: hwgpe - Low level GPE enable/disable/clear functions * * Copyright (C) 2000 - 2025, Intel Corp. * *****************************************************************************/ #include <acpi/acpi.h> #include "accommon.h" #include "acevents.h" #define _COMPONENT ACPI_HARDWARE ACPI_MODULE_NAME("hwgpe") #if (!ACPI_REDUCED_HARDWARE) /* Entire module */ /* Local prototypes */ static acpi_status acpi_hw_enable_wakeup_gpe_block(struct acpi_gpe_xrupt_info *gpe_xrupt_info, struct acpi_gpe_block_info *gpe_block, void *context); static acpi_status acpi_hw_gpe_enable_write(u8 enable_mask, struct acpi_gpe_register_info *gpe_register_info); /****************************************************************************** * * FUNCTION: acpi_hw_gpe_read * * PARAMETERS: value - Where the value is returned * reg - GPE register structure * * RETURN: Status * * DESCRIPTION: Read from a GPE register in either memory or IO space. * * LIMITATIONS: <These limitations also apply to acpi_hw_gpe_write> * space_ID must be system_memory or system_IO. * ******************************************************************************/ acpi_status acpi_hw_gpe_read(u64 *value, struct acpi_gpe_address *reg) { acpi_status status; u32 value32; if (reg->space_id == ACPI_ADR_SPACE_SYSTEM_MEMORY) { #ifdef ACPI_GPE_USE_LOGICAL_ADDRESSES *value = (u64)ACPI_GET8((unsigned long)reg->address); return_ACPI_STATUS(AE_OK); #else return acpi_os_read_memory((acpi_physical_address)reg->address, value, ACPI_GPE_REGISTER_WIDTH); #endif } status = acpi_os_read_port((acpi_io_address)reg->address, &value32, ACPI_GPE_REGISTER_WIDTH); if (ACPI_FAILURE(status)) return_ACPI_STATUS(status); *value = (u64)value32; return_ACPI_STATUS(AE_OK); } /****************************************************************************** * * FUNCTION: acpi_hw_gpe_write * * PARAMETERS: value - Value to be written * reg - GPE register structure * * RETURN: Status * * DESCRIPTION: Write to a GPE register in either memory or IO space. * ******************************************************************************/ acpi_status acpi_hw_gpe_write(u64 value, struct acpi_gpe_address *reg) { if (reg->space_id == ACPI_ADR_SPACE_SYSTEM_MEMORY) { #ifdef ACPI_GPE_USE_LOGICAL_ADDRESSES ACPI_SET8((unsigned long)reg->address, value); return_ACPI_STATUS(AE_OK); #else return acpi_os_write_memory((acpi_physical_address)reg->address, value, ACPI_GPE_REGISTER_WIDTH); #endif } return acpi_os_write_port((acpi_io_address)reg->address, (u32)value, ACPI_GPE_REGISTER_WIDTH); } /****************************************************************************** * * FUNCTION: acpi_hw_get_gpe_register_bit * * PARAMETERS: gpe_event_info - Info block for the GPE * * RETURN: Register mask with a one in the GPE bit position * * DESCRIPTION: Compute the register mask for this GPE. One bit is set in the * correct position for the input GPE. * ******************************************************************************/ u32 acpi_hw_get_gpe_register_bit(struct acpi_gpe_event_info *gpe_event_info) { return ((u32)1 << (gpe_event_info->gpe_number - gpe_event_info->register_info->base_gpe_number)); } /****************************************************************************** * * FUNCTION: acpi_hw_low_set_gpe * * PARAMETERS: gpe_event_info - Info block for the GPE to be disabled * action - Enable or disable * * RETURN: Status * * DESCRIPTION: Enable or disable a single GPE in the parent enable register. * The enable_mask field of the involved GPE register must be * updated by the caller if necessary. * ******************************************************************************/ acpi_status acpi_hw_low_set_gpe(struct acpi_gpe_event_info *gpe_event_info, u32 action) { struct acpi_gpe_register_info *gpe_register_info; acpi_status status = AE_OK; u64 enable_mask; u32 register_bit; ACPI_FUNCTION_ENTRY(); /* Get the info block for the entire GPE register */ gpe_register_info = gpe_event_info->register_info; if (!gpe_register_info) { return (AE_NOT_EXIST); } /* Get current value of the enable register that contains this GPE */ status = acpi_hw_gpe_read(&enable_mask, &gpe_register_info->enable_address); if (ACPI_FAILURE(status)) { return (status); } /* Set or clear just the bit that corresponds to this GPE */ register_bit = acpi_hw_get_gpe_register_bit(gpe_event_info); switch (action) { case ACPI_GPE_CONDITIONAL_ENABLE: /* Only enable if the corresponding enable_mask bit is set */ if (!(register_bit & gpe_register_info->enable_mask)) { return (AE_BAD_PARAMETER); } ACPI_FALLTHROUGH; case ACPI_GPE_ENABLE: ACPI_SET_BIT(enable_mask, register_bit); break; case ACPI_GPE_DISABLE: ACPI_CLEAR_BIT(enable_mask, register_bit); break; default: ACPI_ERROR((AE_INFO, "Invalid GPE Action, %u", action)); return (AE_BAD_PARAMETER); } if (!(register_bit & gpe_register_info->mask_for_run)) { /* Write the updated enable mask */ status = acpi_hw_gpe_write(enable_mask, &gpe_register_info->enable_address); } return (status); } /****************************************************************************** * * FUNCTION: acpi_hw_clear_gpe * * PARAMETERS: gpe_event_info - Info block for the GPE to be cleared * * RETURN: Status * * DESCRIPTION: Clear the status bit for a single GPE. * ******************************************************************************/ acpi_status acpi_hw_clear_gpe(struct acpi_gpe_event_info *gpe_event_info) { struct acpi_gpe_register_info *gpe_register_info; acpi_status status; u32 register_bit; ACPI_FUNCTION_ENTRY(); /* Get the info block for the entire GPE register */ gpe_register_info = gpe_event_info->register_info; if (!gpe_register_info) { return (AE_NOT_EXIST); } /* * Write a one to the appropriate bit in the status register to * clear this GPE. */ register_bit = acpi_hw_get_gpe_register_bit(gpe_event_info); status = acpi_hw_gpe_write(register_bit, &gpe_register_info->status_address); return (status); } /****************************************************************************** * * FUNCTION: acpi_hw_get_gpe_status * * PARAMETERS: gpe_event_info - Info block for the GPE to queried * event_status - Where the GPE status is returned * * RETURN: Status * * DESCRIPTION: Return the status of a single GPE. * ******************************************************************************/ acpi_status acpi_hw_get_gpe_status(struct acpi_gpe_event_info *gpe_event_info, acpi_event_status *event_status) { u64 in_byte; u32 register_bit; struct acpi_gpe_register_info *gpe_register_info; acpi_event_status local_event_status = 0; acpi_status status; ACPI_FUNCTION_ENTRY(); if (!event_status) { return (AE_BAD_PARAMETER); } /* GPE currently handled? */ if (ACPI_GPE_DISPATCH_TYPE(gpe_event_info->flags) != ACPI_GPE_DISPATCH_NONE) { local_event_status |= ACPI_EVENT_FLAG_HAS_HANDLER; } /* Get the info block for the entire GPE register */ gpe_register_info = gpe_event_info->register_info; /* Get the register bitmask for this GPE */ register_bit = acpi_hw_get_gpe_register_bit(gpe_event_info); /* GPE currently enabled? (enabled for runtime?) */ if (register_bit & gpe_register_info->enable_for_run) { local_event_status |= ACPI_EVENT_FLAG_ENABLED; } /* GPE currently masked? (masked for runtime?) */ if (register_bit & gpe_register_info->mask_for_run) { local_event_status |= ACPI_EVENT_FLAG_MASKED; } /* GPE enabled for wake? */ if (register_bit & gpe_register_info->enable_for_wake) { local_event_status |= ACPI_EVENT_FLAG_WAKE_ENABLED; } /* GPE currently enabled (enable bit == 1)? */ status = acpi_hw_gpe_read(&in_byte, &gpe_register_info->enable_address); if (ACPI_FAILURE(status)) { return (status); } if (register_bit & in_byte) { local_event_status |= ACPI_EVENT_FLAG_ENABLE_SET; } /* GPE currently active (status bit == 1)? */ status = acpi_hw_gpe_read(&in_byte, &gpe_register_info->status_address); if (ACPI_FAILURE(status)) { return (status); } if (register_bit & in_byte) { local_event_status |= ACPI_EVENT_FLAG_STATUS_SET; } /* Set return value */ (*event_status) = local_event_status; return (AE_OK); } /****************************************************************************** * * FUNCTION: acpi_hw_gpe_enable_write * * PARAMETERS: enable_mask - Bit mask to write to the GPE register * gpe_register_info - Gpe Register info * * RETURN: Status * * DESCRIPTION: Write the enable mask byte to the given GPE register. * ******************************************************************************/ static acpi_status acpi_hw_gpe_enable_write(u8 enable_mask, struct acpi_gpe_register_info *gpe_register_info) { acpi_status status; gpe_register_info->enable_mask = enable_mask; status = acpi_hw_gpe_write(enable_mask, &gpe_register_info->enable_address); return (status); } /****************************************************************************** * * FUNCTION: acpi_hw_disable_gpe_block * * PARAMETERS: gpe_xrupt_info - GPE Interrupt info * gpe_block - Gpe Block info * * RETURN: Status * * DESCRIPTION: Disable all GPEs within a single GPE block * ******************************************************************************/ acpi_status acpi_hw_disable_gpe_block(struct acpi_gpe_xrupt_info *gpe_xrupt_info, struct acpi_gpe_block_info *gpe_block, void *context) { u32 i; acpi_status status; /* Examine each GPE Register within the block */ for (i = 0; i < gpe_block->register_count; i++) { /* Disable all GPEs in this register */ status = acpi_hw_gpe_enable_write(0x00, &gpe_block->register_info[i]); if (ACPI_FAILURE(status)) { return (status); } } return (AE_OK); } /****************************************************************************** * * FUNCTION: acpi_hw_clear_gpe_block * * PARAMETERS: gpe_xrupt_info - GPE Interrupt info * gpe_block - Gpe Block info * * RETURN: Status * * DESCRIPTION: Clear status bits for all GPEs within a single GPE block * ******************************************************************************/ acpi_status acpi_hw_clear_gpe_block(struct acpi_gpe_xrupt_info *gpe_xrupt_info, struct acpi_gpe_block_info *gpe_block, void *context) { u32 i; acpi_status status; /* Examine each GPE Register within the block */ for (i = 0; i < gpe_block->register_count; i++) { /* Clear status on all GPEs in this register */ status = acpi_hw_gpe_write(0xFF, &gpe_block->register_info[i].status_address); if (ACPI_FAILURE(status)) { return (status); } } return (AE_OK); } /****************************************************************************** * * FUNCTION: acpi_hw_enable_runtime_gpe_block * * PARAMETERS: gpe_xrupt_info - GPE Interrupt info * gpe_block - Gpe Block info * * RETURN: Status * * DESCRIPTION: Enable all "runtime" GPEs within a single GPE block. Includes * combination wake/run GPEs. * ******************************************************************************/ acpi_status acpi_hw_enable_runtime_gpe_block(struct acpi_gpe_xrupt_info *gpe_xrupt_info, struct acpi_gpe_block_info *gpe_block, void *context) { u32 i; acpi_status status; struct acpi_gpe_register_info *gpe_register_info; u8 enable_mask; /* NOTE: assumes that all GPEs are currently disabled */ /* Examine each GPE Register within the block */ for (i = 0; i < gpe_block->register_count; i++) { gpe_register_info = &gpe_block->register_info[i]; if (!gpe_register_info->enable_for_run) { continue; } /* Enable all "runtime" GPEs in this register */ enable_mask = gpe_register_info->enable_for_run & ~gpe_register_info->mask_for_run; status = acpi_hw_gpe_enable_write(enable_mask, gpe_register_info); if (ACPI_FAILURE(status)) { return (status); } } return (AE_OK); } /****************************************************************************** * * FUNCTION: acpi_hw_enable_wakeup_gpe_block * * PARAMETERS: gpe_xrupt_info - GPE Interrupt info * gpe_block - Gpe Block info * * RETURN: Status * * DESCRIPTION: Enable all "wake" GPEs within a single GPE block. Includes * combination wake/run GPEs. * ******************************************************************************/ static acpi_status acpi_hw_enable_wakeup_gpe_block(struct acpi_gpe_xrupt_info *gpe_xrupt_info, struct acpi_gpe_block_info *gpe_block, void *context) { u32 i; acpi_status status; struct acpi_gpe_register_info *gpe_register_info; /* Examine each GPE Register within the block */ for (i = 0; i < gpe_block->register_count; i++) { gpe_register_info = &gpe_block->register_info[i]; /* * Enable all "wake" GPEs in this register and disable the * remaining ones. */ status = acpi_hw_gpe_enable_write(gpe_register_info->enable_for_wake, gpe_register_info); if (ACPI_FAILURE(status)) { return (status); } } return (AE_OK); } struct acpi_gpe_block_status_context { struct acpi_gpe_register_info *gpe_skip_register_info; u8 gpe_skip_mask; u8 retval; }; /****************************************************************************** * * FUNCTION: acpi_hw_get_gpe_block_status * * PARAMETERS: gpe_xrupt_info - GPE Interrupt info * gpe_block - Gpe Block info * context - GPE list walk context data * * RETURN: Success * * DESCRIPTION: Produce a combined GPE status bits mask for the given block. * ******************************************************************************/ static acpi_status acpi_hw_get_gpe_block_status(struct acpi_gpe_xrupt_info *gpe_xrupt_info, struct acpi_gpe_block_info *gpe_block, void *context) { struct acpi_gpe_block_status_context *c = context; struct acpi_gpe_register_info *gpe_register_info; u64 in_enable, in_status; acpi_status status; u8 ret_mask; u32 i; /* Examine each GPE Register within the block */ for (i = 0; i < gpe_block->register_count; i++) { gpe_register_info = &gpe_block->register_info[i]; status = acpi_hw_gpe_read(&in_enable, &gpe_register_info->enable_address); if (ACPI_FAILURE(status)) { continue; } status = acpi_hw_gpe_read(&in_status, &gpe_register_info->status_address); if (ACPI_FAILURE(status)) { continue; } ret_mask = in_enable & in_status; if (ret_mask && c->gpe_skip_register_info == gpe_register_info) { ret_mask &= ~c->gpe_skip_mask; } c->retval |= ret_mask; } return (AE_OK); } /****************************************************************************** * * FUNCTION: acpi_hw_disable_all_gpes * * PARAMETERS: None * * RETURN: Status * * DESCRIPTION: Disable and clear all GPEs in all GPE blocks * ******************************************************************************/ acpi_status acpi_hw_disable_all_gpes(void) { acpi_status status; ACPI_FUNCTION_TRACE(hw_disable_all_gpes); status = acpi_ev_walk_gpe_list(acpi_hw_disable_gpe_block, NULL); return_ACPI_STATUS(status); } /****************************************************************************** * * FUNCTION: acpi_hw_enable_all_runtime_gpes * * PARAMETERS: None * * RETURN: Status * * DESCRIPTION: Enable all "runtime" GPEs, in all GPE blocks * ******************************************************************************/ acpi_status acpi_hw_enable_all_runtime_gpes(void) { acpi_status status; ACPI_FUNCTION_TRACE(hw_enable_all_runtime_gpes); status = acpi_ev_walk_gpe_list(acpi_hw_enable_runtime_gpe_block, NULL); return_ACPI_STATUS(status); } /****************************************************************************** * * FUNCTION: acpi_hw_enable_all_wakeup_gpes * * PARAMETERS: None * * RETURN: Status * * DESCRIPTION: Enable all "wakeup" GPEs, in all GPE blocks * ******************************************************************************/ acpi_status acpi_hw_enable_all_wakeup_gpes(void) { acpi_status status; ACPI_FUNCTION_TRACE(hw_enable_all_wakeup_gpes); status = acpi_ev_walk_gpe_list(acpi_hw_enable_wakeup_gpe_block, NULL); return_ACPI_STATUS(status); } /****************************************************************************** * * FUNCTION: acpi_hw_check_all_gpes * * PARAMETERS: gpe_skip_device - GPE devoce of the GPE to skip * gpe_skip_number - Number of the GPE to skip * * RETURN: Combined status of all GPEs * * DESCRIPTION: Check all enabled GPEs in all GPE blocks, except for the one * represented by the "skip" arguments, and return TRUE if the * status bit is set for at least one of them of FALSE otherwise. * ******************************************************************************/ u8 acpi_hw_check_all_gpes(acpi_handle gpe_skip_device, u32 gpe_skip_number) { struct acpi_gpe_block_status_context context = { .gpe_skip_register_info = NULL, .retval = 0, }; struct acpi_gpe_event_info *gpe_event_info; acpi_cpu_flags flags; ACPI_FUNCTION_TRACE(acpi_hw_check_all_gpes); flags = acpi_os_acquire_lock(acpi_gbl_gpe_lock); gpe_event_info = acpi_ev_get_gpe_event_info(gpe_skip_device, gpe_skip_number); if (gpe_event_info) { context.gpe_skip_register_info = gpe_event_info->register_info; context.gpe_skip_mask = acpi_hw_get_gpe_register_bit(gpe_event_info); } acpi_os_release_lock(acpi_gbl_gpe_lock, flags); (void)acpi_ev_walk_gpe_list(acpi_hw_get_gpe_block_status, &context); return (context.retval != 0); } #endif /* !ACPI_REDUCED_HARDWARE */
116 117 117 5 5 5 5 602 625 671 671 652 15 15 37 37 37 24 24 15 15 15 15 112 110 111 113 113 514 367 310 310 310 286 285 285 310 239 59 234 233 4 4 4 4 3 3 3 4 6 6 5 5 5 5 6 5 6 3 6 3 3 1 1 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 9 10 9 6 1 1 1 4 4 5 9 10 4 1 3 1 1 1 1 1 4 34 32 1 31 1 30 22 7 8 3 2 4 3 6 1 4 4 1 6 1 3 24 1 23 1 22 22 22 34 4 3 3 1 2 3 4 30 19 19 5 1 19 30 15 15 15 15 14 15 14 14 14 14 14 14 11 14 14 14 15 92 93 93 93 93 93 11 82 10 11 11 1 11 11 11 10 11 11 82 82 82 82 80 80 82 2 2 2 2 2 2 2 82 82 81 82 82 82 82 82 82 82 29 1 5 5 5 5 1 4 5 6 5 5 5 5 486 486 208 112 172 207 82 82 80 5 82 980 155 983 942 27 11 27 127 58 58 3 59 14 983 36 125 486 486 1 1 13 13 13 13 13 486 486 486 486 486 486 13 13 13 13 13 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 // SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * IPv4 Forwarding Information Base: FIB frontend. * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> */ #include <linux/module.h> #include <linux/uaccess.h> #include <linux/bitops.h> #include <linux/capability.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/string.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/errno.h> #include <linux/in.h> #include <linux/inet.h> #include <linux/inetdevice.h> #include <linux/netdevice.h> #include <linux/if_addr.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <linux/cache.h> #include <linux/init.h> #include <linux/list.h> #include <linux/slab.h> #include <net/flow.h> #include <net/inet_dscp.h> #include <net/ip.h> #include <net/protocol.h> #include <net/route.h> #include <net/tcp.h> #include <net/sock.h> #include <net/arp.h> #include <net/ip_fib.h> #include <net/nexthop.h> #include <net/rtnetlink.h> #include <net/xfrm.h> #include <net/l3mdev.h> #include <net/lwtunnel.h> #include <trace/events/fib.h> #ifndef CONFIG_IP_MULTIPLE_TABLES static int __net_init fib4_rules_init(struct net *net) { struct fib_table *local_table, *main_table; main_table = fib_trie_table(RT_TABLE_MAIN, NULL); if (!main_table) return -ENOMEM; local_table = fib_trie_table(RT_TABLE_LOCAL, main_table); if (!local_table) goto fail; hlist_add_head_rcu(&local_table->tb_hlist, &net->ipv4.fib_table_hash[TABLE_LOCAL_INDEX]); hlist_add_head_rcu(&main_table->tb_hlist, &net->ipv4.fib_table_hash[TABLE_MAIN_INDEX]); return 0; fail: fib_free_table(main_table); return -ENOMEM; } #else struct fib_table *fib_new_table(struct net *net, u32 id) { struct fib_table *tb, *alias = NULL; unsigned int h; if (id == 0) id = RT_TABLE_MAIN; tb = fib_get_table(net, id); if (tb) return tb; if (id == RT_TABLE_LOCAL && !net->ipv4.fib_has_custom_rules) alias = fib_new_table(net, RT_TABLE_MAIN); tb = fib_trie_table(id, alias); if (!tb) return NULL; switch (id) { case RT_TABLE_MAIN: rcu_assign_pointer(net->ipv4.fib_main, tb); break; case RT_TABLE_DEFAULT: rcu_assign_pointer(net->ipv4.fib_default, tb); break; default: break; } h = id & (FIB_TABLE_HASHSZ - 1); hlist_add_head_rcu(&tb->tb_hlist, &net->ipv4.fib_table_hash[h]); return tb; } EXPORT_SYMBOL_GPL(fib_new_table); /* caller must hold either rtnl or rcu read lock */ struct fib_table *fib_get_table(struct net *net, u32 id) { struct fib_table *tb; struct hlist_head *head; unsigned int h; if (id == 0) id = RT_TABLE_MAIN; h = id & (FIB_TABLE_HASHSZ - 1); head = &net->ipv4.fib_table_hash[h]; hlist_for_each_entry_rcu(tb, head, tb_hlist, lockdep_rtnl_is_held()) { if (tb->tb_id == id) return tb; } return NULL; } #endif /* CONFIG_IP_MULTIPLE_TABLES */ static void fib_replace_table(struct net *net, struct fib_table *old, struct fib_table *new) { #ifdef CONFIG_IP_MULTIPLE_TABLES switch (new->tb_id) { case RT_TABLE_MAIN: rcu_assign_pointer(net->ipv4.fib_main, new); break; case RT_TABLE_DEFAULT: rcu_assign_pointer(net->ipv4.fib_default, new); break; default: break; } #endif /* replace the old table in the hlist */ hlist_replace_rcu(&old->tb_hlist, &new->tb_hlist); } int fib_unmerge(struct net *net) { struct fib_table *old, *new, *main_table; /* attempt to fetch local table if it has been allocated */ old = fib_get_table(net, RT_TABLE_LOCAL); if (!old) return 0; new = fib_trie_unmerge(old); if (!new) return -ENOMEM; /* table is already unmerged */ if (new == old) return 0; /* replace merged table with clean table */ fib_replace_table(net, old, new); fib_free_table(old); /* attempt to fetch main table if it has been allocated */ main_table = fib_get_table(net, RT_TABLE_MAIN); if (!main_table) return 0; /* flush local entries from main table */ fib_table_flush_external(main_table); return 0; } void fib_flush(struct net *net) { int flushed = 0; unsigned int h; for (h = 0; h < FIB_TABLE_HASHSZ; h++) { struct hlist_head *head = &net->ipv4.fib_table_hash[h]; struct hlist_node *tmp; struct fib_table *tb; hlist_for_each_entry_safe(tb, tmp, head, tb_hlist) flushed += fib_table_flush(net, tb, false); } if (flushed) rt_cache_flush(net); } /* * Find address type as if only "dev" was present in the system. If * on_dev is NULL then all interfaces are taken into consideration. */ static inline unsigned int __inet_dev_addr_type(struct net *net, const struct net_device *dev, __be32 addr, u32 tb_id) { struct flowi4 fl4 = { .daddr = addr }; struct fib_result res; unsigned int ret = RTN_BROADCAST; struct fib_table *table; if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr)) return RTN_BROADCAST; if (ipv4_is_multicast(addr)) return RTN_MULTICAST; rcu_read_lock(); table = fib_get_table(net, tb_id); if (table) { ret = RTN_UNICAST; if (!fib_table_lookup(table, &fl4, &res, FIB_LOOKUP_NOREF)) { struct fib_nh_common *nhc = fib_info_nhc(res.fi, 0); if (!dev || dev == nhc->nhc_dev) ret = res.type; } } rcu_read_unlock(); return ret; } unsigned int inet_addr_type_table(struct net *net, __be32 addr, u32 tb_id) { return __inet_dev_addr_type(net, NULL, addr, tb_id); } EXPORT_SYMBOL(inet_addr_type_table); unsigned int inet_addr_type(struct net *net, __be32 addr) { return __inet_dev_addr_type(net, NULL, addr, RT_TABLE_LOCAL); } EXPORT_SYMBOL(inet_addr_type); unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev, __be32 addr) { u32 rt_table = l3mdev_fib_table(dev) ? : RT_TABLE_LOCAL; return __inet_dev_addr_type(net, dev, addr, rt_table); } EXPORT_SYMBOL(inet_dev_addr_type); /* inet_addr_type with dev == NULL but using the table from a dev * if one is associated */ unsigned int inet_addr_type_dev_table(struct net *net, const struct net_device *dev, __be32 addr) { u32 rt_table = l3mdev_fib_table(dev) ? : RT_TABLE_LOCAL; return __inet_dev_addr_type(net, NULL, addr, rt_table); } EXPORT_SYMBOL(inet_addr_type_dev_table); __be32 fib_compute_spec_dst(struct sk_buff *skb) { struct net_device *dev = skb->dev; struct in_device *in_dev; struct fib_result res; struct rtable *rt; struct net *net; int scope; rt = skb_rtable(skb); if ((rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL)) == RTCF_LOCAL) return ip_hdr(skb)->daddr; in_dev = __in_dev_get_rcu(dev); net = dev_net(dev); scope = RT_SCOPE_UNIVERSE; if (!ipv4_is_zeronet(ip_hdr(skb)->saddr)) { bool vmark = in_dev && IN_DEV_SRC_VMARK(in_dev); struct flowi4 fl4 = { .flowi4_iif = LOOPBACK_IFINDEX, .flowi4_l3mdev = l3mdev_master_ifindex_rcu(dev), .daddr = ip_hdr(skb)->saddr, .flowi4_dscp = ip4h_dscp(ip_hdr(skb)), .flowi4_scope = scope, .flowi4_mark = vmark ? skb->mark : 0, }; if (!fib_lookup(net, &fl4, &res, 0)) return fib_result_prefsrc(net, &res); } else { scope = RT_SCOPE_LINK; } return inet_select_addr(dev, ip_hdr(skb)->saddr, scope); } bool fib_info_nh_uses_dev(struct fib_info *fi, const struct net_device *dev) { bool dev_match = false; #ifdef CONFIG_IP_ROUTE_MULTIPATH if (unlikely(fi->nh)) { dev_match = nexthop_uses_dev(fi->nh, dev); } else { int ret; for (ret = 0; ret < fib_info_num_path(fi); ret++) { const struct fib_nh_common *nhc = fib_info_nhc(fi, ret); if (nhc_l3mdev_matches_dev(nhc, dev)) { dev_match = true; break; } } } #else if (fib_info_nhc(fi, 0)->nhc_dev == dev) dev_match = true; #endif return dev_match; } EXPORT_SYMBOL_GPL(fib_info_nh_uses_dev); /* Given (packet source, input interface) and optional (dst, oif, tos): * - (main) check, that source is valid i.e. not broadcast or our local * address. * - figure out what "logical" interface this packet arrived * and calculate "specific destination" address. * - check, that packet arrived from expected physical interface. * called with rcu_read_lock() */ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, dscp_t dscp, int oif, struct net_device *dev, int rpf, struct in_device *idev, u32 *itag) { struct net *net = dev_net(dev); enum skb_drop_reason reason; struct flow_keys flkeys; int ret, no_addr; struct fib_result res; struct flowi4 fl4; bool dev_match; fl4.flowi4_oif = 0; fl4.flowi4_l3mdev = l3mdev_master_ifindex_rcu(dev); fl4.flowi4_iif = oif ? : LOOPBACK_IFINDEX; fl4.daddr = src; fl4.saddr = dst; fl4.flowi4_dscp = dscp; fl4.flowi4_scope = RT_SCOPE_UNIVERSE; fl4.flowi4_tun_key.tun_id = 0; fl4.flowi4_flags = 0; fl4.flowi4_uid = sock_net_uid(net, NULL); fl4.flowi4_multipath_hash = 0; no_addr = idev->ifa_list == NULL; fl4.flowi4_mark = IN_DEV_SRC_VMARK(idev) ? skb->mark : 0; if (!fib4_rules_early_flow_dissect(net, skb, &fl4, &flkeys)) { fl4.flowi4_proto = 0; fl4.fl4_sport = 0; fl4.fl4_dport = 0; } else { swap(fl4.fl4_sport, fl4.fl4_dport); } if (fib_lookup(net, &fl4, &res, 0)) goto last_resort; if (res.type != RTN_UNICAST) { if (res.type != RTN_LOCAL) { reason = SKB_DROP_REASON_IP_INVALID_SOURCE; goto e_inval; } else if (!IN_DEV_ACCEPT_LOCAL(idev)) { reason = SKB_DROP_REASON_IP_LOCAL_SOURCE; goto e_inval; } } fib_combine_itag(itag, &res); dev_match = fib_info_nh_uses_dev(res.fi, dev); /* This is not common, loopback packets retain skb_dst so normally they * would not even hit this slow path. */ dev_match = dev_match || (res.type == RTN_LOCAL && dev == net->loopback_dev); if (dev_match) { ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_HOST; return ret; } if (no_addr) goto last_resort; if (rpf == 1) goto e_rpf; fl4.flowi4_oif = dev->ifindex; ret = 0; if (fib_lookup(net, &fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE) == 0) { if (res.type == RTN_UNICAST) ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_HOST; } return ret; last_resort: if (rpf) goto e_rpf; *itag = 0; return 0; e_inval: return -reason; e_rpf: return -SKB_DROP_REASON_IP_RPFILTER; } /* Ignore rp_filter for packets protected by IPsec. */ int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, dscp_t dscp, int oif, struct net_device *dev, struct in_device *idev, u32 *itag) { int r = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(idev); struct net *net = dev_net(dev); if (!r && !fib_num_tclassid_users(net) && (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev))) { if (IN_DEV_ACCEPT_LOCAL(idev)) goto ok; /* with custom local routes in place, checking local addresses * only will be too optimistic, with custom rules, checking * local addresses only can be too strict, e.g. due to vrf */ if (net->ipv4.fib_has_custom_local_routes || fib4_has_custom_rules(net)) goto full_check; /* Within the same container, it is regarded as a martian source, * and the same host but different containers are not. */ if (inet_lookup_ifaddr_rcu(net, src)) return -SKB_DROP_REASON_IP_LOCAL_SOURCE; ok: *itag = 0; return 0; } full_check: return __fib_validate_source(skb, src, dst, dscp, oif, dev, r, idev, itag); } static inline __be32 sk_extract_addr(struct sockaddr *addr) { return ((struct sockaddr_in *) addr)->sin_addr.s_addr; } static int put_rtax(struct nlattr *mx, int len, int type, u32 value) { struct nlattr *nla; nla = (struct nlattr *) ((char *) mx + len); nla->nla_type = type; nla->nla_len = nla_attr_size(4); *(u32 *) nla_data(nla) = value; return len + nla_total_size(4); } static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt, struct fib_config *cfg) { __be32 addr; int plen; memset(cfg, 0, sizeof(*cfg)); cfg->fc_nlinfo.nl_net = net; if (rt->rt_dst.sa_family != AF_INET) return -EAFNOSUPPORT; /* * Check mask for validity: * a) it must be contiguous. * b) destination must have all host bits clear. * c) if application forgot to set correct family (AF_INET), * reject request unless it is absolutely clear i.e. * both family and mask are zero. */ plen = 32; addr = sk_extract_addr(&rt->rt_dst); if (!(rt->rt_flags & RTF_HOST)) { __be32 mask = sk_extract_addr(&rt->rt_genmask); if (rt->rt_genmask.sa_family != AF_INET) { if (mask || rt->rt_genmask.sa_family) return -EAFNOSUPPORT; } if (bad_mask(mask, addr)) return -EINVAL; plen = inet_mask_len(mask); } cfg->fc_dst_len = plen; cfg->fc_dst = addr; if (cmd != SIOCDELRT) { cfg->fc_nlflags = NLM_F_CREATE; cfg->fc_protocol = RTPROT_BOOT; } if (rt->rt_metric) cfg->fc_priority = rt->rt_metric - 1; if (rt->rt_flags & RTF_REJECT) { cfg->fc_scope = RT_SCOPE_HOST; cfg->fc_type = RTN_UNREACHABLE; return 0; } cfg->fc_scope = RT_SCOPE_NOWHERE; cfg->fc_type = RTN_UNICAST; if (rt->rt_dev) { char *colon; struct net_device *dev; char devname[IFNAMSIZ]; if (copy_from_user(devname, rt->rt_dev, IFNAMSIZ-1)) return -EFAULT; devname[IFNAMSIZ-1] = 0; colon = strchr(devname, ':'); if (colon) *colon = 0; dev = __dev_get_by_name(net, devname); if (!dev) return -ENODEV; cfg->fc_oif = dev->ifindex; cfg->fc_table = l3mdev_fib_table(dev); if (colon) { const struct in_ifaddr *ifa; struct in_device *in_dev; in_dev = __in_dev_get_rtnl_net(dev); if (!in_dev) return -ENODEV; *colon = ':'; in_dev_for_each_ifa_rtnl_net(net, ifa, in_dev) { if (strcmp(ifa->ifa_label, devname) == 0) break; } if (!ifa) return -ENODEV; cfg->fc_prefsrc = ifa->ifa_local; } } addr = sk_extract_addr(&rt->rt_gateway); if (rt->rt_gateway.sa_family == AF_INET && addr) { unsigned int addr_type; cfg->fc_gw4 = addr; cfg->fc_gw_family = AF_INET; addr_type = inet_addr_type_table(net, addr, cfg->fc_table); if (rt->rt_flags & RTF_GATEWAY && addr_type == RTN_UNICAST) cfg->fc_scope = RT_SCOPE_UNIVERSE; } if (!cfg->fc_table) cfg->fc_table = RT_TABLE_MAIN; if (cmd == SIOCDELRT) return 0; if (rt->rt_flags & RTF_GATEWAY && !cfg->fc_gw_family) return -EINVAL; if (cfg->fc_scope == RT_SCOPE_NOWHERE) cfg->fc_scope = RT_SCOPE_LINK; if (rt->rt_flags & (RTF_MTU | RTF_WINDOW | RTF_IRTT)) { struct nlattr *mx; int len = 0; mx = kcalloc(3, nla_total_size(4), GFP_KERNEL); if (!mx) return -ENOMEM; if (rt->rt_flags & RTF_MTU) len = put_rtax(mx, len, RTAX_ADVMSS, rt->rt_mtu - 40); if (rt->rt_flags & RTF_WINDOW) len = put_rtax(mx, len, RTAX_WINDOW, rt->rt_window); if (rt->rt_flags & RTF_IRTT) len = put_rtax(mx, len, RTAX_RTT, rt->rt_irtt << 3); cfg->fc_mx = mx; cfg->fc_mx_len = len; } return 0; } /* * Handle IP routing ioctl calls. * These are used to manipulate the routing tables */ int ip_rt_ioctl(struct net *net, unsigned int cmd, struct rtentry *rt) { struct fib_config cfg; int err; switch (cmd) { case SIOCADDRT: /* Add a route */ case SIOCDELRT: /* Delete a route */ if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; rtnl_net_lock(net); err = rtentry_to_fib_config(net, cmd, rt, &cfg); if (err == 0) { struct fib_table *tb; if (cmd == SIOCDELRT) { tb = fib_get_table(net, cfg.fc_table); if (tb) err = fib_table_delete(net, tb, &cfg, NULL); else err = -ESRCH; } else { tb = fib_new_table(net, cfg.fc_table); if (tb) err = fib_table_insert(net, tb, &cfg, NULL); else err = -ENOBUFS; } /* allocated by rtentry_to_fib_config() */ kfree(cfg.fc_mx); } rtnl_net_unlock(net); return err; } return -EINVAL; } const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = { [RTA_UNSPEC] = { .strict_start_type = RTA_DPORT + 1 }, [RTA_DST] = { .type = NLA_U32 }, [RTA_SRC] = { .type = NLA_U32 }, [RTA_IIF] = { .type = NLA_U32 }, [RTA_OIF] = { .type = NLA_U32 }, [RTA_GATEWAY] = { .type = NLA_U32 }, [RTA_PRIORITY] = { .type = NLA_U32 }, [RTA_PREFSRC] = { .type = NLA_U32 }, [RTA_METRICS] = { .type = NLA_NESTED }, [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) }, [RTA_FLOW] = { .type = NLA_U32 }, [RTA_ENCAP_TYPE] = { .type = NLA_U16 }, [RTA_ENCAP] = { .type = NLA_NESTED }, [RTA_UID] = { .type = NLA_U32 }, [RTA_MARK] = { .type = NLA_U32 }, [RTA_TABLE] = { .type = NLA_U32 }, [RTA_IP_PROTO] = { .type = NLA_U8 }, [RTA_SPORT] = { .type = NLA_U16 }, [RTA_DPORT] = { .type = NLA_U16 }, [RTA_NH_ID] = { .type = NLA_U32 }, }; int fib_gw_from_via(struct fib_config *cfg, struct nlattr *nla, struct netlink_ext_ack *extack) { struct rtvia *via; int alen; if (nla_len(nla) < offsetof(struct rtvia, rtvia_addr)) { NL_SET_ERR_MSG(extack, "Invalid attribute length for RTA_VIA"); return -EINVAL; } via = nla_data(nla); alen = nla_len(nla) - offsetof(struct rtvia, rtvia_addr); switch (via->rtvia_family) { case AF_INET: if (alen != sizeof(__be32)) { NL_SET_ERR_MSG(extack, "Invalid IPv4 address in RTA_VIA"); return -EINVAL; } cfg->fc_gw_family = AF_INET; cfg->fc_gw4 = *((__be32 *)via->rtvia_addr); break; case AF_INET6: #if IS_ENABLED(CONFIG_IPV6) if (alen != sizeof(struct in6_addr)) { NL_SET_ERR_MSG(extack, "Invalid IPv6 address in RTA_VIA"); return -EINVAL; } cfg->fc_gw_family = AF_INET6; cfg->fc_gw6 = *((struct in6_addr *)via->rtvia_addr); #else NL_SET_ERR_MSG(extack, "IPv6 support not enabled in kernel"); return -EINVAL; #endif break; default: NL_SET_ERR_MSG(extack, "Unsupported address family in RTA_VIA"); return -EINVAL; } return 0; } static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh, struct fib_config *cfg, struct netlink_ext_ack *extack) { bool has_gw = false, has_via = false; struct nlattr *attr; int err, remaining; struct rtmsg *rtm; err = nlmsg_validate_deprecated(nlh, sizeof(*rtm), RTA_MAX, rtm_ipv4_policy, extack); if (err < 0) goto errout; memset(cfg, 0, sizeof(*cfg)); rtm = nlmsg_data(nlh); if (!inet_validate_dscp(rtm->rtm_tos)) { NL_SET_ERR_MSG(extack, "Invalid dsfield (tos): ECN bits must be 0"); err = -EINVAL; goto errout; } cfg->fc_dscp = inet_dsfield_to_dscp(rtm->rtm_tos); cfg->fc_dst_len = rtm->rtm_dst_len; cfg->fc_table = rtm->rtm_table; cfg->fc_protocol = rtm->rtm_protocol; cfg->fc_scope = rtm->rtm_scope; cfg->fc_type = rtm->rtm_type; cfg->fc_flags = rtm->rtm_flags; cfg->fc_nlflags = nlh->nlmsg_flags; cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid; cfg->fc_nlinfo.nlh = nlh; cfg->fc_nlinfo.nl_net = net; if (cfg->fc_type > RTN_MAX) { NL_SET_ERR_MSG(extack, "Invalid route type"); err = -EINVAL; goto errout; } nlmsg_for_each_attr(attr, nlh, sizeof(struct rtmsg), remaining) { switch (nla_type(attr)) { case RTA_DST: cfg->fc_dst = nla_get_be32(attr); break; case RTA_OIF: cfg->fc_oif = nla_get_u32(attr); break; case RTA_GATEWAY: has_gw = true; cfg->fc_gw4 = nla_get_be32(attr); if (cfg->fc_gw4) cfg->fc_gw_family = AF_INET; break; case RTA_VIA: has_via = true; err = fib_gw_from_via(cfg, attr, extack); if (err) goto errout; break; case RTA_PRIORITY: cfg->fc_priority = nla_get_u32(attr); break; case RTA_PREFSRC: cfg->fc_prefsrc = nla_get_be32(attr); break; case RTA_METRICS: cfg->fc_mx = nla_data(attr); cfg->fc_mx_len = nla_len(attr); break; case RTA_MULTIPATH: err = lwtunnel_valid_encap_type_attr(nla_data(attr), nla_len(attr), extack); if (err < 0) goto errout; cfg->fc_mp = nla_data(attr); cfg->fc_mp_len = nla_len(attr); break; case RTA_FLOW: cfg->fc_flow = nla_get_u32(attr); break; case RTA_TABLE: cfg->fc_table = nla_get_u32(attr); break; case RTA_ENCAP: cfg->fc_encap = attr; break; case RTA_ENCAP_TYPE: cfg->fc_encap_type = nla_get_u16(attr); err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack); if (err < 0) goto errout; break; case RTA_NH_ID: cfg->fc_nh_id = nla_get_u32(attr); break; } } if (cfg->fc_dst_len > 32) { NL_SET_ERR_MSG(extack, "Invalid prefix length"); err = -EINVAL; goto errout; } if (cfg->fc_dst_len < 32 && (ntohl(cfg->fc_dst) << cfg->fc_dst_len)) { NL_SET_ERR_MSG(extack, "Invalid prefix for given prefix length"); err = -EINVAL; goto errout; } if (cfg->fc_nh_id) { if (cfg->fc_oif || cfg->fc_gw_family || cfg->fc_encap || cfg->fc_mp) { NL_SET_ERR_MSG(extack, "Nexthop specification and nexthop id are mutually exclusive"); err = -EINVAL; goto errout; } } if (has_gw && has_via) { NL_SET_ERR_MSG(extack, "Nexthop configuration can not contain both GATEWAY and VIA"); err = -EINVAL; goto errout; } if (!cfg->fc_table) cfg->fc_table = RT_TABLE_MAIN; return 0; errout: return err; } static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct fib_config cfg; struct fib_table *tb; int err; err = rtm_to_fib_config(net, skb, nlh, &cfg, extack); if (err < 0) goto errout; rtnl_net_lock(net); if (cfg.fc_nh_id && !nexthop_find_by_id(net, cfg.fc_nh_id)) { NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); err = -EINVAL; goto unlock; } tb = fib_get_table(net, cfg.fc_table); if (!tb) { NL_SET_ERR_MSG(extack, "FIB table does not exist"); err = -ESRCH; goto unlock; } err = fib_table_delete(net, tb, &cfg, extack); unlock: rtnl_net_unlock(net); errout: return err; } static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct fib_config cfg; struct fib_table *tb; int err; err = rtm_to_fib_config(net, skb, nlh, &cfg, extack); if (err < 0) goto errout; rtnl_net_lock(net); tb = fib_new_table(net, cfg.fc_table); if (!tb) { err = -ENOBUFS; goto unlock; } err = fib_table_insert(net, tb, &cfg, extack); if (!err && cfg.fc_type == RTN_LOCAL) net->ipv4.fib_has_custom_local_routes = true; unlock: rtnl_net_unlock(net); errout: return err; } int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh, struct fib_dump_filter *filter, struct netlink_callback *cb) { struct netlink_ext_ack *extack = cb->extack; struct nlattr *tb[RTA_MAX + 1]; struct rtmsg *rtm; int err, i; if (filter->rtnl_held) ASSERT_RTNL(); rtm = nlmsg_payload(nlh, sizeof(*rtm)); if (!rtm) { NL_SET_ERR_MSG(extack, "Invalid header for FIB dump request"); return -EINVAL; } if (rtm->rtm_dst_len || rtm->rtm_src_len || rtm->rtm_tos || rtm->rtm_scope) { NL_SET_ERR_MSG(extack, "Invalid values in header for FIB dump request"); return -EINVAL; } if (rtm->rtm_flags & ~(RTM_F_CLONED | RTM_F_PREFIX)) { NL_SET_ERR_MSG(extack, "Invalid flags for FIB dump request"); return -EINVAL; } if (rtm->rtm_flags & RTM_F_CLONED) filter->dump_routes = false; else filter->dump_exceptions = false; filter->flags = rtm->rtm_flags; filter->protocol = rtm->rtm_protocol; filter->rt_type = rtm->rtm_type; filter->table_id = rtm->rtm_table; err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy, extack); if (err < 0) return err; for (i = 0; i <= RTA_MAX; ++i) { int ifindex; if (!tb[i]) continue; switch (i) { case RTA_TABLE: filter->table_id = nla_get_u32(tb[i]); break; case RTA_OIF: ifindex = nla_get_u32(tb[i]); if (filter->rtnl_held) filter->dev = __dev_get_by_index(net, ifindex); else filter->dev = dev_get_by_index_rcu(net, ifindex); if (!filter->dev) return -ENODEV; break; default: NL_SET_ERR_MSG(extack, "Unsupported attribute in dump request"); return -EINVAL; } } if (filter->flags || filter->protocol || filter->rt_type || filter->table_id || filter->dev) { filter->filter_set = 1; cb->answer_flags = NLM_F_DUMP_FILTERED; } return 0; } EXPORT_SYMBOL_GPL(ip_valid_fib_dump_req); static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) { struct fib_dump_filter filter = { .dump_routes = true, .dump_exceptions = true, .rtnl_held = false, }; const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); unsigned int h, s_h; unsigned int e = 0, s_e; struct fib_table *tb; struct hlist_head *head; int dumped = 0, err = 0; rcu_read_lock(); if (cb->strict_check) { err = ip_valid_fib_dump_req(net, nlh, &filter, cb); if (err < 0) goto unlock; } else if (nlmsg_len(nlh) >= sizeof(struct rtmsg)) { struct rtmsg *rtm = nlmsg_data(nlh); filter.flags = rtm->rtm_flags & (RTM_F_PREFIX | RTM_F_CLONED); } /* ipv4 does not use prefix flag */ if (filter.flags & RTM_F_PREFIX) goto unlock; if (filter.table_id) { tb = fib_get_table(net, filter.table_id); if (!tb) { if (rtnl_msg_family(cb->nlh) != PF_INET) goto unlock; NL_SET_ERR_MSG(cb->extack, "ipv4: FIB table does not exist"); err = -ENOENT; goto unlock; } err = fib_table_dump(tb, skb, cb, &filter); goto unlock; } s_h = cb->args[0]; s_e = cb->args[1]; err = 0; for (h = s_h; h < FIB_TABLE_HASHSZ; h++, s_e = 0) { e = 0; head = &net->ipv4.fib_table_hash[h]; hlist_for_each_entry_rcu(tb, head, tb_hlist) { if (e < s_e) goto next; if (dumped) memset(&cb->args[2], 0, sizeof(cb->args) - 2 * sizeof(cb->args[0])); err = fib_table_dump(tb, skb, cb, &filter); if (err < 0) goto out; dumped = 1; next: e++; } } out: cb->args[1] = e; cb->args[0] = h; unlock: rcu_read_unlock(); return err; } /* Prepare and feed intra-kernel routing request. * Really, it should be netlink message, but :-( netlink * can be not configured, so that we feed it directly * to fib engine. It is legal, because all events occur * only when netlink is already locked. */ static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifaddr *ifa, u32 rt_priority) { struct net *net = dev_net(ifa->ifa_dev->dev); u32 tb_id = l3mdev_fib_table(ifa->ifa_dev->dev); struct fib_table *tb; struct fib_config cfg = { .fc_protocol = RTPROT_KERNEL, .fc_type = type, .fc_dst = dst, .fc_dst_len = dst_len, .fc_priority = rt_priority, .fc_prefsrc = ifa->ifa_local, .fc_oif = ifa->ifa_dev->dev->ifindex, .fc_nlflags = NLM_F_CREATE | NLM_F_APPEND, .fc_nlinfo = { .nl_net = net, }, }; if (!tb_id) tb_id = (type == RTN_UNICAST) ? RT_TABLE_MAIN : RT_TABLE_LOCAL; tb = fib_new_table(net, tb_id); if (!tb) return; cfg.fc_table = tb->tb_id; if (type != RTN_LOCAL) cfg.fc_scope = RT_SCOPE_LINK; else cfg.fc_scope = RT_SCOPE_HOST; if (cmd == RTM_NEWROUTE) fib_table_insert(net, tb, &cfg, NULL); else fib_table_delete(net, tb, &cfg, NULL); } void fib_add_ifaddr(struct in_ifaddr *ifa) { struct in_device *in_dev = ifa->ifa_dev; struct net_device *dev = in_dev->dev; struct in_ifaddr *prim = ifa; __be32 mask = ifa->ifa_mask; __be32 addr = ifa->ifa_local; __be32 prefix = ifa->ifa_address & mask; if (ifa->ifa_flags & IFA_F_SECONDARY) { prim = inet_ifa_byprefix(in_dev, prefix, mask); if (!prim) { pr_warn("%s: bug: prim == NULL\n", __func__); return; } } fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim, 0); if (!(dev->flags & IFF_UP)) return; /* Add broadcast address, if it is explicitly assigned. */ if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF)) { fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim, 0); arp_invalidate(dev, ifa->ifa_broadcast, false); } if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags & IFA_F_SECONDARY) && (prefix != addr || ifa->ifa_prefixlen < 32)) { if (!(ifa->ifa_flags & IFA_F_NOPREFIXROUTE)) fib_magic(RTM_NEWROUTE, dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST, prefix, ifa->ifa_prefixlen, prim, ifa->ifa_rt_priority); /* Add the network broadcast address, when it makes sense */ if (ifa->ifa_prefixlen < 31) { fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix | ~mask, 32, prim, 0); arp_invalidate(dev, prefix | ~mask, false); } } } void fib_modify_prefix_metric(struct in_ifaddr *ifa, u32 new_metric) { __be32 prefix = ifa->ifa_address & ifa->ifa_mask; struct in_device *in_dev = ifa->ifa_dev; struct net_device *dev = in_dev->dev; if (!(dev->flags & IFF_UP) || ifa->ifa_flags & (IFA_F_SECONDARY | IFA_F_NOPREFIXROUTE) || ipv4_is_zeronet(prefix) || (prefix == ifa->ifa_local && ifa->ifa_prefixlen == 32)) return; /* add the new */ fib_magic(RTM_NEWROUTE, dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST, prefix, ifa->ifa_prefixlen, ifa, new_metric); /* delete the old */ fib_magic(RTM_DELROUTE, dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST, prefix, ifa->ifa_prefixlen, ifa, ifa->ifa_rt_priority); } /* Delete primary or secondary address. * Optionally, on secondary address promotion consider the addresses * from subnet iprim as deleted, even if they are in device list. * In this case the secondary ifa can be in device list. */ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim) { struct in_device *in_dev = ifa->ifa_dev; struct net_device *dev = in_dev->dev; struct in_ifaddr *ifa1; struct in_ifaddr *prim = ifa, *prim1 = NULL; __be32 brd = ifa->ifa_address | ~ifa->ifa_mask; __be32 any = ifa->ifa_address & ifa->ifa_mask; #define LOCAL_OK 1 #define BRD_OK 2 #define BRD0_OK 4 #define BRD1_OK 8 unsigned int ok = 0; int subnet = 0; /* Primary network */ int gone = 1; /* Address is missing */ int same_prefsrc = 0; /* Another primary with same IP */ if (ifa->ifa_flags & IFA_F_SECONDARY) { prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask); if (!prim) { /* if the device has been deleted, we don't perform * address promotion */ if (!in_dev->dead) pr_warn("%s: bug: prim == NULL\n", __func__); return; } if (iprim && iprim != prim) { pr_warn("%s: bug: iprim != prim\n", __func__); return; } } else if (!ipv4_is_zeronet(any) && (any != ifa->ifa_local || ifa->ifa_prefixlen < 32)) { if (!(ifa->ifa_flags & IFA_F_NOPREFIXROUTE)) fib_magic(RTM_DELROUTE, dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST, any, ifa->ifa_prefixlen, prim, 0); subnet = 1; } if (in_dev->dead) goto no_promotions; /* Deletion is more complicated than add. * We should take care of not to delete too much :-) * * Scan address list to be sure that addresses are really gone. */ rcu_read_lock(); in_dev_for_each_ifa_rcu(ifa1, in_dev) { if (ifa1 == ifa) { /* promotion, keep the IP */ gone = 0; continue; } /* Ignore IFAs from our subnet */ if (iprim && ifa1->ifa_mask == iprim->ifa_mask && inet_ifa_match(ifa1->ifa_address, iprim)) continue; /* Ignore ifa1 if it uses different primary IP (prefsrc) */ if (ifa1->ifa_flags & IFA_F_SECONDARY) { /* Another address from our subnet? */ if (ifa1->ifa_mask == prim->ifa_mask && inet_ifa_match(ifa1->ifa_address, prim)) prim1 = prim; else { /* We reached the secondaries, so * same_prefsrc should be determined. */ if (!same_prefsrc) continue; /* Search new prim1 if ifa1 is not * using the current prim1 */ if (!prim1 || ifa1->ifa_mask != prim1->ifa_mask || !inet_ifa_match(ifa1->ifa_address, prim1)) prim1 = inet_ifa_byprefix(in_dev, ifa1->ifa_address, ifa1->ifa_mask); if (!prim1) continue; if (prim1->ifa_local != prim->ifa_local) continue; } } else { if (prim->ifa_local != ifa1->ifa_local) continue; prim1 = ifa1; if (prim != prim1) same_prefsrc = 1; } if (ifa->ifa_local == ifa1->ifa_local) ok |= LOCAL_OK; if (ifa->ifa_broadcast == ifa1->ifa_broadcast) ok |= BRD_OK; if (brd == ifa1->ifa_broadcast) ok |= BRD1_OK; if (any == ifa1->ifa_broadcast) ok |= BRD0_OK; /* primary has network specific broadcasts */ if (prim1 == ifa1 && ifa1->ifa_prefixlen < 31) { __be32 brd1 = ifa1->ifa_address | ~ifa1->ifa_mask; __be32 any1 = ifa1->ifa_address & ifa1->ifa_mask; if (!ipv4_is_zeronet(any1)) { if (ifa->ifa_broadcast == brd1 || ifa->ifa_broadcast == any1) ok |= BRD_OK; if (brd == brd1 || brd == any1) ok |= BRD1_OK; if (any == brd1 || any == any1) ok |= BRD0_OK; } } } rcu_read_unlock(); no_promotions: if (!(ok & BRD_OK)) fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim, 0); if (subnet && ifa->ifa_prefixlen < 31) { if (!(ok & BRD1_OK)) fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim, 0); if (!(ok & BRD0_OK)) fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim, 0); } if (!(ok & LOCAL_OK)) { unsigned int addr_type; fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim, 0); /* Check, that this local address finally disappeared. */ addr_type = inet_addr_type_dev_table(dev_net(dev), dev, ifa->ifa_local); if (gone && addr_type != RTN_LOCAL) { /* And the last, but not the least thing. * We must flush stray FIB entries. * * First of all, we scan fib_info list searching * for stray nexthop entries, then ignite fib_flush. */ if (fib_sync_down_addr(dev, ifa->ifa_local)) fib_flush(dev_net(dev)); } } #undef LOCAL_OK #undef BRD_OK #undef BRD0_OK #undef BRD1_OK } static void nl_fib_lookup(struct net *net, struct fib_result_nl *frn) { struct fib_result res; struct flowi4 fl4 = { .flowi4_mark = frn->fl_mark, .daddr = frn->fl_addr, .flowi4_dscp = inet_dsfield_to_dscp(frn->fl_tos), .flowi4_scope = frn->fl_scope, }; struct fib_table *tb; rcu_read_lock(); tb = fib_get_table(net, frn->tb_id_in); frn->err = -ENOENT; if (tb) { local_bh_disable(); frn->tb_id = tb->tb_id; frn->err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF); if (!frn->err) { frn->prefixlen = res.prefixlen; frn->nh_sel = res.nh_sel; frn->type = res.type; frn->scope = res.scope; } local_bh_enable(); } rcu_read_unlock(); } static void nl_fib_input(struct sk_buff *skb) { struct net *net; struct fib_result_nl *frn; struct nlmsghdr *nlh; u32 portid; net = sock_net(skb->sk); nlh = nlmsg_hdr(skb); if (skb->len < nlmsg_total_size(sizeof(*frn)) || skb->len < nlh->nlmsg_len || nlmsg_len(nlh) < sizeof(*frn)) return; skb = netlink_skb_clone(skb, GFP_KERNEL); if (!skb) return; nlh = nlmsg_hdr(skb); frn = nlmsg_data(nlh); nl_fib_lookup(net, frn); portid = NETLINK_CB(skb).portid; /* netlink portid */ NETLINK_CB(skb).portid = 0; /* from kernel */ NETLINK_CB(skb).dst_group = 0; /* unicast */ nlmsg_unicast(net->ipv4.fibnl, skb, portid); } static int __net_init nl_fib_lookup_init(struct net *net) { struct sock *sk; struct netlink_kernel_cfg cfg = { .input = nl_fib_input, }; sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, &cfg); if (!sk) return -EAFNOSUPPORT; net->ipv4.fibnl = sk; return 0; } static void nl_fib_lookup_exit(struct net *net) { netlink_kernel_release(net->ipv4.fibnl); net->ipv4.fibnl = NULL; } static void fib_disable_ip(struct net_device *dev, unsigned long event, bool force) { if (fib_sync_down_dev(dev, event, force)) fib_flush(dev_net(dev)); else rt_cache_flush(dev_net(dev)); arp_ifdown(dev); } static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr) { struct in_ifaddr *ifa = ptr; struct net_device *dev = ifa->ifa_dev->dev; struct net *net = dev_net(dev); switch (event) { case NETDEV_UP: fib_add_ifaddr(ifa); #ifdef CONFIG_IP_ROUTE_MULTIPATH fib_sync_up(dev, RTNH_F_DEAD); #endif atomic_inc(&net->ipv4.dev_addr_genid); rt_cache_flush(net); break; case NETDEV_DOWN: fib_del_ifaddr(ifa, NULL); atomic_inc(&net->ipv4.dev_addr_genid); if (!ifa->ifa_dev->ifa_list) { /* Last address was deleted from this interface. * Disable IP. */ fib_disable_ip(dev, event, true); } else { rt_cache_flush(net); } break; } return NOTIFY_DONE; } static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct netdev_notifier_changeupper_info *upper_info = ptr; struct netdev_notifier_info_ext *info_ext = ptr; struct in_device *in_dev; struct net *net = dev_net(dev); struct in_ifaddr *ifa; unsigned int flags; if (event == NETDEV_UNREGISTER) { fib_disable_ip(dev, event, true); rt_flush_dev(dev); return NOTIFY_DONE; } in_dev = __in_dev_get_rtnl(dev); if (!in_dev) return NOTIFY_DONE; switch (event) { case NETDEV_UP: in_dev_for_each_ifa_rtnl(ifa, in_dev) { fib_add_ifaddr(ifa); } #ifdef CONFIG_IP_ROUTE_MULTIPATH fib_sync_up(dev, RTNH_F_DEAD); #endif atomic_inc(&net->ipv4.dev_addr_genid); rt_cache_flush(net); break; case NETDEV_DOWN: fib_disable_ip(dev, event, false); break; case NETDEV_CHANGE: flags = netif_get_flags(dev); if (flags & (IFF_RUNNING | IFF_LOWER_UP)) fib_sync_up(dev, RTNH_F_LINKDOWN); else fib_sync_down_dev(dev, event, false); rt_cache_flush(net); break; case NETDEV_CHANGEMTU: fib_sync_mtu(dev, info_ext->ext.mtu); rt_cache_flush(net); break; case NETDEV_CHANGEUPPER: upper_info = ptr; /* flush all routes if dev is linked to or unlinked from * an L3 master device (e.g., VRF) */ if (upper_info->upper_dev && netif_is_l3_master(upper_info->upper_dev)) fib_disable_ip(dev, NETDEV_DOWN, true); break; } return NOTIFY_DONE; } static struct notifier_block fib_inetaddr_notifier = { .notifier_call = fib_inetaddr_event, }; static struct notifier_block fib_netdev_notifier = { .notifier_call = fib_netdev_event, }; static int __net_init ip_fib_net_init(struct net *net) { int err; size_t size = sizeof(struct hlist_head) * FIB_TABLE_HASHSZ; err = fib4_notifier_init(net); if (err) return err; #ifdef CONFIG_IP_ROUTE_MULTIPATH /* Default to 3-tuple */ net->ipv4.sysctl_fib_multipath_hash_fields = FIB_MULTIPATH_HASH_FIELD_DEFAULT_MASK; #endif /* Avoid false sharing : Use at least a full cache line */ size = max_t(size_t, size, L1_CACHE_BYTES); net->ipv4.fib_table_hash = kzalloc(size, GFP_KERNEL); if (!net->ipv4.fib_table_hash) { err = -ENOMEM; goto err_table_hash_alloc; } err = fib4_rules_init(net); if (err < 0) goto err_rules_init; return 0; err_rules_init: kfree(net->ipv4.fib_table_hash); err_table_hash_alloc: fib4_notifier_exit(net); return err; } static void ip_fib_net_exit(struct net *net) { int i; ASSERT_RTNL_NET(net); #ifdef CONFIG_IP_MULTIPLE_TABLES RCU_INIT_POINTER(net->ipv4.fib_main, NULL); RCU_INIT_POINTER(net->ipv4.fib_default, NULL); #endif /* Destroy the tables in reverse order to guarantee that the * local table, ID 255, is destroyed before the main table, ID * 254. This is necessary as the local table may contain * references to data contained in the main table. */ for (i = FIB_TABLE_HASHSZ - 1; i >= 0; i--) { struct hlist_head *head = &net->ipv4.fib_table_hash[i]; struct hlist_node *tmp; struct fib_table *tb; hlist_for_each_entry_safe(tb, tmp, head, tb_hlist) { hlist_del(&tb->tb_hlist); fib_table_flush(net, tb, true); fib_free_table(tb); } } #ifdef CONFIG_IP_MULTIPLE_TABLES fib4_rules_exit(net); #endif kfree(net->ipv4.fib_table_hash); fib4_notifier_exit(net); } static int __net_init fib_net_init(struct net *net) { int error; #ifdef CONFIG_IP_ROUTE_CLASSID atomic_set(&net->ipv4.fib_num_tclassid_users, 0); #endif error = ip_fib_net_init(net); if (error < 0) goto out; error = fib4_semantics_init(net); if (error) goto out_semantics; error = nl_fib_lookup_init(net); if (error < 0) goto out_nlfl; error = fib_proc_init(net); if (error < 0) goto out_proc; out: return error; out_proc: nl_fib_lookup_exit(net); out_nlfl: fib4_semantics_exit(net); out_semantics: rtnl_net_lock(net); ip_fib_net_exit(net); rtnl_net_unlock(net); goto out; } static void __net_exit fib_net_exit(struct net *net) { fib_proc_exit(net); nl_fib_lookup_exit(net); } static void __net_exit fib_net_exit_batch(struct list_head *net_list) { struct net *net; rtnl_lock(); list_for_each_entry(net, net_list, exit_list) { __rtnl_net_lock(net); ip_fib_net_exit(net); __rtnl_net_unlock(net); } rtnl_unlock(); list_for_each_entry(net, net_list, exit_list) fib4_semantics_exit(net); } static struct pernet_operations fib_net_ops = { .init = fib_net_init, .exit = fib_net_exit, .exit_batch = fib_net_exit_batch, }; static const struct rtnl_msg_handler fib_rtnl_msg_handlers[] __initconst = { {.protocol = PF_INET, .msgtype = RTM_NEWROUTE, .doit = inet_rtm_newroute, .flags = RTNL_FLAG_DOIT_PERNET}, {.protocol = PF_INET, .msgtype = RTM_DELROUTE, .doit = inet_rtm_delroute, .flags = RTNL_FLAG_DOIT_PERNET}, {.protocol = PF_INET, .msgtype = RTM_GETROUTE, .dumpit = inet_dump_fib, .flags = RTNL_FLAG_DUMP_UNLOCKED | RTNL_FLAG_DUMP_SPLIT_NLM_DONE}, }; void __init ip_fib_init(void) { fib_trie_init(); register_pernet_subsys(&fib_net_ops); register_netdevice_notifier(&fib_netdev_notifier); register_inetaddr_notifier(&fib_inetaddr_notifier); rtnl_register_many(fib_rtnl_msg_handlers); }
359 358 362 109 8 360 8 8 7 356 359 275 276 2 2 2 2 2 2 2 565 4 561 206 205 64 583 152 57 97 413 412 412 413 44 44 142 141 47 47 99 152 57 152 97 56 55 351 351 349 347 10 271 270 271 269 8 8 8 8 1 8 1 99 7 92 93 92 91 90 1 88 8 1 89 22 93 93 98 198 198 195 71 14 14 14 10 14 1 1 14 112 112 87 112 112 20 112 14 112 112 71 112 112 10 102 112 271 269 84 271 152 152 99 55 97 275 275 274 275 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 // SPDX-License-Identifier: GPL-2.0 #include <linux/types.h> #include <linux/errno.h> #include <linux/kmod.h> #include <linux/sched.h> #include <linux/interrupt.h> #include <linux/tty.h> #include <linux/tty_driver.h> #include <linux/file.h> #include <linux/mm.h> #include <linux/string.h> #include <linux/slab.h> #include <linux/poll.h> #include <linux/proc_fs.h> #include <linux/module.h> #include <linux/device.h> #include <linux/wait.h> #include <linux/bitops.h> #include <linux/seq_file.h> #include <linux/uaccess.h> #include <linux/ratelimit.h> #include "tty.h" #undef LDISC_DEBUG_HANGUP #ifdef LDISC_DEBUG_HANGUP #define tty_ldisc_debug(tty, f, args...) tty_debug(tty, f, ##args) #else #define tty_ldisc_debug(tty, f, args...) #endif /* lockdep nested classes for tty->ldisc_sem */ enum { LDISC_SEM_NORMAL, LDISC_SEM_OTHER, }; /* * This guards the refcounted line discipline lists. The lock * must be taken with irqs off because there are hangup path * callers who will do ldisc lookups and cannot sleep. */ static DEFINE_RAW_SPINLOCK(tty_ldiscs_lock); /* Line disc dispatch table */ static struct tty_ldisc_ops *tty_ldiscs[NR_LDISCS]; /** * tty_register_ldisc - install a line discipline * @new_ldisc: pointer to the ldisc object * * Installs a new line discipline into the kernel. The discipline is set up as * unreferenced and then made available to the kernel from this point onwards. * * Locking: takes %tty_ldiscs_lock to guard against ldisc races */ int tty_register_ldisc(struct tty_ldisc_ops *new_ldisc) { unsigned long flags; if (new_ldisc->num < N_TTY || new_ldisc->num >= NR_LDISCS) return -EINVAL; raw_spin_lock_irqsave(&tty_ldiscs_lock, flags); tty_ldiscs[new_ldisc->num] = new_ldisc; raw_spin_unlock_irqrestore(&tty_ldiscs_lock, flags); return 0; } EXPORT_SYMBOL(tty_register_ldisc); /** * tty_unregister_ldisc - unload a line discipline * @ldisc: ldisc number * * Remove a line discipline from the kernel providing it is not currently in * use. * * Locking: takes %tty_ldiscs_lock to guard against ldisc races */ void tty_unregister_ldisc(struct tty_ldisc_ops *ldisc) { unsigned long flags; raw_spin_lock_irqsave(&tty_ldiscs_lock, flags); tty_ldiscs[ldisc->num] = NULL; raw_spin_unlock_irqrestore(&tty_ldiscs_lock, flags); } EXPORT_SYMBOL(tty_unregister_ldisc); static struct tty_ldisc_ops *get_ldops(int disc) { unsigned long flags; struct tty_ldisc_ops *ldops, *ret; raw_spin_lock_irqsave(&tty_ldiscs_lock, flags); ret = ERR_PTR(-EINVAL); ldops = tty_ldiscs[disc]; if (ldops) { ret = ERR_PTR(-EAGAIN); if (try_module_get(ldops->owner)) ret = ldops; } raw_spin_unlock_irqrestore(&tty_ldiscs_lock, flags); return ret; } static void put_ldops(struct tty_ldisc_ops *ldops) { unsigned long flags; raw_spin_lock_irqsave(&tty_ldiscs_lock, flags); module_put(ldops->owner); raw_spin_unlock_irqrestore(&tty_ldiscs_lock, flags); } int tty_ldisc_autoload = IS_BUILTIN(CONFIG_LDISC_AUTOLOAD); /** * tty_ldisc_get - take a reference to an ldisc * @tty: tty device * @disc: ldisc number * * Takes a reference to a line discipline. Deals with refcounts and module * locking counts. If the discipline is not available, its module loaded, if * possible. * * Returns: * * -%EINVAL if the discipline index is not [%N_TTY .. %NR_LDISCS] or if the * discipline is not registered * * -%EAGAIN if request_module() failed to load or register the discipline * * -%ENOMEM if allocation failure * * Otherwise, returns a pointer to the discipline and bumps the ref count * * Locking: takes %tty_ldiscs_lock to guard against ldisc races */ static struct tty_ldisc *tty_ldisc_get(struct tty_struct *tty, int disc) { struct tty_ldisc *ld; struct tty_ldisc_ops *ldops; if (disc < N_TTY || disc >= NR_LDISCS) return ERR_PTR(-EINVAL); /* * Get the ldisc ops - we may need to request them to be loaded * dynamically and try again. */ ldops = get_ldops(disc); if (IS_ERR(ldops)) { if (!capable(CAP_SYS_MODULE) && !tty_ldisc_autoload) return ERR_PTR(-EPERM); request_module("tty-ldisc-%d", disc); ldops = get_ldops(disc); if (IS_ERR(ldops)) return ERR_CAST(ldops); } /* * There is no way to handle allocation failure of only 16 bytes. * Let's simplify error handling and save more memory. */ ld = kmalloc(sizeof(struct tty_ldisc), GFP_KERNEL | __GFP_NOFAIL); ld->ops = ldops; ld->tty = tty; return ld; } /** * tty_ldisc_put - release the ldisc * @ld: lisdsc to release * * Complement of tty_ldisc_get(). */ static void tty_ldisc_put(struct tty_ldisc *ld) { if (WARN_ON_ONCE(!ld)) return; put_ldops(ld->ops); kfree(ld); } static void *tty_ldiscs_seq_start(struct seq_file *m, loff_t *pos) { return (*pos < NR_LDISCS) ? pos : NULL; } static void *tty_ldiscs_seq_next(struct seq_file *m, void *v, loff_t *pos) { (*pos)++; return (*pos < NR_LDISCS) ? pos : NULL; } static void tty_ldiscs_seq_stop(struct seq_file *m, void *v) { } static int tty_ldiscs_seq_show(struct seq_file *m, void *v) { int i = *(loff_t *)v; struct tty_ldisc_ops *ldops; ldops = get_ldops(i); if (IS_ERR(ldops)) return 0; seq_printf(m, "%-10s %2d\n", ldops->name ? ldops->name : "???", i); put_ldops(ldops); return 0; } const struct seq_operations tty_ldiscs_seq_ops = { .start = tty_ldiscs_seq_start, .next = tty_ldiscs_seq_next, .stop = tty_ldiscs_seq_stop, .show = tty_ldiscs_seq_show, }; /** * tty_ldisc_ref_wait - wait for the tty ldisc * @tty: tty device * * Dereference the line discipline for the terminal and take a reference to it. * If the line discipline is in flux then wait patiently until it changes. * * Returns: %NULL if the tty has been hungup and not re-opened with a new file * descriptor, otherwise valid ldisc reference * * Note 1: Must not be called from an IRQ/timer context. The caller must also * be careful not to hold other locks that will deadlock against a discipline * change, such as an existing ldisc reference (which we check for). * * Note 2: a file_operations routine (read/poll/write) should use this function * to wait for any ldisc lifetime events to finish. */ struct tty_ldisc *tty_ldisc_ref_wait(struct tty_struct *tty) { struct tty_ldisc *ld; ldsem_down_read(&tty->ldisc_sem, MAX_SCHEDULE_TIMEOUT); ld = tty->ldisc; if (!ld) ldsem_up_read(&tty->ldisc_sem); return ld; } EXPORT_SYMBOL_GPL(tty_ldisc_ref_wait); /** * tty_ldisc_ref - get the tty ldisc * @tty: tty device * * Dereference the line discipline for the terminal and take a reference to it. * If the line discipline is in flux then return %NULL. Can be called from IRQ * and timer functions. */ struct tty_ldisc *tty_ldisc_ref(struct tty_struct *tty) { struct tty_ldisc *ld = NULL; if (ldsem_down_read_trylock(&tty->ldisc_sem)) { ld = tty->ldisc; if (!ld) ldsem_up_read(&tty->ldisc_sem); } return ld; } EXPORT_SYMBOL_GPL(tty_ldisc_ref); /** * tty_ldisc_deref - free a tty ldisc reference * @ld: reference to free up * * Undoes the effect of tty_ldisc_ref() or tty_ldisc_ref_wait(). May be called * in IRQ context. */ void tty_ldisc_deref(struct tty_ldisc *ld) { ldsem_up_read(&ld->tty->ldisc_sem); } EXPORT_SYMBOL_GPL(tty_ldisc_deref); static inline int __tty_ldisc_lock(struct tty_struct *tty, unsigned long timeout) { return ldsem_down_write(&tty->ldisc_sem, timeout); } static inline int __tty_ldisc_lock_nested(struct tty_struct *tty, unsigned long timeout) { return ldsem_down_write_nested(&tty->ldisc_sem, LDISC_SEM_OTHER, timeout); } static inline void __tty_ldisc_unlock(struct tty_struct *tty) { ldsem_up_write(&tty->ldisc_sem); } int tty_ldisc_lock(struct tty_struct *tty, unsigned long timeout) { int ret; /* Kindly asking blocked readers to release the read side */ set_bit(TTY_LDISC_CHANGING, &tty->flags); wake_up_interruptible_all(&tty->read_wait); wake_up_interruptible_all(&tty->write_wait); ret = __tty_ldisc_lock(tty, timeout); if (!ret) return -EBUSY; set_bit(TTY_LDISC_HALTED, &tty->flags); return 0; } void tty_ldisc_unlock(struct tty_struct *tty) { clear_bit(TTY_LDISC_HALTED, &tty->flags); /* Can be cleared here - ldisc_unlock will wake up writers firstly */ clear_bit(TTY_LDISC_CHANGING, &tty->flags); __tty_ldisc_unlock(tty); } static int tty_ldisc_lock_pair_timeout(struct tty_struct *tty, struct tty_struct *tty2, unsigned long timeout) { int ret; if (tty < tty2) { ret = __tty_ldisc_lock(tty, timeout); if (ret) { ret = __tty_ldisc_lock_nested(tty2, timeout); if (!ret) __tty_ldisc_unlock(tty); } } else { /* if this is possible, it has lots of implications */ WARN_ON_ONCE(tty == tty2); if (tty2 && tty != tty2) { ret = __tty_ldisc_lock(tty2, timeout); if (ret) { ret = __tty_ldisc_lock_nested(tty, timeout); if (!ret) __tty_ldisc_unlock(tty2); } } else ret = __tty_ldisc_lock(tty, timeout); } if (!ret) return -EBUSY; set_bit(TTY_LDISC_HALTED, &tty->flags); if (tty2) set_bit(TTY_LDISC_HALTED, &tty2->flags); return 0; } static void tty_ldisc_lock_pair(struct tty_struct *tty, struct tty_struct *tty2) { tty_ldisc_lock_pair_timeout(tty, tty2, MAX_SCHEDULE_TIMEOUT); } static void tty_ldisc_unlock_pair(struct tty_struct *tty, struct tty_struct *tty2) { __tty_ldisc_unlock(tty); if (tty2) __tty_ldisc_unlock(tty2); } /** * tty_ldisc_flush - flush line discipline queue * @tty: tty to flush ldisc for * * Flush the line discipline queue (if any) and the tty flip buffers for this * @tty. */ void tty_ldisc_flush(struct tty_struct *tty) { struct tty_ldisc *ld = tty_ldisc_ref(tty); tty_buffer_flush(tty, ld); if (ld) tty_ldisc_deref(ld); } EXPORT_SYMBOL_GPL(tty_ldisc_flush); /** * tty_set_termios_ldisc - set ldisc field * @tty: tty structure * @disc: line discipline number * * This is probably overkill for real world processors but they are not on hot * paths so a little discipline won't do any harm. * * The line discipline-related tty_struct fields are reset to prevent the ldisc * driver from re-using stale information for the new ldisc instance. * * Locking: takes termios_rwsem */ static void tty_set_termios_ldisc(struct tty_struct *tty, int disc) { down_write(&tty->termios_rwsem); tty->termios.c_line = disc; up_write(&tty->termios_rwsem); tty->disc_data = NULL; tty->receive_room = 0; } /** * tty_ldisc_open - open a line discipline * @tty: tty we are opening the ldisc on * @ld: discipline to open * * A helper opening method. Also a convenient debugging and check point. * * Locking: always called with BTM already held. */ static int tty_ldisc_open(struct tty_struct *tty, struct tty_ldisc *ld) { WARN_ON(test_and_set_bit(TTY_LDISC_OPEN, &tty->flags)); if (ld->ops->open) { int ret; /* BTM here locks versus a hangup event */ ret = ld->ops->open(tty); if (ret) clear_bit(TTY_LDISC_OPEN, &tty->flags); tty_ldisc_debug(tty, "%p: opened\n", ld); return ret; } return 0; } /** * tty_ldisc_close - close a line discipline * @tty: tty we are opening the ldisc on * @ld: discipline to close * * A helper close method. Also a convenient debugging and check point. */ static void tty_ldisc_close(struct tty_struct *tty, struct tty_ldisc *ld) { lockdep_assert_held_write(&tty->ldisc_sem); WARN_ON(!test_bit(TTY_LDISC_OPEN, &tty->flags)); clear_bit(TTY_LDISC_OPEN, &tty->flags); if (ld->ops->close) ld->ops->close(tty); tty_ldisc_debug(tty, "%p: closed\n", ld); } /** * tty_ldisc_failto - helper for ldisc failback * @tty: tty to open the ldisc on * @ld: ldisc we are trying to fail back to * * Helper to try and recover a tty when switching back to the old ldisc fails * and we need something attached. */ static int tty_ldisc_failto(struct tty_struct *tty, int ld) { struct tty_ldisc *disc = tty_ldisc_get(tty, ld); int r; lockdep_assert_held_write(&tty->ldisc_sem); if (IS_ERR(disc)) return PTR_ERR(disc); tty->ldisc = disc; tty_set_termios_ldisc(tty, ld); r = tty_ldisc_open(tty, disc); if (r < 0) tty_ldisc_put(disc); return r; } /** * tty_ldisc_restore - helper for tty ldisc change * @tty: tty to recover * @old: previous ldisc * * Restore the previous line discipline or %N_TTY when a line discipline change * fails due to an open error */ static void tty_ldisc_restore(struct tty_struct *tty, struct tty_ldisc *old) { /* There is an outstanding reference here so this is safe */ if (tty_ldisc_failto(tty, old->ops->num) < 0) { const char *name = tty_name(tty); pr_warn("Falling back ldisc for %s.\n", name); /* * The traditional behaviour is to fall back to N_TTY, we * want to avoid falling back to N_NULL unless we have no * choice to avoid the risk of breaking anything */ if (tty_ldisc_failto(tty, N_TTY) < 0 && tty_ldisc_failto(tty, N_NULL) < 0) panic("Couldn't open N_NULL ldisc for %s.", name); } } /** * tty_set_ldisc - set line discipline * @tty: the terminal to set * @disc: the line discipline number * * Set the discipline of a tty line. Must be called from a process context. The * ldisc change logic has to protect itself against any overlapping ldisc * change (including on the other end of pty pairs), the close of one side of a * tty/pty pair, and eventually hangup. */ int tty_set_ldisc(struct tty_struct *tty, int disc) { int retval; struct tty_ldisc *old_ldisc, *new_ldisc; new_ldisc = tty_ldisc_get(tty, disc); if (IS_ERR(new_ldisc)) return PTR_ERR(new_ldisc); tty_lock(tty); retval = tty_ldisc_lock(tty, 5 * HZ); if (retval) goto err; if (!tty->ldisc) { retval = -EIO; goto out; } /* Check the no-op case */ if (tty->ldisc->ops->num == disc) goto out; if (test_bit(TTY_HUPPED, &tty->flags)) { /* We were raced by hangup */ retval = -EIO; goto out; } if (tty->ops->ldisc_ok) { retval = tty->ops->ldisc_ok(tty, disc); if (retval) goto out; } old_ldisc = tty->ldisc; /* Shutdown the old discipline. */ tty_ldisc_close(tty, old_ldisc); /* Now set up the new line discipline. */ tty->ldisc = new_ldisc; tty_set_termios_ldisc(tty, disc); retval = tty_ldisc_open(tty, new_ldisc); if (retval < 0) { /* Back to the old one or N_TTY if we can't */ tty_ldisc_put(new_ldisc); tty_ldisc_restore(tty, old_ldisc); } if (tty->ldisc->ops->num != old_ldisc->ops->num && tty->ops->set_ldisc) { down_read(&tty->termios_rwsem); tty->ops->set_ldisc(tty); up_read(&tty->termios_rwsem); } /* * At this point we hold a reference to the new ldisc and a * reference to the old ldisc, or we hold two references to * the old ldisc (if it was restored as part of error cleanup * above). In either case, releasing a single reference from * the old ldisc is correct. */ new_ldisc = old_ldisc; out: tty_ldisc_unlock(tty); /* * Restart the work queue in case no characters kick it off. Safe if * already running */ tty_buffer_restart_work(tty->port); err: tty_ldisc_put(new_ldisc); /* drop the extra reference */ tty_unlock(tty); return retval; } EXPORT_SYMBOL_GPL(tty_set_ldisc); /** * tty_ldisc_kill - teardown ldisc * @tty: tty being released * * Perform final close of the ldisc and reset @tty->ldisc */ static void tty_ldisc_kill(struct tty_struct *tty) { lockdep_assert_held_write(&tty->ldisc_sem); if (!tty->ldisc) return; /* * Now kill off the ldisc */ tty_ldisc_close(tty, tty->ldisc); tty_ldisc_put(tty->ldisc); /* Force an oops if we mess this up */ tty->ldisc = NULL; } /** * tty_reset_termios - reset terminal state * @tty: tty to reset * * Restore a terminal to the driver default state. */ static void tty_reset_termios(struct tty_struct *tty) { down_write(&tty->termios_rwsem); tty->termios = tty->driver->init_termios; tty->termios.c_ispeed = tty_termios_input_baud_rate(&tty->termios); tty->termios.c_ospeed = tty_termios_baud_rate(&tty->termios); up_write(&tty->termios_rwsem); } /** * tty_ldisc_reinit - reinitialise the tty ldisc * @tty: tty to reinit * @disc: line discipline to reinitialize * * Completely reinitialize the line discipline state, by closing the current * instance, if there is one, and opening a new instance. If an error occurs * opening the new non-%N_TTY instance, the instance is dropped and @tty->ldisc * reset to %NULL. The caller can then retry with %N_TTY instead. * * Returns: 0 if successful, otherwise error code < 0 */ int tty_ldisc_reinit(struct tty_struct *tty, int disc) { struct tty_ldisc *ld; int retval; lockdep_assert_held_write(&tty->ldisc_sem); ld = tty_ldisc_get(tty, disc); if (IS_ERR(ld)) { BUG_ON(disc == N_TTY); return PTR_ERR(ld); } if (tty->ldisc) { tty_ldisc_close(tty, tty->ldisc); tty_ldisc_put(tty->ldisc); } /* switch the line discipline */ tty->ldisc = ld; tty_set_termios_ldisc(tty, disc); retval = tty_ldisc_open(tty, tty->ldisc); if (retval) { tty_ldisc_put(tty->ldisc); tty->ldisc = NULL; } return retval; } /** * tty_ldisc_hangup - hangup ldisc reset * @tty: tty being hung up * @reinit: whether to re-initialise the tty * * Some tty devices reset their termios when they receive a hangup event. In * that situation we must also switch back to %N_TTY properly before we reset * the termios data. * * Locking: We can take the ldisc mutex as the rest of the code is careful to * allow for this. * * In the pty pair case this occurs in the close() path of the tty itself so we * must be careful about locking rules. */ void tty_ldisc_hangup(struct tty_struct *tty, bool reinit) { struct tty_ldisc *ld; tty_ldisc_debug(tty, "%p: hangup\n", tty->ldisc); ld = tty_ldisc_ref(tty); if (ld != NULL) { if (ld->ops->flush_buffer) ld->ops->flush_buffer(tty); tty_driver_flush_buffer(tty); if ((test_bit(TTY_DO_WRITE_WAKEUP, &tty->flags)) && ld->ops->write_wakeup) ld->ops->write_wakeup(tty); if (ld->ops->hangup) ld->ops->hangup(tty); tty_ldisc_deref(ld); } wake_up_interruptible_poll(&tty->write_wait, EPOLLOUT); wake_up_interruptible_poll(&tty->read_wait, EPOLLIN); /* * Shutdown the current line discipline, and reset it to * N_TTY if need be. * * Avoid racing set_ldisc or tty_ldisc_release */ tty_ldisc_lock(tty, MAX_SCHEDULE_TIMEOUT); if (tty->driver->flags & TTY_DRIVER_RESET_TERMIOS) tty_reset_termios(tty); if (tty->ldisc) { if (reinit) { if (tty_ldisc_reinit(tty, tty->termios.c_line) < 0 && tty_ldisc_reinit(tty, N_TTY) < 0) WARN_ON(tty_ldisc_reinit(tty, N_NULL) < 0); } else tty_ldisc_kill(tty); } tty_ldisc_unlock(tty); } /** * tty_ldisc_setup - open line discipline * @tty: tty being shut down * @o_tty: pair tty for pty/tty pairs * * Called during the initial open of a tty/pty pair in order to set up the line * disciplines and bind them to the @tty. This has no locking issues as the * device isn't yet active. */ int tty_ldisc_setup(struct tty_struct *tty, struct tty_struct *o_tty) { int retval = tty_ldisc_open(tty, tty->ldisc); if (retval) return retval; if (o_tty) { /* * Called without o_tty->ldisc_sem held, as o_tty has been * just allocated and no one has a reference to it. */ retval = tty_ldisc_open(o_tty, o_tty->ldisc); if (retval) { tty_ldisc_close(tty, tty->ldisc); return retval; } } return 0; } /** * tty_ldisc_release - release line discipline * @tty: tty being shut down (or one end of pty pair) * * Called during the final close of a tty or a pty pair in order to shut down * the line discpline layer. On exit, each tty's ldisc is %NULL. */ void tty_ldisc_release(struct tty_struct *tty) { struct tty_struct *o_tty = tty->link; /* * Shutdown this line discipline. As this is the final close, * it does not race with the set_ldisc code path. */ tty_ldisc_lock_pair(tty, o_tty); tty_ldisc_kill(tty); if (o_tty) tty_ldisc_kill(o_tty); tty_ldisc_unlock_pair(tty, o_tty); /* * And the memory resources remaining (buffers, termios) will be * disposed of when the kref hits zero */ tty_ldisc_debug(tty, "released\n"); } /** * tty_ldisc_init - ldisc setup for new tty * @tty: tty being allocated * * Set up the line discipline objects for a newly allocated tty. Note that the * tty structure is not completely set up when this call is made. */ int tty_ldisc_init(struct tty_struct *tty) { struct tty_ldisc *ld = tty_ldisc_get(tty, N_TTY); if (IS_ERR(ld)) return PTR_ERR(ld); tty->ldisc = ld; return 0; } /** * tty_ldisc_deinit - ldisc cleanup for new tty * @tty: tty that was allocated recently * * The tty structure must not be completely set up (tty_ldisc_setup()) when * this call is made. */ void tty_ldisc_deinit(struct tty_struct *tty) { /* no ldisc_sem, tty is being destroyed */ if (tty->ldisc) tty_ldisc_put(tty->ldisc); tty->ldisc = NULL; }
8 9 9 9 9 9 9 9 8 9 8 8 8 8 6 9 9 8 6 6 6 6 9 8 8 8 9 9 9 9 27 27 27 27 2 27 26 26 27 27 26 27 27 27 27 27 27 27 19 19 10 10 19 27 26 26 26 27 27 27 25 1 1 1 27 27 27 27 26 26 27 27 27 27 26 26 26 26 26 26 27 27 27 26 27 27 27 27 27 27 27 27 27 27 27 27 27 26 27 27 27 27 27 3 2 3 27 71 71 71 71 71 71 71 71 99 9 9 9 100 9 9 9 9 9 9 9 9 8 6 9 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 // SPDX-License-Identifier: GPL-2.0 /* * security/tomoyo/condition.c * * Copyright (C) 2005-2011 NTT DATA CORPORATION */ #include "common.h" #include <linux/slab.h> /* List of "struct tomoyo_condition". */ LIST_HEAD(tomoyo_condition_list); /** * tomoyo_argv - Check argv[] in "struct linux_binbrm". * * @index: Index number of @arg_ptr. * @arg_ptr: Contents of argv[@index]. * @argc: Length of @argv. * @argv: Pointer to "struct tomoyo_argv". * @checked: Set to true if @argv[@index] was found. * * Returns true on success, false otherwise. */ static bool tomoyo_argv(const unsigned int index, const char *arg_ptr, const int argc, const struct tomoyo_argv *argv, u8 *checked) { int i; struct tomoyo_path_info arg; arg.name = arg_ptr; for (i = 0; i < argc; argv++, checked++, i++) { bool result; if (index != argv->index) continue; *checked = 1; tomoyo_fill_path_info(&arg); result = tomoyo_path_matches_pattern(&arg, argv->value); if (argv->is_not) result = !result; if (!result) return false; } return true; } /** * tomoyo_envp - Check envp[] in "struct linux_binbrm". * * @env_name: The name of environment variable. * @env_value: The value of environment variable. * @envc: Length of @envp. * @envp: Pointer to "struct tomoyo_envp". * @checked: Set to true if @envp[@env_name] was found. * * Returns true on success, false otherwise. */ static bool tomoyo_envp(const char *env_name, const char *env_value, const int envc, const struct tomoyo_envp *envp, u8 *checked) { int i; struct tomoyo_path_info name; struct tomoyo_path_info value; name.name = env_name; tomoyo_fill_path_info(&name); value.name = env_value; tomoyo_fill_path_info(&value); for (i = 0; i < envc; envp++, checked++, i++) { bool result; if (!tomoyo_path_matches_pattern(&name, envp->name)) continue; *checked = 1; if (envp->value) { result = tomoyo_path_matches_pattern(&value, envp->value); if (envp->is_not) result = !result; } else { result = true; if (!envp->is_not) result = !result; } if (!result) return false; } return true; } /** * tomoyo_scan_bprm - Scan "struct linux_binprm". * * @ee: Pointer to "struct tomoyo_execve". * @argc: Length of @argc. * @argv: Pointer to "struct tomoyo_argv". * @envc: Length of @envp. * @envp: Pointer to "struct tomoyo_envp". * * Returns true on success, false otherwise. */ static bool tomoyo_scan_bprm(struct tomoyo_execve *ee, const u16 argc, const struct tomoyo_argv *argv, const u16 envc, const struct tomoyo_envp *envp) { struct linux_binprm *bprm = ee->bprm; struct tomoyo_page_dump *dump = &ee->dump; char *arg_ptr = ee->tmp; int arg_len = 0; unsigned long pos = bprm->p; int offset = pos % PAGE_SIZE; int argv_count = bprm->argc; int envp_count = bprm->envc; bool result = true; u8 local_checked[32]; u8 *checked; if (argc + envc <= sizeof(local_checked)) { checked = local_checked; memset(local_checked, 0, sizeof(local_checked)); } else { checked = kzalloc(argc + envc, GFP_NOFS); if (!checked) return false; } while (argv_count || envp_count) { if (!tomoyo_dump_page(bprm, pos, dump)) { result = false; goto out; } pos += PAGE_SIZE - offset; while (offset < PAGE_SIZE) { /* Read. */ const char *kaddr = dump->data; const unsigned char c = kaddr[offset++]; if (c && arg_len < TOMOYO_EXEC_TMPSIZE - 10) { if (c == '\\') { arg_ptr[arg_len++] = '\\'; arg_ptr[arg_len++] = '\\'; } else if (c > ' ' && c < 127) { arg_ptr[arg_len++] = c; } else { arg_ptr[arg_len++] = '\\'; arg_ptr[arg_len++] = (c >> 6) + '0'; arg_ptr[arg_len++] = ((c >> 3) & 7) + '0'; arg_ptr[arg_len++] = (c & 7) + '0'; } } else { arg_ptr[arg_len] = '\0'; } if (c) continue; /* Check. */ if (argv_count) { if (!tomoyo_argv(bprm->argc - argv_count, arg_ptr, argc, argv, checked)) { result = false; break; } argv_count--; } else if (envp_count) { char *cp = strchr(arg_ptr, '='); if (cp) { *cp = '\0'; if (!tomoyo_envp(arg_ptr, cp + 1, envc, envp, checked + argc)) { result = false; break; } } envp_count--; } else { break; } arg_len = 0; } offset = 0; if (!result) break; } out: if (result) { int i; /* Check not-yet-checked entries. */ for (i = 0; i < argc; i++) { if (checked[i]) continue; /* * Return true only if all unchecked indexes in * bprm->argv[] are not matched. */ if (argv[i].is_not) continue; result = false; break; } for (i = 0; i < envc; envp++, i++) { if (checked[argc + i]) continue; /* * Return true only if all unchecked environ variables * in bprm->envp[] are either undefined or not matched. */ if ((!envp->value && !envp->is_not) || (envp->value && envp->is_not)) continue; result = false; break; } } if (checked != local_checked) kfree(checked); return result; } /** * tomoyo_scan_exec_realpath - Check "exec.realpath" parameter of "struct tomoyo_condition". * * @file: Pointer to "struct file". * @ptr: Pointer to "struct tomoyo_name_union". * @match: True if "exec.realpath=", false if "exec.realpath!=". * * Returns true on success, false otherwise. */ static bool tomoyo_scan_exec_realpath(struct file *file, const struct tomoyo_name_union *ptr, const bool match) { bool result; struct tomoyo_path_info exe; if (!file) return false; exe.name = tomoyo_realpath_from_path(&file->f_path); if (!exe.name) return false; tomoyo_fill_path_info(&exe); result = tomoyo_compare_name_union(&exe, ptr); kfree(exe.name); return result == match; } /** * tomoyo_get_dqword - tomoyo_get_name() for a quoted string. * * @start: String to save. * * Returns pointer to "struct tomoyo_path_info" on success, NULL otherwise. */ static const struct tomoyo_path_info *tomoyo_get_dqword(char *start) { char *cp = start + strlen(start) - 1; if (cp == start || *start++ != '"' || *cp != '"') return NULL; *cp = '\0'; if (*start && !tomoyo_correct_word(start)) return NULL; return tomoyo_get_name(start); } /** * tomoyo_parse_name_union_quoted - Parse a quoted word. * * @param: Pointer to "struct tomoyo_acl_param". * @ptr: Pointer to "struct tomoyo_name_union". * * Returns true on success, false otherwise. */ static bool tomoyo_parse_name_union_quoted(struct tomoyo_acl_param *param, struct tomoyo_name_union *ptr) { char *filename = param->data; if (*filename == '@') return tomoyo_parse_name_union(param, ptr); ptr->filename = tomoyo_get_dqword(filename); return ptr->filename != NULL; } /** * tomoyo_parse_argv - Parse an argv[] condition part. * * @left: Lefthand value. * @right: Righthand value. * @argv: Pointer to "struct tomoyo_argv". * * Returns true on success, false otherwise. */ static bool tomoyo_parse_argv(char *left, char *right, struct tomoyo_argv *argv) { if (tomoyo_parse_ulong(&argv->index, &left) != TOMOYO_VALUE_TYPE_DECIMAL || *left++ != ']' || *left) return false; argv->value = tomoyo_get_dqword(right); return argv->value != NULL; } /** * tomoyo_parse_envp - Parse an envp[] condition part. * * @left: Lefthand value. * @right: Righthand value. * @envp: Pointer to "struct tomoyo_envp". * * Returns true on success, false otherwise. */ static bool tomoyo_parse_envp(char *left, char *right, struct tomoyo_envp *envp) { const struct tomoyo_path_info *name; const struct tomoyo_path_info *value; char *cp = left + strlen(left) - 1; if (*cp-- != ']' || *cp != '"') goto out; *cp = '\0'; if (!tomoyo_correct_word(left)) goto out; name = tomoyo_get_name(left); if (!name) goto out; if (!strcmp(right, "NULL")) { value = NULL; } else { value = tomoyo_get_dqword(right); if (!value) { tomoyo_put_name(name); goto out; } } envp->name = name; envp->value = value; return true; out: return false; } /** * tomoyo_same_condition - Check for duplicated "struct tomoyo_condition" entry. * * @a: Pointer to "struct tomoyo_condition". * @b: Pointer to "struct tomoyo_condition". * * Returns true if @a == @b, false otherwise. */ static inline bool tomoyo_same_condition(const struct tomoyo_condition *a, const struct tomoyo_condition *b) { return a->size == b->size && a->condc == b->condc && a->numbers_count == b->numbers_count && a->names_count == b->names_count && a->argc == b->argc && a->envc == b->envc && a->grant_log == b->grant_log && a->transit == b->transit && !memcmp(a + 1, b + 1, a->size - sizeof(*a)); } /** * tomoyo_condition_type - Get condition type. * * @word: Keyword string. * * Returns one of values in "enum tomoyo_conditions_index" on success, * TOMOYO_MAX_CONDITION_KEYWORD otherwise. */ static u8 tomoyo_condition_type(const char *word) { u8 i; for (i = 0; i < TOMOYO_MAX_CONDITION_KEYWORD; i++) { if (!strcmp(word, tomoyo_condition_keyword[i])) break; } return i; } /* Define this to enable debug mode. */ /* #define DEBUG_CONDITION */ #ifdef DEBUG_CONDITION #define dprintk printk #else #define dprintk(...) do { } while (0) #endif /** * tomoyo_commit_condition - Commit "struct tomoyo_condition". * * @entry: Pointer to "struct tomoyo_condition". * * Returns pointer to "struct tomoyo_condition" on success, NULL otherwise. * * This function merges duplicated entries. This function returns NULL if * @entry is not duplicated but memory quota for policy has exceeded. */ static struct tomoyo_condition *tomoyo_commit_condition (struct tomoyo_condition *entry) { struct tomoyo_condition *ptr; bool found = false; if (mutex_lock_interruptible(&tomoyo_policy_lock)) { dprintk(KERN_WARNING "%u: %s failed\n", __LINE__, __func__); ptr = NULL; found = true; goto out; } list_for_each_entry(ptr, &tomoyo_condition_list, head.list) { if (!tomoyo_same_condition(ptr, entry) || atomic_read(&ptr->head.users) == TOMOYO_GC_IN_PROGRESS) continue; /* Same entry found. Share this entry. */ atomic_inc(&ptr->head.users); found = true; break; } if (!found) { if (tomoyo_memory_ok(entry)) { atomic_set(&entry->head.users, 1); list_add(&entry->head.list, &tomoyo_condition_list); } else { found = true; ptr = NULL; } } mutex_unlock(&tomoyo_policy_lock); out: if (found) { tomoyo_del_condition(&entry->head.list); kfree(entry); entry = ptr; } return entry; } /** * tomoyo_get_transit_preference - Parse domain transition preference for execve(). * * @param: Pointer to "struct tomoyo_acl_param". * @e: Pointer to "struct tomoyo_condition". * * Returns the condition string part. */ static char *tomoyo_get_transit_preference(struct tomoyo_acl_param *param, struct tomoyo_condition *e) { char * const pos = param->data; bool flag; if (*pos == '<') { e->transit = tomoyo_get_domainname(param); goto done; } { char *cp = strchr(pos, ' '); if (cp) *cp = '\0'; flag = tomoyo_correct_path(pos) || !strcmp(pos, "keep") || !strcmp(pos, "initialize") || !strcmp(pos, "reset") || !strcmp(pos, "child") || !strcmp(pos, "parent"); if (cp) *cp = ' '; } if (!flag) return pos; e->transit = tomoyo_get_name(tomoyo_read_token(param)); done: if (e->transit) return param->data; /* * Return a bad read-only condition string that will let * tomoyo_get_condition() return NULL. */ return "/"; } /** * tomoyo_get_condition - Parse condition part. * * @param: Pointer to "struct tomoyo_acl_param". * * Returns pointer to "struct tomoyo_condition" on success, NULL otherwise. */ struct tomoyo_condition *tomoyo_get_condition(struct tomoyo_acl_param *param) { struct tomoyo_condition *entry = NULL; struct tomoyo_condition_element *condp = NULL; struct tomoyo_number_union *numbers_p = NULL; struct tomoyo_name_union *names_p = NULL; struct tomoyo_argv *argv = NULL; struct tomoyo_envp *envp = NULL; struct tomoyo_condition e = { }; char * const start_of_string = tomoyo_get_transit_preference(param, &e); char * const end_of_string = start_of_string + strlen(start_of_string); char *pos; rerun: pos = start_of_string; while (1) { u8 left = -1; u8 right = -1; char *left_word = pos; char *cp; char *right_word; bool is_not; if (!*left_word) break; /* * Since left-hand condition does not allow use of "path_group" * or "number_group" and environment variable's names do not * accept '=', it is guaranteed that the original line consists * of one or more repetition of $left$operator$right blocks * where "$left is free from '=' and ' '" and "$operator is * either '=' or '!='" and "$right is free from ' '". * Therefore, we can reconstruct the original line at the end * of dry run even if we overwrite $operator with '\0'. */ cp = strchr(pos, ' '); if (cp) { *cp = '\0'; /* Will restore later. */ pos = cp + 1; } else { pos = ""; } right_word = strchr(left_word, '='); if (!right_word || right_word == left_word) goto out; is_not = *(right_word - 1) == '!'; if (is_not) *(right_word++ - 1) = '\0'; /* Will restore later. */ else if (*(right_word + 1) != '=') *right_word++ = '\0'; /* Will restore later. */ else goto out; dprintk(KERN_WARNING "%u: <%s>%s=<%s>\n", __LINE__, left_word, is_not ? "!" : "", right_word); if (!strcmp(left_word, "grant_log")) { if (entry) { if (is_not || entry->grant_log != TOMOYO_GRANTLOG_AUTO) goto out; else if (!strcmp(right_word, "yes")) entry->grant_log = TOMOYO_GRANTLOG_YES; else if (!strcmp(right_word, "no")) entry->grant_log = TOMOYO_GRANTLOG_NO; else goto out; } continue; } if (!strncmp(left_word, "exec.argv[", 10)) { if (!argv) { e.argc++; e.condc++; } else { e.argc--; e.condc--; left = TOMOYO_ARGV_ENTRY; argv->is_not = is_not; if (!tomoyo_parse_argv(left_word + 10, right_word, argv++)) goto out; } goto store_value; } if (!strncmp(left_word, "exec.envp[\"", 11)) { if (!envp) { e.envc++; e.condc++; } else { e.envc--; e.condc--; left = TOMOYO_ENVP_ENTRY; envp->is_not = is_not; if (!tomoyo_parse_envp(left_word + 11, right_word, envp++)) goto out; } goto store_value; } left = tomoyo_condition_type(left_word); dprintk(KERN_WARNING "%u: <%s> left=%u\n", __LINE__, left_word, left); if (left == TOMOYO_MAX_CONDITION_KEYWORD) { if (!numbers_p) { e.numbers_count++; } else { e.numbers_count--; left = TOMOYO_NUMBER_UNION; param->data = left_word; if (*left_word == '@' || !tomoyo_parse_number_union(param, numbers_p++)) goto out; } } if (!condp) e.condc++; else e.condc--; if (left == TOMOYO_EXEC_REALPATH || left == TOMOYO_SYMLINK_TARGET) { if (!names_p) { e.names_count++; } else { e.names_count--; right = TOMOYO_NAME_UNION; param->data = right_word; if (!tomoyo_parse_name_union_quoted(param, names_p++)) goto out; } goto store_value; } right = tomoyo_condition_type(right_word); if (right == TOMOYO_MAX_CONDITION_KEYWORD) { if (!numbers_p) { e.numbers_count++; } else { e.numbers_count--; right = TOMOYO_NUMBER_UNION; param->data = right_word; if (!tomoyo_parse_number_union(param, numbers_p++)) goto out; } } store_value: if (!condp) { dprintk(KERN_WARNING "%u: dry_run left=%u right=%u match=%u\n", __LINE__, left, right, !is_not); continue; } condp->left = left; condp->right = right; condp->equals = !is_not; dprintk(KERN_WARNING "%u: left=%u right=%u match=%u\n", __LINE__, condp->left, condp->right, condp->equals); condp++; } dprintk(KERN_INFO "%u: cond=%u numbers=%u names=%u ac=%u ec=%u\n", __LINE__, e.condc, e.numbers_count, e.names_count, e.argc, e.envc); if (entry) { BUG_ON(e.names_count | e.numbers_count | e.argc | e.envc | e.condc); return tomoyo_commit_condition(entry); } e.size = sizeof(*entry) + e.condc * sizeof(struct tomoyo_condition_element) + e.numbers_count * sizeof(struct tomoyo_number_union) + e.names_count * sizeof(struct tomoyo_name_union) + e.argc * sizeof(struct tomoyo_argv) + e.envc * sizeof(struct tomoyo_envp); entry = kzalloc(e.size, GFP_NOFS); if (!entry) goto out2; *entry = e; e.transit = NULL; condp = (struct tomoyo_condition_element *) (entry + 1); numbers_p = (struct tomoyo_number_union *) (condp + e.condc); names_p = (struct tomoyo_name_union *) (numbers_p + e.numbers_count); argv = (struct tomoyo_argv *) (names_p + e.names_count); envp = (struct tomoyo_envp *) (argv + e.argc); { bool flag = false; for (pos = start_of_string; pos < end_of_string; pos++) { if (*pos) continue; if (flag) /* Restore " ". */ *pos = ' '; else if (*(pos + 1) == '=') /* Restore "!=". */ *pos = '!'; else /* Restore "=". */ *pos = '='; flag = !flag; } } goto rerun; out: dprintk(KERN_WARNING "%u: %s failed\n", __LINE__, __func__); if (entry) { tomoyo_del_condition(&entry->head.list); kfree(entry); } out2: tomoyo_put_name(e.transit); return NULL; } /** * tomoyo_get_attributes - Revalidate "struct inode". * * @obj: Pointer to "struct tomoyo_obj_info". * * Returns nothing. */ void tomoyo_get_attributes(struct tomoyo_obj_info *obj) { u8 i; struct dentry *dentry = NULL; for (i = 0; i < TOMOYO_MAX_PATH_STAT; i++) { struct inode *inode; switch (i) { case TOMOYO_PATH1: dentry = obj->path1.dentry; if (!dentry) continue; break; case TOMOYO_PATH2: dentry = obj->path2.dentry; if (!dentry) continue; break; default: if (!dentry) continue; dentry = dget_parent(dentry); break; } inode = d_backing_inode(dentry); if (inode) { struct tomoyo_mini_stat *stat = &obj->stat[i]; stat->uid = inode->i_uid; stat->gid = inode->i_gid; stat->ino = inode->i_ino; stat->mode = inode->i_mode; stat->dev = inode->i_sb->s_dev; stat->rdev = inode->i_rdev; obj->stat_valid[i] = true; } if (i & 1) /* TOMOYO_PATH1_PARENT or TOMOYO_PATH2_PARENT */ dput(dentry); } } /** * tomoyo_condition - Check condition part. * * @r: Pointer to "struct tomoyo_request_info". * @cond: Pointer to "struct tomoyo_condition". Maybe NULL. * * Returns true on success, false otherwise. * * Caller holds tomoyo_read_lock(). */ bool tomoyo_condition(struct tomoyo_request_info *r, const struct tomoyo_condition *cond) { u32 i; unsigned long min_v[2] = { 0, 0 }; unsigned long max_v[2] = { 0, 0 }; const struct tomoyo_condition_element *condp; const struct tomoyo_number_union *numbers_p; const struct tomoyo_name_union *names_p; const struct tomoyo_argv *argv; const struct tomoyo_envp *envp; struct tomoyo_obj_info *obj; u16 condc; u16 argc; u16 envc; struct linux_binprm *bprm = NULL; if (!cond) return true; condc = cond->condc; argc = cond->argc; envc = cond->envc; obj = r->obj; if (r->ee) bprm = r->ee->bprm; if (!bprm && (argc || envc)) return false; condp = (struct tomoyo_condition_element *) (cond + 1); numbers_p = (const struct tomoyo_number_union *) (condp + condc); names_p = (const struct tomoyo_name_union *) (numbers_p + cond->numbers_count); argv = (const struct tomoyo_argv *) (names_p + cond->names_count); envp = (const struct tomoyo_envp *) (argv + argc); for (i = 0; i < condc; i++) { const bool match = condp->equals; const u8 left = condp->left; const u8 right = condp->right; bool is_bitop[2] = { false, false }; u8 j; condp++; /* Check argv[] and envp[] later. */ if (left == TOMOYO_ARGV_ENTRY || left == TOMOYO_ENVP_ENTRY) continue; /* Check string expressions. */ if (right == TOMOYO_NAME_UNION) { const struct tomoyo_name_union *ptr = names_p++; struct tomoyo_path_info *symlink; struct tomoyo_execve *ee; struct file *file; switch (left) { case TOMOYO_SYMLINK_TARGET: symlink = obj ? obj->symlink_target : NULL; if (!symlink || !tomoyo_compare_name_union(symlink, ptr) == match) goto out; break; case TOMOYO_EXEC_REALPATH: ee = r->ee; file = ee ? ee->bprm->file : NULL; if (!tomoyo_scan_exec_realpath(file, ptr, match)) goto out; break; } continue; } /* Check numeric or bit-op expressions. */ for (j = 0; j < 2; j++) { const u8 index = j ? right : left; unsigned long value = 0; switch (index) { case TOMOYO_TASK_UID: value = from_kuid(&init_user_ns, current_uid()); break; case TOMOYO_TASK_EUID: value = from_kuid(&init_user_ns, current_euid()); break; case TOMOYO_TASK_SUID: value = from_kuid(&init_user_ns, current_suid()); break; case TOMOYO_TASK_FSUID: value = from_kuid(&init_user_ns, current_fsuid()); break; case TOMOYO_TASK_GID: value = from_kgid(&init_user_ns, current_gid()); break; case TOMOYO_TASK_EGID: value = from_kgid(&init_user_ns, current_egid()); break; case TOMOYO_TASK_SGID: value = from_kgid(&init_user_ns, current_sgid()); break; case TOMOYO_TASK_FSGID: value = from_kgid(&init_user_ns, current_fsgid()); break; case TOMOYO_TASK_PID: value = tomoyo_sys_getpid(); break; case TOMOYO_TASK_PPID: value = tomoyo_sys_getppid(); break; case TOMOYO_TYPE_IS_SOCKET: value = S_IFSOCK; break; case TOMOYO_TYPE_IS_SYMLINK: value = S_IFLNK; break; case TOMOYO_TYPE_IS_FILE: value = S_IFREG; break; case TOMOYO_TYPE_IS_BLOCK_DEV: value = S_IFBLK; break; case TOMOYO_TYPE_IS_DIRECTORY: value = S_IFDIR; break; case TOMOYO_TYPE_IS_CHAR_DEV: value = S_IFCHR; break; case TOMOYO_TYPE_IS_FIFO: value = S_IFIFO; break; case TOMOYO_MODE_SETUID: value = S_ISUID; break; case TOMOYO_MODE_SETGID: value = S_ISGID; break; case TOMOYO_MODE_STICKY: value = S_ISVTX; break; case TOMOYO_MODE_OWNER_READ: value = 0400; break; case TOMOYO_MODE_OWNER_WRITE: value = 0200; break; case TOMOYO_MODE_OWNER_EXECUTE: value = 0100; break; case TOMOYO_MODE_GROUP_READ: value = 0040; break; case TOMOYO_MODE_GROUP_WRITE: value = 0020; break; case TOMOYO_MODE_GROUP_EXECUTE: value = 0010; break; case TOMOYO_MODE_OTHERS_READ: value = 0004; break; case TOMOYO_MODE_OTHERS_WRITE: value = 0002; break; case TOMOYO_MODE_OTHERS_EXECUTE: value = 0001; break; case TOMOYO_EXEC_ARGC: if (!bprm) goto out; value = bprm->argc; break; case TOMOYO_EXEC_ENVC: if (!bprm) goto out; value = bprm->envc; break; case TOMOYO_NUMBER_UNION: /* Fetch values later. */ break; default: if (!obj) goto out; if (!obj->validate_done) { tomoyo_get_attributes(obj); obj->validate_done = true; } { u8 stat_index; struct tomoyo_mini_stat *stat; switch (index) { case TOMOYO_PATH1_UID: case TOMOYO_PATH1_GID: case TOMOYO_PATH1_INO: case TOMOYO_PATH1_MAJOR: case TOMOYO_PATH1_MINOR: case TOMOYO_PATH1_TYPE: case TOMOYO_PATH1_DEV_MAJOR: case TOMOYO_PATH1_DEV_MINOR: case TOMOYO_PATH1_PERM: stat_index = TOMOYO_PATH1; break; case TOMOYO_PATH2_UID: case TOMOYO_PATH2_GID: case TOMOYO_PATH2_INO: case TOMOYO_PATH2_MAJOR: case TOMOYO_PATH2_MINOR: case TOMOYO_PATH2_TYPE: case TOMOYO_PATH2_DEV_MAJOR: case TOMOYO_PATH2_DEV_MINOR: case TOMOYO_PATH2_PERM: stat_index = TOMOYO_PATH2; break; case TOMOYO_PATH1_PARENT_UID: case TOMOYO_PATH1_PARENT_GID: case TOMOYO_PATH1_PARENT_INO: case TOMOYO_PATH1_PARENT_PERM: stat_index = TOMOYO_PATH1_PARENT; break; case TOMOYO_PATH2_PARENT_UID: case TOMOYO_PATH2_PARENT_GID: case TOMOYO_PATH2_PARENT_INO: case TOMOYO_PATH2_PARENT_PERM: stat_index = TOMOYO_PATH2_PARENT; break; default: goto out; } if (!obj->stat_valid[stat_index]) goto out; stat = &obj->stat[stat_index]; switch (index) { case TOMOYO_PATH1_UID: case TOMOYO_PATH2_UID: case TOMOYO_PATH1_PARENT_UID: case TOMOYO_PATH2_PARENT_UID: value = from_kuid(&init_user_ns, stat->uid); break; case TOMOYO_PATH1_GID: case TOMOYO_PATH2_GID: case TOMOYO_PATH1_PARENT_GID: case TOMOYO_PATH2_PARENT_GID: value = from_kgid(&init_user_ns, stat->gid); break; case TOMOYO_PATH1_INO: case TOMOYO_PATH2_INO: case TOMOYO_PATH1_PARENT_INO: case TOMOYO_PATH2_PARENT_INO: value = stat->ino; break; case TOMOYO_PATH1_MAJOR: case TOMOYO_PATH2_MAJOR: value = MAJOR(stat->dev); break; case TOMOYO_PATH1_MINOR: case TOMOYO_PATH2_MINOR: value = MINOR(stat->dev); break; case TOMOYO_PATH1_TYPE: case TOMOYO_PATH2_TYPE: value = stat->mode & S_IFMT; break; case TOMOYO_PATH1_DEV_MAJOR: case TOMOYO_PATH2_DEV_MAJOR: value = MAJOR(stat->rdev); break; case TOMOYO_PATH1_DEV_MINOR: case TOMOYO_PATH2_DEV_MINOR: value = MINOR(stat->rdev); break; case TOMOYO_PATH1_PERM: case TOMOYO_PATH2_PERM: case TOMOYO_PATH1_PARENT_PERM: case TOMOYO_PATH2_PARENT_PERM: value = stat->mode & S_IALLUGO; break; } } break; } max_v[j] = value; min_v[j] = value; switch (index) { case TOMOYO_MODE_SETUID: case TOMOYO_MODE_SETGID: case TOMOYO_MODE_STICKY: case TOMOYO_MODE_OWNER_READ: case TOMOYO_MODE_OWNER_WRITE: case TOMOYO_MODE_OWNER_EXECUTE: case TOMOYO_MODE_GROUP_READ: case TOMOYO_MODE_GROUP_WRITE: case TOMOYO_MODE_GROUP_EXECUTE: case TOMOYO_MODE_OTHERS_READ: case TOMOYO_MODE_OTHERS_WRITE: case TOMOYO_MODE_OTHERS_EXECUTE: is_bitop[j] = true; } } if (left == TOMOYO_NUMBER_UNION) { /* Fetch values now. */ const struct tomoyo_number_union *ptr = numbers_p++; min_v[0] = ptr->values[0]; max_v[0] = ptr->values[1]; } if (right == TOMOYO_NUMBER_UNION) { /* Fetch values now. */ const struct tomoyo_number_union *ptr = numbers_p++; if (ptr->group) { if (tomoyo_number_matches_group(min_v[0], max_v[0], ptr->group) == match) continue; } else { if ((min_v[0] <= ptr->values[1] && max_v[0] >= ptr->values[0]) == match) continue; } goto out; } /* * Bit operation is valid only when counterpart value * represents permission. */ if (is_bitop[0] && is_bitop[1]) { goto out; } else if (is_bitop[0]) { switch (right) { case TOMOYO_PATH1_PERM: case TOMOYO_PATH1_PARENT_PERM: case TOMOYO_PATH2_PERM: case TOMOYO_PATH2_PARENT_PERM: if (!(max_v[0] & max_v[1]) == !match) continue; } goto out; } else if (is_bitop[1]) { switch (left) { case TOMOYO_PATH1_PERM: case TOMOYO_PATH1_PARENT_PERM: case TOMOYO_PATH2_PERM: case TOMOYO_PATH2_PARENT_PERM: if (!(max_v[0] & max_v[1]) == !match) continue; } goto out; } /* Normal value range comparison. */ if ((min_v[0] <= max_v[1] && max_v[0] >= min_v[1]) == match) continue; out: return false; } /* Check argv[] and envp[] now. */ if (r->ee && (argc || envc)) return tomoyo_scan_bprm(r->ee, argc, argv, envc, envp); return true; }
2 2 2 2 2 2 483 483 483 483 483 483 483 483 1 1 1 1 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 // SPDX-License-Identifier: GPL-2.0 /* * sysctl_net_ipv6.c: sysctl interface to net IPV6 subsystem. * * Changes: * YOSHIFUJI Hideaki @USAGI: added icmp sysctl table. */ #include <linux/mm.h> #include <linux/sysctl.h> #include <linux/in6.h> #include <linux/ipv6.h> #include <linux/slab.h> #include <linux/export.h> #include <net/ndisc.h> #include <net/ipv6.h> #include <net/addrconf.h> #include <net/inet_frag.h> #include <net/netevent.h> #include <net/ip_fib.h> #ifdef CONFIG_NETLABEL #include <net/calipso.h> #endif #include <linux/ioam6.h> static int flowlabel_reflect_max = 0x7; static int auto_flowlabels_max = IP6_AUTO_FLOW_LABEL_MAX; static u32 rt6_multipath_hash_fields_all_mask = FIB_MULTIPATH_HASH_FIELD_ALL_MASK; static u32 ioam6_id_max = IOAM6_DEFAULT_ID; static u64 ioam6_id_wide_max = IOAM6_DEFAULT_ID_WIDE; static int proc_rt6_multipath_hash_policy(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net; int ret; net = container_of(table->data, struct net, ipv6.sysctl.multipath_hash_policy); ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos); if (write && ret == 0) call_netevent_notifiers(NETEVENT_IPV6_MPATH_HASH_UPDATE, net); return ret; } static int proc_rt6_multipath_hash_fields(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net; int ret; net = container_of(table->data, struct net, ipv6.sysctl.multipath_hash_fields); ret = proc_douintvec_minmax(table, write, buffer, lenp, ppos); if (write && ret == 0) call_netevent_notifiers(NETEVENT_IPV6_MPATH_HASH_UPDATE, net); return ret; } static struct ctl_table ipv6_table_template[] = { { .procname = "bindv6only", .data = &init_net.ipv6.sysctl.bindv6only, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "anycast_src_echo_reply", .data = &init_net.ipv6.sysctl.anycast_src_echo_reply, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "flowlabel_consistency", .data = &init_net.ipv6.sysctl.flowlabel_consistency, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "auto_flowlabels", .data = &init_net.ipv6.sysctl.auto_flowlabels, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra2 = &auto_flowlabels_max }, { .procname = "fwmark_reflect", .data = &init_net.ipv6.sysctl.fwmark_reflect, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "idgen_retries", .data = &init_net.ipv6.sysctl.idgen_retries, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "idgen_delay", .data = &init_net.ipv6.sysctl.idgen_delay, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "flowlabel_state_ranges", .data = &init_net.ipv6.sysctl.flowlabel_state_ranges, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "ip_nonlocal_bind", .data = &init_net.ipv6.sysctl.ip_nonlocal_bind, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "flowlabel_reflect", .data = &init_net.ipv6.sysctl.flowlabel_reflect, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = &flowlabel_reflect_max, }, { .procname = "max_dst_opts_number", .data = &init_net.ipv6.sysctl.max_dst_opts_cnt, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, { .procname = "max_hbh_opts_number", .data = &init_net.ipv6.sysctl.max_hbh_opts_cnt, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, { .procname = "max_dst_opts_length", .data = &init_net.ipv6.sysctl.max_dst_opts_len, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, { .procname = "max_hbh_length", .data = &init_net.ipv6.sysctl.max_hbh_opts_len, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, { .procname = "fib_multipath_hash_policy", .data = &init_net.ipv6.sysctl.multipath_hash_policy, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_rt6_multipath_hash_policy, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_THREE, }, { .procname = "fib_multipath_hash_fields", .data = &init_net.ipv6.sysctl.multipath_hash_fields, .maxlen = sizeof(u32), .mode = 0644, .proc_handler = proc_rt6_multipath_hash_fields, .extra1 = SYSCTL_ONE, .extra2 = &rt6_multipath_hash_fields_all_mask, }, { .procname = "seg6_flowlabel", .data = &init_net.ipv6.sysctl.seg6_flowlabel, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, { .procname = "fib_notify_on_flag_change", .data = &init_net.ipv6.sysctl.fib_notify_on_flag_change, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, { .procname = "ioam6_id", .data = &init_net.ipv6.sysctl.ioam6_id, .maxlen = sizeof(u32), .mode = 0644, .proc_handler = proc_douintvec_minmax, .extra2 = &ioam6_id_max, }, { .procname = "ioam6_id_wide", .data = &init_net.ipv6.sysctl.ioam6_id_wide, .maxlen = sizeof(u64), .mode = 0644, .proc_handler = proc_doulongvec_minmax, .extra2 = &ioam6_id_wide_max, }, }; static struct ctl_table ipv6_rotable[] = { { .procname = "mld_max_msf", .data = &sysctl_mld_max_msf, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, { .procname = "mld_qrv", .data = &sysctl_mld_qrv, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE }, #ifdef CONFIG_NETLABEL { .procname = "calipso_cache_enable", .data = &calipso_cache_enabled, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "calipso_cache_bucket_size", .data = &calipso_cache_bucketsize, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, #endif /* CONFIG_NETLABEL */ }; static int __net_init ipv6_sysctl_net_init(struct net *net) { size_t table_size = ARRAY_SIZE(ipv6_table_template); struct ctl_table *ipv6_table; struct ctl_table *ipv6_route_table; struct ctl_table *ipv6_icmp_table; int err, i; err = -ENOMEM; ipv6_table = kmemdup(ipv6_table_template, sizeof(ipv6_table_template), GFP_KERNEL); if (!ipv6_table) goto out; /* Update the variables to point into the current struct net */ for (i = 0; i < table_size; i++) ipv6_table[i].data += (void *)net - (void *)&init_net; ipv6_route_table = ipv6_route_sysctl_init(net); if (!ipv6_route_table) goto out_ipv6_table; ipv6_icmp_table = ipv6_icmp_sysctl_init(net); if (!ipv6_icmp_table) goto out_ipv6_route_table; net->ipv6.sysctl.hdr = register_net_sysctl_sz(net, "net/ipv6", ipv6_table, table_size); if (!net->ipv6.sysctl.hdr) goto out_ipv6_icmp_table; net->ipv6.sysctl.route_hdr = register_net_sysctl_sz(net, "net/ipv6/route", ipv6_route_table, ipv6_route_sysctl_table_size(net)); if (!net->ipv6.sysctl.route_hdr) goto out_unregister_ipv6_table; net->ipv6.sysctl.icmp_hdr = register_net_sysctl_sz(net, "net/ipv6/icmp", ipv6_icmp_table, ipv6_icmp_sysctl_table_size()); if (!net->ipv6.sysctl.icmp_hdr) goto out_unregister_route_table; err = 0; out: return err; out_unregister_route_table: unregister_net_sysctl_table(net->ipv6.sysctl.route_hdr); out_unregister_ipv6_table: unregister_net_sysctl_table(net->ipv6.sysctl.hdr); out_ipv6_icmp_table: kfree(ipv6_icmp_table); out_ipv6_route_table: kfree(ipv6_route_table); out_ipv6_table: kfree(ipv6_table); goto out; } static void __net_exit ipv6_sysctl_net_exit(struct net *net) { const struct ctl_table *ipv6_table; const struct ctl_table *ipv6_route_table; const struct ctl_table *ipv6_icmp_table; ipv6_table = net->ipv6.sysctl.hdr->ctl_table_arg; ipv6_route_table = net->ipv6.sysctl.route_hdr->ctl_table_arg; ipv6_icmp_table = net->ipv6.sysctl.icmp_hdr->ctl_table_arg; unregister_net_sysctl_table(net->ipv6.sysctl.icmp_hdr); unregister_net_sysctl_table(net->ipv6.sysctl.route_hdr); unregister_net_sysctl_table(net->ipv6.sysctl.hdr); kfree(ipv6_table); kfree(ipv6_route_table); kfree(ipv6_icmp_table); } static struct pernet_operations ipv6_sysctl_net_ops = { .init = ipv6_sysctl_net_init, .exit = ipv6_sysctl_net_exit, }; static struct ctl_table_header *ip6_header; int ipv6_sysctl_register(void) { int err = -ENOMEM; ip6_header = register_net_sysctl(&init_net, "net/ipv6", ipv6_rotable); if (!ip6_header) goto out; err = register_pernet_subsys(&ipv6_sysctl_net_ops); if (err) goto err_pernet; out: return err; err_pernet: unregister_net_sysctl_table(ip6_header); goto out; } void ipv6_sysctl_unregister(void) { unregister_net_sysctl_table(ip6_header); unregister_pernet_subsys(&ipv6_sysctl_net_ops); }
1 1 2 2 2 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 // SPDX-License-Identifier: GPL-2.0-or-later /* * super.c * * load/unload driver, mount/dismount volumes * * Copyright (C) 2002, 2004 Oracle. All rights reserved. */ #include <linux/module.h> #include <linux/fs.h> #include <linux/types.h> #include <linux/slab.h> #include <linux/highmem.h> #include <linux/init.h> #include <linux/random.h> #include <linux/statfs.h> #include <linux/moduleparam.h> #include <linux/blkdev.h> #include <linux/socket.h> #include <linux/inet.h> #include <linux/fs_parser.h> #include <linux/fs_context.h> #include <linux/crc32.h> #include <linux/debugfs.h> #include <linux/seq_file.h> #include <linux/quotaops.h> #include <linux/signal.h> #define CREATE_TRACE_POINTS #include "ocfs2_trace.h" #include <cluster/masklog.h> #include "ocfs2.h" /* this should be the only file to include a version 1 header */ #include "ocfs1_fs_compat.h" #include "alloc.h" #include "aops.h" #include "blockcheck.h" #include "dlmglue.h" #include "export.h" #include "extent_map.h" #include "heartbeat.h" #include "inode.h" #include "journal.h" #include "localalloc.h" #include "namei.h" #include "slot_map.h" #include "super.h" #include "sysfile.h" #include "uptodate.h" #include "xattr.h" #include "quota.h" #include "refcounttree.h" #include "suballoc.h" #include "buffer_head_io.h" #include "filecheck.h" static struct kmem_cache *ocfs2_inode_cachep; struct kmem_cache *ocfs2_dquot_cachep; struct kmem_cache *ocfs2_qf_chunk_cachep; static struct dentry *ocfs2_debugfs_root; MODULE_AUTHOR("Oracle"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("OCFS2 cluster file system"); struct mount_options { unsigned long commit_interval; unsigned long mount_opt; unsigned int atime_quantum; unsigned short slot; int localalloc_opt; unsigned int resv_level; int dir_resv_level; char cluster_stack[OCFS2_STACK_LABEL_LEN + 1]; bool user_stack; }; static int ocfs2_parse_param(struct fs_context *fc, struct fs_parameter *param); static int ocfs2_check_set_options(struct super_block *sb, struct mount_options *options); static int ocfs2_show_options(struct seq_file *s, struct dentry *root); static void ocfs2_put_super(struct super_block *sb); static int ocfs2_mount_volume(struct super_block *sb); static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err); static int ocfs2_initialize_mem_caches(void); static void ocfs2_free_mem_caches(void); static void ocfs2_delete_osb(struct ocfs2_super *osb); static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf); static int ocfs2_sync_fs(struct super_block *sb, int wait); static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb); static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb); static void ocfs2_release_system_inodes(struct ocfs2_super *osb); static int ocfs2_check_volume(struct ocfs2_super *osb); static int ocfs2_verify_volume(struct ocfs2_dinode *di, struct buffer_head *bh, u32 sectsize, struct ocfs2_blockcheck_stats *stats); static int ocfs2_initialize_super(struct super_block *sb, struct buffer_head *bh, int sector_size, struct ocfs2_blockcheck_stats *stats); static int ocfs2_get_sector(struct super_block *sb, struct buffer_head **bh, int block, int sect_size); static struct inode *ocfs2_alloc_inode(struct super_block *sb); static void ocfs2_free_inode(struct inode *inode); static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend); static int ocfs2_enable_quotas(struct ocfs2_super *osb); static void ocfs2_disable_quotas(struct ocfs2_super *osb); static struct dquot __rcu **ocfs2_get_dquots(struct inode *inode) { return OCFS2_I(inode)->i_dquot; } static const struct super_operations ocfs2_sops = { .statfs = ocfs2_statfs, .alloc_inode = ocfs2_alloc_inode, .free_inode = ocfs2_free_inode, .drop_inode = inode_just_drop, .evict_inode = ocfs2_evict_inode, .sync_fs = ocfs2_sync_fs, .put_super = ocfs2_put_super, .show_options = ocfs2_show_options, .quota_read = ocfs2_quota_read, .quota_write = ocfs2_quota_write, .get_dquots = ocfs2_get_dquots, }; enum { Opt_barrier, Opt_errors, Opt_intr, Opt_heartbeat, Opt_data, Opt_atime_quantum, Opt_slot, Opt_commit, Opt_localalloc, Opt_localflocks, Opt_stack, Opt_user_xattr, Opt_inode64, Opt_acl, Opt_usrquota, Opt_grpquota, Opt_coherency, Opt_resv_level, Opt_dir_resv_level, Opt_journal_async_commit, }; static const struct constant_table ocfs2_param_errors[] = { {"panic", OCFS2_MOUNT_ERRORS_PANIC}, {"remount-ro", OCFS2_MOUNT_ERRORS_ROFS}, {"continue", OCFS2_MOUNT_ERRORS_CONT}, {} }; static const struct constant_table ocfs2_param_heartbeat[] = { {"local", OCFS2_MOUNT_HB_LOCAL}, {"none", OCFS2_MOUNT_HB_NONE}, {"global", OCFS2_MOUNT_HB_GLOBAL}, {} }; static const struct constant_table ocfs2_param_data[] = { {"writeback", OCFS2_MOUNT_DATA_WRITEBACK}, {"ordered", 0}, {} }; static const struct constant_table ocfs2_param_coherency[] = { {"buffered", OCFS2_MOUNT_COHERENCY_BUFFERED}, {"full", 0}, {} }; static const struct fs_parameter_spec ocfs2_param_spec[] = { fsparam_u32 ("barrier", Opt_barrier), fsparam_enum ("errors", Opt_errors, ocfs2_param_errors), fsparam_flag_no ("intr", Opt_intr), fsparam_enum ("heartbeat", Opt_heartbeat, ocfs2_param_heartbeat), fsparam_enum ("data", Opt_data, ocfs2_param_data), fsparam_u32 ("atime_quantum", Opt_atime_quantum), fsparam_u32 ("preferred_slot", Opt_slot), fsparam_u32 ("commit", Opt_commit), fsparam_s32 ("localalloc", Opt_localalloc), fsparam_flag ("localflocks", Opt_localflocks), fsparam_string ("cluster_stack", Opt_stack), fsparam_flag_no ("user_xattr", Opt_user_xattr), fsparam_flag ("inode64", Opt_inode64), fsparam_flag_no ("acl", Opt_acl), fsparam_flag ("usrquota", Opt_usrquota), fsparam_flag ("grpquota", Opt_grpquota), fsparam_enum ("coherency", Opt_coherency, ocfs2_param_coherency), fsparam_u32 ("resv_level", Opt_resv_level), fsparam_u32 ("dir_resv_level", Opt_dir_resv_level), fsparam_flag ("journal_async_commit", Opt_journal_async_commit), {} }; #ifdef CONFIG_DEBUG_FS static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len) { struct ocfs2_cluster_connection *cconn = osb->cconn; struct ocfs2_recovery_map *rm = osb->recovery_map; struct ocfs2_orphan_scan *os = &osb->osb_orphan_scan; int i, out = 0; unsigned long flags; out += scnprintf(buf + out, len - out, "%10s => Id: %-s Uuid: %-s Gen: 0x%X Label: %-s\n", "Device", osb->dev_str, osb->uuid_str, osb->fs_generation, osb->vol_label); out += scnprintf(buf + out, len - out, "%10s => State: %d Flags: 0x%lX\n", "Volume", atomic_read(&osb->vol_state), osb->osb_flags); out += scnprintf(buf + out, len - out, "%10s => Block: %lu Cluster: %d\n", "Sizes", osb->sb->s_blocksize, osb->s_clustersize); out += scnprintf(buf + out, len - out, "%10s => Compat: 0x%X Incompat: 0x%X " "ROcompat: 0x%X\n", "Features", osb->s_feature_compat, osb->s_feature_incompat, osb->s_feature_ro_compat); out += scnprintf(buf + out, len - out, "%10s => Opts: 0x%lX AtimeQuanta: %u\n", "Mount", osb->s_mount_opt, osb->s_atime_quantum); if (cconn) { out += scnprintf(buf + out, len - out, "%10s => Stack: %s Name: %*s " "Version: %d.%d\n", "Cluster", (*osb->osb_cluster_stack == '\0' ? "o2cb" : osb->osb_cluster_stack), cconn->cc_namelen, cconn->cc_name, cconn->cc_version.pv_major, cconn->cc_version.pv_minor); } spin_lock_irqsave(&osb->dc_task_lock, flags); out += scnprintf(buf + out, len - out, "%10s => Pid: %d Count: %lu WakeSeq: %lu " "WorkSeq: %lu\n", "DownCnvt", (osb->dc_task ? task_pid_nr(osb->dc_task) : -1), osb->blocked_lock_count, osb->dc_wake_sequence, osb->dc_work_sequence); spin_unlock_irqrestore(&osb->dc_task_lock, flags); spin_lock(&osb->osb_lock); out += scnprintf(buf + out, len - out, "%10s => Pid: %d Nodes:", "Recovery", (osb->recovery_thread_task ? task_pid_nr(osb->recovery_thread_task) : -1)); if (rm->rm_used == 0) out += scnprintf(buf + out, len - out, " None\n"); else { for (i = 0; i < rm->rm_used; i++) out += scnprintf(buf + out, len - out, " %d", rm->rm_entries[i]); out += scnprintf(buf + out, len - out, "\n"); } spin_unlock(&osb->osb_lock); out += scnprintf(buf + out, len - out, "%10s => Pid: %d Interval: %lu\n", "Commit", (osb->commit_task ? task_pid_nr(osb->commit_task) : -1), osb->osb_commit_interval); out += scnprintf(buf + out, len - out, "%10s => State: %d TxnId: %lu NumTxns: %d\n", "Journal", osb->journal->j_state, osb->journal->j_trans_id, atomic_read(&osb->journal->j_num_trans)); out += scnprintf(buf + out, len - out, "%10s => GlobalAllocs: %d LocalAllocs: %d " "SubAllocs: %d LAWinMoves: %d SAExtends: %d\n", "Stats", atomic_read(&osb->alloc_stats.bitmap_data), atomic_read(&osb->alloc_stats.local_data), atomic_read(&osb->alloc_stats.bg_allocs), atomic_read(&osb->alloc_stats.moves), atomic_read(&osb->alloc_stats.bg_extends)); out += scnprintf(buf + out, len - out, "%10s => State: %u Descriptor: %llu Size: %u bits " "Default: %u bits\n", "LocalAlloc", osb->local_alloc_state, (unsigned long long)osb->la_last_gd, osb->local_alloc_bits, osb->local_alloc_default_bits); spin_lock(&osb->osb_lock); out += scnprintf(buf + out, len - out, "%10s => InodeSlot: %d StolenInodes: %d, " "MetaSlot: %d StolenMeta: %d\n", "Steal", osb->s_inode_steal_slot, atomic_read(&osb->s_num_inodes_stolen), osb->s_meta_steal_slot, atomic_read(&osb->s_num_meta_stolen)); spin_unlock(&osb->osb_lock); out += scnprintf(buf + out, len - out, "OrphanScan => "); out += scnprintf(buf + out, len - out, "Local: %u Global: %u ", os->os_count, os->os_seqno); out += scnprintf(buf + out, len - out, " Last Scan: "); if (atomic_read(&os->os_state) == ORPHAN_SCAN_INACTIVE) out += scnprintf(buf + out, len - out, "Disabled\n"); else out += scnprintf(buf + out, len - out, "%lu seconds ago\n", (unsigned long)(ktime_get_seconds() - os->os_scantime)); out += scnprintf(buf + out, len - out, "%10s => %3s %10s\n", "Slots", "Num", "RecoGen"); for (i = 0; i < osb->max_slots; ++i) { out += scnprintf(buf + out, len - out, "%10s %c %3d %10d\n", " ", (i == osb->slot_num ? '*' : ' '), i, osb->slot_recovery_generations[i]); } return out; } static int ocfs2_osb_debug_open(struct inode *inode, struct file *file) { struct ocfs2_super *osb = inode->i_private; char *buf = NULL; buf = kmalloc(PAGE_SIZE, GFP_KERNEL); if (!buf) goto bail; i_size_write(inode, ocfs2_osb_dump(osb, buf, PAGE_SIZE)); file->private_data = buf; return 0; bail: return -ENOMEM; } static int ocfs2_debug_release(struct inode *inode, struct file *file) { kfree(file->private_data); return 0; } static ssize_t ocfs2_debug_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos) { return simple_read_from_buffer(buf, nbytes, ppos, file->private_data, i_size_read(file->f_mapping->host)); } #else static int ocfs2_osb_debug_open(struct inode *inode, struct file *file) { return 0; } static int ocfs2_debug_release(struct inode *inode, struct file *file) { return 0; } static ssize_t ocfs2_debug_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos) { return 0; } #endif /* CONFIG_DEBUG_FS */ static const struct file_operations ocfs2_osb_debug_fops = { .open = ocfs2_osb_debug_open, .release = ocfs2_debug_release, .read = ocfs2_debug_read, .llseek = generic_file_llseek, }; static int ocfs2_sync_fs(struct super_block *sb, int wait) { int status; tid_t target; struct ocfs2_super *osb = OCFS2_SB(sb); if (ocfs2_is_hard_readonly(osb)) return -EROFS; if (wait) { status = ocfs2_flush_truncate_log(osb); if (status < 0) mlog_errno(status); } else { ocfs2_schedule_truncate_log_flush(osb, 0); } if (jbd2_journal_start_commit(osb->journal->j_journal, &target)) { if (wait) jbd2_log_wait_commit(osb->journal->j_journal, target); } return 0; } static int ocfs2_need_system_inode(struct ocfs2_super *osb, int ino) { if (!OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, OCFS2_FEATURE_RO_COMPAT_USRQUOTA) && (ino == USER_QUOTA_SYSTEM_INODE || ino == LOCAL_USER_QUOTA_SYSTEM_INODE)) return 0; if (!OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, OCFS2_FEATURE_RO_COMPAT_GRPQUOTA) && (ino == GROUP_QUOTA_SYSTEM_INODE || ino == LOCAL_GROUP_QUOTA_SYSTEM_INODE)) return 0; return 1; } static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb) { struct inode *new = NULL; int status = 0; int i; new = ocfs2_iget(osb, osb->root_blkno, OCFS2_FI_FLAG_SYSFILE, 0); if (IS_ERR(new)) { status = PTR_ERR(new); mlog_errno(status); goto bail; } osb->root_inode = new; new = ocfs2_iget(osb, osb->system_dir_blkno, OCFS2_FI_FLAG_SYSFILE, 0); if (IS_ERR(new)) { status = PTR_ERR(new); mlog_errno(status); goto bail; } osb->sys_root_inode = new; for (i = OCFS2_FIRST_ONLINE_SYSTEM_INODE; i <= OCFS2_LAST_GLOBAL_SYSTEM_INODE; i++) { if (!ocfs2_need_system_inode(osb, i)) continue; new = ocfs2_get_system_file_inode(osb, i, osb->slot_num); if (!new) { ocfs2_release_system_inodes(osb); status = ocfs2_is_soft_readonly(osb) ? -EROFS : -EINVAL; mlog_errno(status); mlog(ML_ERROR, "Unable to load system inode %d, " "possibly corrupt fs?", i); goto bail; } // the array now has one ref, so drop this one iput(new); } bail: if (status) mlog_errno(status); return status; } static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb) { struct inode *new = NULL; int status = 0; int i; for (i = OCFS2_LAST_GLOBAL_SYSTEM_INODE + 1; i < NUM_SYSTEM_INODES; i++) { if (!ocfs2_need_system_inode(osb, i)) continue; new = ocfs2_get_system_file_inode(osb, i, osb->slot_num); if (!new) { ocfs2_release_system_inodes(osb); status = ocfs2_is_soft_readonly(osb) ? -EROFS : -EINVAL; mlog(ML_ERROR, "status=%d, sysfile=%d, slot=%d\n", status, i, osb->slot_num); goto bail; } /* the array now has one ref, so drop this one */ iput(new); } bail: if (status) mlog_errno(status); return status; } static void ocfs2_release_system_inodes(struct ocfs2_super *osb) { int i; struct inode *inode; for (i = 0; i < NUM_GLOBAL_SYSTEM_INODES; i++) { inode = osb->global_system_inodes[i]; if (inode) { iput(inode); osb->global_system_inodes[i] = NULL; } } inode = osb->sys_root_inode; if (inode) { iput(inode); osb->sys_root_inode = NULL; } inode = osb->root_inode; if (inode) { iput(inode); osb->root_inode = NULL; } if (!osb->local_system_inodes) return; for (i = 0; i < NUM_LOCAL_SYSTEM_INODES * osb->max_slots; i++) { if (osb->local_system_inodes[i]) { iput(osb->local_system_inodes[i]); osb->local_system_inodes[i] = NULL; } } kfree(osb->local_system_inodes); osb->local_system_inodes = NULL; } /* We're allocating fs objects, use GFP_NOFS */ static struct inode *ocfs2_alloc_inode(struct super_block *sb) { struct ocfs2_inode_info *oi; oi = alloc_inode_sb(sb, ocfs2_inode_cachep, GFP_NOFS); if (!oi) return NULL; oi->i_sync_tid = 0; oi->i_datasync_tid = 0; memset(&oi->i_dquot, 0, sizeof(oi->i_dquot)); jbd2_journal_init_jbd_inode(&oi->ip_jinode, &oi->vfs_inode); return &oi->vfs_inode; } static void ocfs2_free_inode(struct inode *inode) { kmem_cache_free(ocfs2_inode_cachep, OCFS2_I(inode)); } static unsigned long long ocfs2_max_file_offset(unsigned int bbits, unsigned int cbits) { unsigned int bytes = 1 << cbits; unsigned int trim = bytes; unsigned int bitshift = 32; /* * i_size and all block offsets in ocfs2 are always 64 bits * wide. i_clusters is 32 bits, in cluster-sized units. So on * 64 bit platforms, cluster size will be the limiting factor. */ #if BITS_PER_LONG == 32 BUILD_BUG_ON(sizeof(sector_t) != 8); /* * We might be limited by page cache size. */ if (bytes > PAGE_SIZE) { bytes = PAGE_SIZE; trim = 1; /* * Shift by 31 here so that we don't get larger than * MAX_LFS_FILESIZE */ bitshift = 31; } #endif /* * Trim by a whole cluster when we can actually approach the * on-disk limits. Otherwise we can overflow i_clusters when * an extent start is at the max offset. */ return (((unsigned long long)bytes) << bitshift) - trim; } static int ocfs2_reconfigure(struct fs_context *fc) { int incompat_features; int ret = 0; struct mount_options *parsed_options = fc->fs_private; struct super_block *sb = fc->root->d_sb; struct ocfs2_super *osb = OCFS2_SB(sb); u32 tmp; sync_filesystem(sb); if (!ocfs2_check_set_options(sb, parsed_options)) { ret = -EINVAL; goto out; } tmp = OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL | OCFS2_MOUNT_HB_NONE; if ((osb->s_mount_opt & tmp) != (parsed_options->mount_opt & tmp)) { ret = -EINVAL; mlog(ML_ERROR, "Cannot change heartbeat mode on remount\n"); goto out; } if ((osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK) != (parsed_options->mount_opt & OCFS2_MOUNT_DATA_WRITEBACK)) { ret = -EINVAL; mlog(ML_ERROR, "Cannot change data mode on remount\n"); goto out; } /* Probably don't want this on remount; it might * mess with other nodes */ if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64) && (parsed_options->mount_opt & OCFS2_MOUNT_INODE64)) { ret = -EINVAL; mlog(ML_ERROR, "Cannot enable inode64 on remount\n"); goto out; } /* We're going to/from readonly mode. */ if ((bool)(fc->sb_flags & SB_RDONLY) != sb_rdonly(sb)) { /* Disable quota accounting before remounting RO */ if (fc->sb_flags & SB_RDONLY) { ret = ocfs2_susp_quotas(osb, 0); if (ret < 0) goto out; } /* Lock here so the check of HARD_RO and the potential * setting of SOFT_RO is atomic. */ spin_lock(&osb->osb_lock); if (osb->osb_flags & OCFS2_OSB_HARD_RO) { mlog(ML_ERROR, "Remount on readonly device is forbidden.\n"); ret = -EROFS; goto unlock_osb; } if (fc->sb_flags & SB_RDONLY) { sb->s_flags |= SB_RDONLY; osb->osb_flags |= OCFS2_OSB_SOFT_RO; } else { if (osb->osb_flags & OCFS2_OSB_ERROR_FS) { mlog(ML_ERROR, "Cannot remount RDWR " "filesystem due to previous errors.\n"); ret = -EROFS; goto unlock_osb; } incompat_features = OCFS2_HAS_RO_COMPAT_FEATURE(sb, ~OCFS2_FEATURE_RO_COMPAT_SUPP); if (incompat_features) { mlog(ML_ERROR, "Cannot remount RDWR because " "of unsupported optional features " "(%x).\n", incompat_features); ret = -EINVAL; goto unlock_osb; } sb->s_flags &= ~SB_RDONLY; osb->osb_flags &= ~OCFS2_OSB_SOFT_RO; } trace_ocfs2_remount(sb->s_flags, osb->osb_flags, fc->sb_flags); unlock_osb: spin_unlock(&osb->osb_lock); /* Enable quota accounting after remounting RW */ if (!ret && !(fc->sb_flags & SB_RDONLY)) { if (sb_any_quota_suspended(sb)) ret = ocfs2_susp_quotas(osb, 1); else ret = ocfs2_enable_quotas(osb); if (ret < 0) { /* Return back changes... */ spin_lock(&osb->osb_lock); sb->s_flags |= SB_RDONLY; osb->osb_flags |= OCFS2_OSB_SOFT_RO; spin_unlock(&osb->osb_lock); goto out; } } } if (!ret) { /* Only save off the new mount options in case of a successful * remount. */ osb->s_mount_opt = parsed_options->mount_opt; osb->s_atime_quantum = parsed_options->atime_quantum; osb->preferred_slot = parsed_options->slot; if (parsed_options->commit_interval) osb->osb_commit_interval = parsed_options->commit_interval; if (!ocfs2_is_hard_readonly(osb)) ocfs2_set_journal_params(osb); sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ? SB_POSIXACL : 0); } out: return ret; } static int ocfs2_sb_probe(struct super_block *sb, struct buffer_head **bh, int *sector_size, struct ocfs2_blockcheck_stats *stats) { int status, tmpstat; struct ocfs1_vol_disk_hdr *hdr; struct ocfs2_dinode *di; int blksize; *bh = NULL; /* may be > 512 */ *sector_size = bdev_logical_block_size(sb->s_bdev); if (*sector_size > OCFS2_MAX_BLOCKSIZE) { mlog(ML_ERROR, "Hardware sector size too large: %d (max=%d)\n", *sector_size, OCFS2_MAX_BLOCKSIZE); status = -EINVAL; goto bail; } /* Can this really happen? */ if (*sector_size < OCFS2_MIN_BLOCKSIZE) *sector_size = OCFS2_MIN_BLOCKSIZE; /* check block zero for old format */ status = ocfs2_get_sector(sb, bh, 0, *sector_size); if (status < 0) { mlog_errno(status); goto bail; } hdr = (struct ocfs1_vol_disk_hdr *) (*bh)->b_data; if (hdr->major_version == OCFS1_MAJOR_VERSION) { mlog(ML_ERROR, "incompatible version: %u.%u\n", hdr->major_version, hdr->minor_version); status = -EINVAL; } if (memcmp(hdr->signature, OCFS1_VOLUME_SIGNATURE, strlen(OCFS1_VOLUME_SIGNATURE)) == 0) { mlog(ML_ERROR, "incompatible volume signature: %8s\n", hdr->signature); status = -EINVAL; } brelse(*bh); *bh = NULL; if (status < 0) { mlog(ML_ERROR, "This is an ocfs v1 filesystem which must be " "upgraded before mounting with ocfs v2\n"); goto bail; } /* * Now check at magic offset for 512, 1024, 2048, 4096 * blocksizes. 4096 is the maximum blocksize because it is * the minimum clustersize. */ status = -EINVAL; for (blksize = *sector_size; blksize <= OCFS2_MAX_BLOCKSIZE; blksize <<= 1) { tmpstat = ocfs2_get_sector(sb, bh, OCFS2_SUPER_BLOCK_BLKNO, blksize); if (tmpstat < 0) { status = tmpstat; mlog_errno(status); break; } di = (struct ocfs2_dinode *) (*bh)->b_data; memset(stats, 0, sizeof(struct ocfs2_blockcheck_stats)); spin_lock_init(&stats->b_lock); tmpstat = ocfs2_verify_volume(di, *bh, blksize, stats); if (tmpstat < 0) { brelse(*bh); *bh = NULL; } if (tmpstat != -EAGAIN) { status = tmpstat; break; } } bail: return status; } static int ocfs2_verify_heartbeat(struct ocfs2_super *osb) { u32 hb_enabled = OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL; if (osb->s_mount_opt & hb_enabled) { if (ocfs2_mount_local(osb)) { mlog(ML_ERROR, "Cannot heartbeat on a locally " "mounted device.\n"); return -EINVAL; } if (ocfs2_userspace_stack(osb)) { mlog(ML_ERROR, "Userspace stack expected, but " "o2cb heartbeat arguments passed to mount\n"); return -EINVAL; } if (((osb->s_mount_opt & OCFS2_MOUNT_HB_GLOBAL) && !ocfs2_cluster_o2cb_global_heartbeat(osb)) || ((osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) && ocfs2_cluster_o2cb_global_heartbeat(osb))) { mlog(ML_ERROR, "Mismatching o2cb heartbeat modes\n"); return -EINVAL; } } if (!(osb->s_mount_opt & hb_enabled)) { if (!ocfs2_mount_local(osb) && !ocfs2_is_hard_readonly(osb) && !ocfs2_userspace_stack(osb)) { mlog(ML_ERROR, "Heartbeat has to be started to mount " "a read-write clustered device.\n"); return -EINVAL; } } return 0; } /* * If we're using a userspace stack, mount should have passed * a name that matches the disk. If not, mount should not * have passed a stack. */ static int ocfs2_verify_userspace_stack(struct ocfs2_super *osb, struct mount_options *mopt) { if (!ocfs2_userspace_stack(osb) && mopt->cluster_stack[0]) { mlog(ML_ERROR, "cluster stack passed to mount, but this filesystem " "does not support it\n"); return -EINVAL; } if (ocfs2_userspace_stack(osb) && strncmp(osb->osb_cluster_stack, mopt->cluster_stack, OCFS2_STACK_LABEL_LEN)) { mlog(ML_ERROR, "cluster stack passed to mount (\"%s\") does not " "match the filesystem (\"%s\")\n", mopt->cluster_stack, osb->osb_cluster_stack); return -EINVAL; } return 0; } static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend) { int type; struct super_block *sb = osb->sb; unsigned int feature[OCFS2_MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; int status = 0; for (type = 0; type < OCFS2_MAXQUOTAS; type++) { if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type])) continue; if (unsuspend) status = dquot_resume(sb, type); else { struct ocfs2_mem_dqinfo *oinfo; /* Cancel periodic syncing before suspending */ oinfo = sb_dqinfo(sb, type)->dqi_priv; cancel_delayed_work_sync(&oinfo->dqi_sync_work); status = dquot_suspend(sb, type); } if (status < 0) break; } if (status < 0) mlog(ML_ERROR, "Failed to suspend/unsuspend quotas on " "remount (error = %d).\n", status); return status; } static int ocfs2_enable_quotas(struct ocfs2_super *osb) { struct inode *inode[OCFS2_MAXQUOTAS] = { NULL, NULL }; struct super_block *sb = osb->sb; unsigned int feature[OCFS2_MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; unsigned int ino[OCFS2_MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE, LOCAL_GROUP_QUOTA_SYSTEM_INODE }; int status; int type; sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NEGATIVE_USAGE; for (type = 0; type < OCFS2_MAXQUOTAS; type++) { if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type])) continue; inode[type] = ocfs2_get_system_file_inode(osb, ino[type], osb->slot_num); if (!inode[type]) { status = -ENOENT; goto out_quota_off; } status = dquot_load_quota_inode(inode[type], type, QFMT_OCFS2, DQUOT_USAGE_ENABLED); if (status < 0) goto out_quota_off; } for (type = 0; type < OCFS2_MAXQUOTAS; type++) iput(inode[type]); return 0; out_quota_off: ocfs2_disable_quotas(osb); for (type = 0; type < OCFS2_MAXQUOTAS; type++) iput(inode[type]); mlog_errno(status); return status; } static void ocfs2_disable_quotas(struct ocfs2_super *osb) { int type; struct inode *inode; struct super_block *sb = osb->sb; struct ocfs2_mem_dqinfo *oinfo; /* We mostly ignore errors in this function because there's not much * we can do when we see them */ for (type = 0; type < OCFS2_MAXQUOTAS; type++) { if (!sb_has_quota_loaded(sb, type)) continue; if (!sb_has_quota_suspended(sb, type)) { oinfo = sb_dqinfo(sb, type)->dqi_priv; cancel_delayed_work_sync(&oinfo->dqi_sync_work); } inode = igrab(sb->s_dquot.files[type]); /* Turn off quotas. This will remove all dquot structures from * memory and so they will be automatically synced to global * quota files */ dquot_disable(sb, type, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); iput(inode); } } static int ocfs2_fill_super(struct super_block *sb, struct fs_context *fc) { struct dentry *root; int status, sector_size; struct mount_options *parsed_options = fc->fs_private; struct inode *inode = NULL; struct ocfs2_super *osb = NULL; struct buffer_head *bh = NULL; char nodestr[12]; struct ocfs2_blockcheck_stats stats; trace_ocfs2_fill_super(sb, fc, fc->sb_flags & SB_SILENT); /* probe for superblock */ status = ocfs2_sb_probe(sb, &bh, &sector_size, &stats); if (status < 0) { mlog(ML_ERROR, "superblock probe failed!\n"); goto out; } status = ocfs2_initialize_super(sb, bh, sector_size, &stats); brelse(bh); bh = NULL; if (status < 0) goto out; osb = OCFS2_SB(sb); if (!ocfs2_check_set_options(sb, parsed_options)) { status = -EINVAL; goto out_super; } osb->s_mount_opt = parsed_options->mount_opt; osb->s_atime_quantum = parsed_options->atime_quantum; osb->preferred_slot = parsed_options->slot; osb->osb_commit_interval = parsed_options->commit_interval; ocfs2_la_set_sizes(osb, parsed_options->localalloc_opt); osb->osb_resv_level = parsed_options->resv_level; osb->osb_dir_resv_level = parsed_options->resv_level; if (parsed_options->dir_resv_level == -1) osb->osb_dir_resv_level = parsed_options->resv_level; else osb->osb_dir_resv_level = parsed_options->dir_resv_level; status = ocfs2_verify_userspace_stack(osb, parsed_options); if (status) goto out_super; sb->s_magic = OCFS2_SUPER_MAGIC; sb->s_flags = (sb->s_flags & ~(SB_POSIXACL | SB_NOSEC)) | ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ? SB_POSIXACL : 0); /* Hard readonly mode only if: bdev_read_only, SB_RDONLY, * heartbeat=none */ if (bdev_read_only(sb->s_bdev)) { if (!sb_rdonly(sb)) { status = -EACCES; mlog(ML_ERROR, "Readonly device detected but readonly " "mount was not specified.\n"); goto out_super; } /* You should not be able to start a local heartbeat * on a readonly device. */ if (osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) { status = -EROFS; mlog(ML_ERROR, "Local heartbeat specified on readonly " "device.\n"); goto out_super; } status = ocfs2_check_journals_nolocks(osb); if (status < 0) { if (status == -EROFS) mlog(ML_ERROR, "Recovery required on readonly " "file system, but write access is " "unavailable.\n"); goto out_super; } ocfs2_set_ro_flag(osb, 1); printk(KERN_NOTICE "ocfs2: Readonly device (%s) detected. " "Cluster services will not be used for this mount. " "Recovery will be skipped.\n", osb->dev_str); } if (!ocfs2_is_hard_readonly(osb)) { if (sb_rdonly(sb)) ocfs2_set_ro_flag(osb, 0); } status = ocfs2_verify_heartbeat(osb); if (status < 0) goto out_super; osb->osb_debug_root = debugfs_create_dir(osb->uuid_str, ocfs2_debugfs_root); debugfs_create_file("fs_state", S_IFREG|S_IRUSR, osb->osb_debug_root, osb, &ocfs2_osb_debug_fops); if (ocfs2_meta_ecc(osb)) { ocfs2_initialize_journal_triggers(sb, osb->s_journal_triggers); ocfs2_blockcheck_stats_debugfs_install( &osb->osb_ecc_stats, osb->osb_debug_root); } status = ocfs2_mount_volume(sb); if (status < 0) goto out_debugfs; if (osb->root_inode) inode = igrab(osb->root_inode); if (!inode) { status = -EIO; goto out_dismount; } osb->osb_dev_kset = kset_create_and_add(sb->s_id, NULL, &ocfs2_kset->kobj); if (!osb->osb_dev_kset) { status = -ENOMEM; mlog(ML_ERROR, "Unable to create device kset %s.\n", sb->s_id); goto out_dismount; } /* Create filecheck sysfs related directories/files at * /sys/fs/ocfs2/<devname>/filecheck */ if (ocfs2_filecheck_create_sysfs(osb)) { status = -ENOMEM; mlog(ML_ERROR, "Unable to create filecheck sysfs directory at " "/sys/fs/ocfs2/%s/filecheck.\n", sb->s_id); goto out_dismount; } root = d_make_root(inode); if (!root) { status = -ENOMEM; goto out_dismount; } sb->s_root = root; ocfs2_complete_mount_recovery(osb); if (ocfs2_mount_local(osb)) snprintf(nodestr, sizeof(nodestr), "local"); else snprintf(nodestr, sizeof(nodestr), "%u", osb->node_num); printk(KERN_INFO "ocfs2: Mounting device (%s) on (node %s, slot %d) " "with %s data mode.\n", osb->dev_str, nodestr, osb->slot_num, osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK ? "writeback" : "ordered"); atomic_set(&osb->vol_state, VOLUME_MOUNTED); wake_up(&osb->osb_mount_event); /* Now we can initialize quotas because we can afford to wait * for cluster locks recovery now. That also means that truncation * log recovery can happen but that waits for proper quota setup */ if (!sb_rdonly(sb)) { status = ocfs2_enable_quotas(osb); if (status < 0) { /* We have to err-out specially here because * s_root is already set */ mlog_errno(status); atomic_set(&osb->vol_state, VOLUME_DISABLED); wake_up(&osb->osb_mount_event); return status; } } ocfs2_complete_quota_recovery(osb); /* Now we wake up again for processes waiting for quotas */ atomic_set(&osb->vol_state, VOLUME_MOUNTED_QUOTAS); wake_up(&osb->osb_mount_event); /* Start this when the mount is almost sure of being successful */ ocfs2_orphan_scan_start(osb); return status; out_dismount: atomic_set(&osb->vol_state, VOLUME_DISABLED); wake_up(&osb->osb_mount_event); ocfs2_free_replay_slots(osb); ocfs2_dismount_volume(sb, 1); goto out; out_debugfs: debugfs_remove_recursive(osb->osb_debug_root); out_super: ocfs2_release_system_inodes(osb); kfree(osb->recovery_map); ocfs2_delete_osb(osb); kfree(osb); out: mlog_errno(status); return status; } static int ocfs2_get_tree(struct fs_context *fc) { return get_tree_bdev(fc, ocfs2_fill_super); } static void ocfs2_free_fc(struct fs_context *fc) { kfree(fc->fs_private); } static const struct fs_context_operations ocfs2_context_ops = { .parse_param = ocfs2_parse_param, .get_tree = ocfs2_get_tree, .reconfigure = ocfs2_reconfigure, .free = ocfs2_free_fc, }; static int ocfs2_init_fs_context(struct fs_context *fc) { struct mount_options *mopt; mopt = kzalloc(sizeof(struct mount_options), GFP_KERNEL); if (!mopt) return -EINVAL; mopt->commit_interval = 0; mopt->mount_opt = OCFS2_MOUNT_NOINTR; mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM; mopt->slot = OCFS2_INVALID_SLOT; mopt->localalloc_opt = -1; mopt->cluster_stack[0] = '\0'; mopt->resv_level = OCFS2_DEFAULT_RESV_LEVEL; mopt->dir_resv_level = -1; fc->fs_private = mopt; fc->ops = &ocfs2_context_ops; return 0; } static struct file_system_type ocfs2_fs_type = { .owner = THIS_MODULE, .name = "ocfs2", .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE, .next = NULL, .init_fs_context = ocfs2_init_fs_context, .parameters = ocfs2_param_spec, }; MODULE_ALIAS_FS("ocfs2"); static int ocfs2_check_set_options(struct super_block *sb, struct mount_options *options) { if (options->user_stack == 0) { u32 tmp; /* Ensure only one heartbeat mode */ tmp = options->mount_opt & (OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL | OCFS2_MOUNT_HB_NONE); if (hweight32(tmp) != 1) { mlog(ML_ERROR, "Invalid heartbeat mount options\n"); return 0; } } if (options->mount_opt & OCFS2_MOUNT_USRQUOTA && !OCFS2_HAS_RO_COMPAT_FEATURE(sb, OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) { mlog(ML_ERROR, "User quotas were requested, but this " "filesystem does not have the feature enabled.\n"); return 0; } if (options->mount_opt & OCFS2_MOUNT_GRPQUOTA && !OCFS2_HAS_RO_COMPAT_FEATURE(sb, OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) { mlog(ML_ERROR, "Group quotas were requested, but this " "filesystem does not have the feature enabled.\n"); return 0; } if (options->mount_opt & OCFS2_MOUNT_POSIX_ACL && !OCFS2_HAS_INCOMPAT_FEATURE(sb, OCFS2_FEATURE_INCOMPAT_XATTR)) { mlog(ML_ERROR, "ACL support requested but extended attributes " "feature is not enabled\n"); return 0; } /* No ACL setting specified? Use XATTR feature... */ if (!(options->mount_opt & (OCFS2_MOUNT_POSIX_ACL | OCFS2_MOUNT_NO_POSIX_ACL))) { if (OCFS2_HAS_INCOMPAT_FEATURE(sb, OCFS2_FEATURE_INCOMPAT_XATTR)) options->mount_opt |= OCFS2_MOUNT_POSIX_ACL; else options->mount_opt |= OCFS2_MOUNT_NO_POSIX_ACL; } return 1; } static int ocfs2_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct fs_parse_result result; int opt; struct mount_options *mopt = fc->fs_private; bool is_remount = (fc->purpose & FS_CONTEXT_FOR_RECONFIGURE); trace_ocfs2_parse_options(is_remount, param->key); opt = fs_parse(fc, ocfs2_param_spec, param, &result); if (opt < 0) return opt; switch (opt) { case Opt_heartbeat: mopt->mount_opt |= result.uint_32; break; case Opt_barrier: if (result.uint_32) mopt->mount_opt |= OCFS2_MOUNT_BARRIER; else mopt->mount_opt &= ~OCFS2_MOUNT_BARRIER; break; case Opt_intr: if (result.negated) mopt->mount_opt |= OCFS2_MOUNT_NOINTR; else mopt->mount_opt &= ~OCFS2_MOUNT_NOINTR; break; case Opt_errors: mopt->mount_opt &= ~(OCFS2_MOUNT_ERRORS_CONT | OCFS2_MOUNT_ERRORS_ROFS | OCFS2_MOUNT_ERRORS_PANIC); mopt->mount_opt |= result.uint_32; break; case Opt_data: mopt->mount_opt &= ~OCFS2_MOUNT_DATA_WRITEBACK; mopt->mount_opt |= result.uint_32; break; case Opt_user_xattr: if (result.negated) mopt->mount_opt |= OCFS2_MOUNT_NOUSERXATTR; else mopt->mount_opt &= ~OCFS2_MOUNT_NOUSERXATTR; break; case Opt_atime_quantum: mopt->atime_quantum = result.uint_32; break; case Opt_slot: if (result.uint_32) mopt->slot = (u16)result.uint_32; break; case Opt_commit: if (result.uint_32 == 0) mopt->commit_interval = HZ * JBD2_DEFAULT_MAX_COMMIT_AGE; else mopt->commit_interval = HZ * result.uint_32; break; case Opt_localalloc: if (result.int_32 >= 0) mopt->localalloc_opt = result.int_32; break; case Opt_localflocks: /* * Changing this during remount could race flock() requests, or * "unbalance" existing ones (e.g., a lock is taken in one mode * but dropped in the other). If users care enough to flip * locking modes during remount, we could add a "local" flag to * individual flock structures for proper tracking of state. */ if (!is_remount) mopt->mount_opt |= OCFS2_MOUNT_LOCALFLOCKS; break; case Opt_stack: /* Check both that the option we were passed is of the right * length and that it is a proper string of the right length. */ if (strlen(param->string) != OCFS2_STACK_LABEL_LEN) { mlog(ML_ERROR, "Invalid cluster_stack option\n"); return -EINVAL; } memcpy(mopt->cluster_stack, param->string, OCFS2_STACK_LABEL_LEN); mopt->cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0'; /* * Open code the memcmp here as we don't have an osb to pass * to ocfs2_userspace_stack(). */ if (memcmp(mopt->cluster_stack, OCFS2_CLASSIC_CLUSTER_STACK, OCFS2_STACK_LABEL_LEN)) mopt->user_stack = 1; break; case Opt_inode64: mopt->mount_opt |= OCFS2_MOUNT_INODE64; break; case Opt_usrquota: mopt->mount_opt |= OCFS2_MOUNT_USRQUOTA; break; case Opt_grpquota: mopt->mount_opt |= OCFS2_MOUNT_GRPQUOTA; break; case Opt_coherency: mopt->mount_opt &= ~OCFS2_MOUNT_COHERENCY_BUFFERED; mopt->mount_opt |= result.uint_32; break; case Opt_acl: if (result.negated) { mopt->mount_opt |= OCFS2_MOUNT_NO_POSIX_ACL; mopt->mount_opt &= ~OCFS2_MOUNT_POSIX_ACL; } else { mopt->mount_opt |= OCFS2_MOUNT_POSIX_ACL; mopt->mount_opt &= ~OCFS2_MOUNT_NO_POSIX_ACL; } break; case Opt_resv_level: if (is_remount) break; if (result.uint_32 >= OCFS2_MIN_RESV_LEVEL && result.uint_32 < OCFS2_MAX_RESV_LEVEL) mopt->resv_level = result.uint_32; break; case Opt_dir_resv_level: if (is_remount) break; if (result.uint_32 >= OCFS2_MIN_RESV_LEVEL && result.uint_32 < OCFS2_MAX_RESV_LEVEL) mopt->dir_resv_level = result.uint_32; break; case Opt_journal_async_commit: mopt->mount_opt |= OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT; break; default: return -EINVAL; } return 0; } static int ocfs2_show_options(struct seq_file *s, struct dentry *root) { struct ocfs2_super *osb = OCFS2_SB(root->d_sb); unsigned long opts = osb->s_mount_opt; unsigned int local_alloc_megs; if (opts & (OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL)) { seq_printf(s, ",_netdev"); if (opts & OCFS2_MOUNT_HB_LOCAL) seq_printf(s, ",%s", OCFS2_HB_LOCAL); else seq_printf(s, ",%s", OCFS2_HB_GLOBAL); } else seq_printf(s, ",%s", OCFS2_HB_NONE); if (opts & OCFS2_MOUNT_NOINTR) seq_printf(s, ",nointr"); if (opts & OCFS2_MOUNT_DATA_WRITEBACK) seq_printf(s, ",data=writeback"); else seq_printf(s, ",data=ordered"); if (opts & OCFS2_MOUNT_BARRIER) seq_printf(s, ",barrier=1"); if (opts & OCFS2_MOUNT_ERRORS_PANIC) seq_printf(s, ",errors=panic"); else if (opts & OCFS2_MOUNT_ERRORS_CONT) seq_printf(s, ",errors=continue"); else seq_printf(s, ",errors=remount-ro"); if (osb->preferred_slot != OCFS2_INVALID_SLOT) seq_printf(s, ",preferred_slot=%d", osb->preferred_slot); seq_printf(s, ",atime_quantum=%u", osb->s_atime_quantum); if (osb->osb_commit_interval) seq_printf(s, ",commit=%u", (unsigned) (osb->osb_commit_interval / HZ)); local_alloc_megs = osb->local_alloc_bits >> (20 - osb->s_clustersize_bits); if (local_alloc_megs != ocfs2_la_default_mb(osb)) seq_printf(s, ",localalloc=%d", local_alloc_megs); if (opts & OCFS2_MOUNT_LOCALFLOCKS) seq_printf(s, ",localflocks,"); if (osb->osb_cluster_stack[0]) seq_show_option(s, "cluster_stack", osb->osb_cluster_stack); if (opts & OCFS2_MOUNT_USRQUOTA) seq_printf(s, ",usrquota"); if (opts & OCFS2_MOUNT_GRPQUOTA) seq_printf(s, ",grpquota"); if (opts & OCFS2_MOUNT_COHERENCY_BUFFERED) seq_printf(s, ",coherency=buffered"); else seq_printf(s, ",coherency=full"); if (opts & OCFS2_MOUNT_NOUSERXATTR) seq_printf(s, ",nouser_xattr"); else seq_printf(s, ",user_xattr"); if (opts & OCFS2_MOUNT_INODE64) seq_printf(s, ",inode64"); if (opts & OCFS2_MOUNT_POSIX_ACL) seq_printf(s, ",acl"); else seq_printf(s, ",noacl"); if (osb->osb_resv_level != OCFS2_DEFAULT_RESV_LEVEL) seq_printf(s, ",resv_level=%d", osb->osb_resv_level); if (osb->osb_dir_resv_level != osb->osb_resv_level) seq_printf(s, ",dir_resv_level=%d", osb->osb_resv_level); if (opts & OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT) seq_printf(s, ",journal_async_commit"); return 0; } static int __init ocfs2_init(void) { int status; status = init_ocfs2_uptodate_cache(); if (status < 0) goto out1; status = ocfs2_initialize_mem_caches(); if (status < 0) goto out2; ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL); ocfs2_set_locking_protocol(); register_quota_format(&ocfs2_quota_format); status = register_filesystem(&ocfs2_fs_type); if (!status) return 0; unregister_quota_format(&ocfs2_quota_format); debugfs_remove(ocfs2_debugfs_root); ocfs2_free_mem_caches(); out2: exit_ocfs2_uptodate_cache(); out1: mlog_errno(status); return status; } static void __exit ocfs2_exit(void) { unregister_quota_format(&ocfs2_quota_format); debugfs_remove(ocfs2_debugfs_root); ocfs2_free_mem_caches(); unregister_filesystem(&ocfs2_fs_type); exit_ocfs2_uptodate_cache(); } static void ocfs2_put_super(struct super_block *sb) { trace_ocfs2_put_super(sb); ocfs2_sync_blockdev(sb); ocfs2_dismount_volume(sb, 0); } static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf) { struct ocfs2_super *osb; u32 numbits, freebits; int status; struct ocfs2_dinode *bm_lock; struct buffer_head *bh = NULL; struct inode *inode = NULL; trace_ocfs2_statfs(dentry->d_sb, buf); osb = OCFS2_SB(dentry->d_sb); inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE, OCFS2_INVALID_SLOT); if (!inode) { mlog(ML_ERROR, "failed to get bitmap inode\n"); status = -EIO; goto bail; } status = ocfs2_inode_lock(inode, &bh, 0); if (status < 0) { mlog_errno(status); goto bail; } bm_lock = (struct ocfs2_dinode *) bh->b_data; numbits = le32_to_cpu(bm_lock->id1.bitmap1.i_total); freebits = numbits - le32_to_cpu(bm_lock->id1.bitmap1.i_used); buf->f_type = OCFS2_SUPER_MAGIC; buf->f_bsize = dentry->d_sb->s_blocksize; buf->f_namelen = OCFS2_MAX_FILENAME_LEN; buf->f_blocks = ((sector_t) numbits) * (osb->s_clustersize >> osb->sb->s_blocksize_bits); buf->f_bfree = ((sector_t) freebits) * (osb->s_clustersize >> osb->sb->s_blocksize_bits); buf->f_bavail = buf->f_bfree; buf->f_files = numbits; buf->f_ffree = freebits; buf->f_fsid.val[0] = crc32_le(0, osb->uuid_str, OCFS2_VOL_UUID_LEN) & 0xFFFFFFFFUL; buf->f_fsid.val[1] = crc32_le(0, osb->uuid_str + OCFS2_VOL_UUID_LEN, OCFS2_VOL_UUID_LEN) & 0xFFFFFFFFUL; brelse(bh); ocfs2_inode_unlock(inode, 0); status = 0; bail: iput(inode); if (status) mlog_errno(status); return status; } static void ocfs2_inode_init_once(void *data) { struct ocfs2_inode_info *oi = data; oi->ip_flags = 0; oi->ip_open_count = 0; spin_lock_init(&oi->ip_lock); ocfs2_extent_map_init(&oi->vfs_inode); INIT_LIST_HEAD(&oi->ip_io_markers); INIT_LIST_HEAD(&oi->ip_unwritten_list); oi->ip_dir_start_lookup = 0; init_rwsem(&oi->ip_alloc_sem); init_rwsem(&oi->ip_xattr_sem); mutex_init(&oi->ip_io_mutex); oi->ip_blkno = 0ULL; oi->ip_clusters = 0; oi->ip_next_orphan = NULL; ocfs2_resv_init_once(&oi->ip_la_data_resv); ocfs2_lock_res_init_once(&oi->ip_rw_lockres); ocfs2_lock_res_init_once(&oi->ip_inode_lockres); ocfs2_lock_res_init_once(&oi->ip_open_lockres); ocfs2_metadata_cache_init(INODE_CACHE(&oi->vfs_inode), &ocfs2_inode_caching_ops); inode_init_once(&oi->vfs_inode); } static int ocfs2_initialize_mem_caches(void) { ocfs2_inode_cachep = kmem_cache_create("ocfs2_inode_cache", sizeof(struct ocfs2_inode_info), 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| SLAB_ACCOUNT), ocfs2_inode_init_once); ocfs2_dquot_cachep = kmem_cache_create("ocfs2_dquot_cache", sizeof(struct ocfs2_dquot), 0, SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT, NULL); ocfs2_qf_chunk_cachep = kmem_cache_create("ocfs2_qf_chunk_cache", sizeof(struct ocfs2_quota_chunk), 0, SLAB_RECLAIM_ACCOUNT, NULL); if (!ocfs2_inode_cachep || !ocfs2_dquot_cachep || !ocfs2_qf_chunk_cachep) { kmem_cache_destroy(ocfs2_inode_cachep); kmem_cache_destroy(ocfs2_dquot_cachep); kmem_cache_destroy(ocfs2_qf_chunk_cachep); return -ENOMEM; } return 0; } static void ocfs2_free_mem_caches(void) { /* * Make sure all delayed rcu free inodes are flushed before we * destroy cache. */ rcu_barrier(); kmem_cache_destroy(ocfs2_inode_cachep); ocfs2_inode_cachep = NULL; kmem_cache_destroy(ocfs2_dquot_cachep); ocfs2_dquot_cachep = NULL; kmem_cache_destroy(ocfs2_qf_chunk_cachep); ocfs2_qf_chunk_cachep = NULL; } static int ocfs2_get_sector(struct super_block *sb, struct buffer_head **bh, int block, int sect_size) { if (!sb_set_blocksize(sb, sect_size)) { mlog(ML_ERROR, "unable to set blocksize\n"); return -EIO; } *bh = sb_getblk(sb, block); if (!*bh) { mlog_errno(-ENOMEM); return -ENOMEM; } lock_buffer(*bh); if (!buffer_dirty(*bh)) clear_buffer_uptodate(*bh); unlock_buffer(*bh); if (bh_read(*bh, 0) < 0) { mlog_errno(-EIO); brelse(*bh); *bh = NULL; return -EIO; } return 0; } static int ocfs2_mount_volume(struct super_block *sb) { int status = 0; struct ocfs2_super *osb = OCFS2_SB(sb); if (ocfs2_is_hard_readonly(osb)) goto out; mutex_init(&osb->obs_trim_fs_mutex); status = ocfs2_dlm_init(osb); if (status < 0) { mlog_errno(status); if (status == -EBADR && ocfs2_userspace_stack(osb)) mlog(ML_ERROR, "couldn't mount because cluster name on" " disk does not match the running cluster name.\n"); goto out; } status = ocfs2_super_lock(osb, 1); if (status < 0) { mlog_errno(status); goto out_dlm; } /* This will load up the node map and add ourselves to it. */ status = ocfs2_find_slot(osb); if (status < 0) { mlog_errno(status); goto out_super_lock; } /* load all node-local system inodes */ status = ocfs2_init_local_system_inodes(osb); if (status < 0) { mlog_errno(status); goto out_super_lock; } status = ocfs2_check_volume(osb); if (status < 0) { mlog_errno(status); goto out_system_inodes; } status = ocfs2_truncate_log_init(osb); if (status < 0) { mlog_errno(status); goto out_check_volume; } ocfs2_super_unlock(osb, 1); return 0; out_check_volume: ocfs2_free_replay_slots(osb); out_system_inodes: if (osb->local_alloc_state == OCFS2_LA_ENABLED) ocfs2_shutdown_local_alloc(osb); ocfs2_release_system_inodes(osb); /* before journal shutdown, we should release slot_info */ ocfs2_free_slot_info(osb); ocfs2_journal_shutdown(osb); out_super_lock: ocfs2_super_unlock(osb, 1); out_dlm: ocfs2_dlm_shutdown(osb, 0); out: return status; } static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) { int tmp, hangup_needed = 0; struct ocfs2_super *osb = NULL; char nodestr[12]; trace_ocfs2_dismount_volume(sb); BUG_ON(!sb); osb = OCFS2_SB(sb); BUG_ON(!osb); /* Remove file check sysfs related directories/files, * and wait for the pending file check operations */ ocfs2_filecheck_remove_sysfs(osb); kset_unregister(osb->osb_dev_kset); /* Orphan scan should be stopped as early as possible */ ocfs2_orphan_scan_stop(osb); /* Stop quota recovery so that we can disable quotas */ ocfs2_recovery_disable_quota(osb); ocfs2_disable_quotas(osb); /* All dquots should be freed by now */ WARN_ON(!llist_empty(&osb->dquot_drop_list)); /* Wait for worker to be done with the work structure in osb */ cancel_work_sync(&osb->dquot_drop_work); ocfs2_shutdown_local_alloc(osb); ocfs2_truncate_log_shutdown(osb); /* This will disable recovery and flush any recovery work. */ ocfs2_recovery_exit(osb); ocfs2_sync_blockdev(sb); ocfs2_purge_refcount_trees(osb); /* No cluster connection means we've failed during mount, so skip * all the steps which depended on that to complete. */ if (osb->cconn) { tmp = ocfs2_super_lock(osb, 1); if (tmp < 0) { mlog_errno(tmp); return; } } if (osb->slot_num != OCFS2_INVALID_SLOT) ocfs2_put_slot(osb); if (osb->cconn) ocfs2_super_unlock(osb, 1); ocfs2_release_system_inodes(osb); ocfs2_journal_shutdown(osb); /* * If we're dismounting due to mount error, mount.ocfs2 will clean * up heartbeat. If we're a local mount, there is no heartbeat. * If we failed before we got a uuid_str yet, we can't stop * heartbeat. Otherwise, do it. */ if (!mnt_err && !ocfs2_mount_local(osb) && osb->uuid_str && !ocfs2_is_hard_readonly(osb)) hangup_needed = 1; ocfs2_dlm_shutdown(osb, hangup_needed); ocfs2_blockcheck_stats_debugfs_remove(&osb->osb_ecc_stats); debugfs_remove_recursive(osb->osb_debug_root); if (hangup_needed) ocfs2_cluster_hangup(osb->uuid_str, strlen(osb->uuid_str)); atomic_set(&osb->vol_state, VOLUME_DISMOUNTED); if (ocfs2_mount_local(osb)) snprintf(nodestr, sizeof(nodestr), "local"); else snprintf(nodestr, sizeof(nodestr), "%u", osb->node_num); printk(KERN_INFO "ocfs2: Unmounting device (%s) on (node %s)\n", osb->dev_str, nodestr); ocfs2_delete_osb(osb); kfree(osb); sb->s_dev = 0; sb->s_fs_info = NULL; } static int ocfs2_setup_osb_uuid(struct ocfs2_super *osb, const unsigned char *uuid, unsigned uuid_bytes) { int i, ret; char *ptr; BUG_ON(uuid_bytes != OCFS2_VOL_UUID_LEN); osb->uuid_str = kzalloc(OCFS2_VOL_UUID_LEN * 2 + 1, GFP_KERNEL); if (osb->uuid_str == NULL) return -ENOMEM; for (i = 0, ptr = osb->uuid_str; i < OCFS2_VOL_UUID_LEN; i++) { /* print with null */ ret = snprintf(ptr, 3, "%02X", uuid[i]); if (ret != 2) /* drop super cleans up */ return -EINVAL; /* then only advance past the last char */ ptr += 2; } return 0; } /* Make sure entire volume is addressable by our journal. Requires osb_clusters_at_boot to be valid and for the journal to have been initialized by ocfs2_journal_init(). */ static int ocfs2_journal_addressable(struct ocfs2_super *osb) { int status = 0; u64 max_block = ocfs2_clusters_to_blocks(osb->sb, osb->osb_clusters_at_boot) - 1; /* 32-bit block number is always OK. */ if (max_block <= (u32)~0ULL) goto out; /* Volume is "huge", so see if our journal is new enough to support it. */ if (!(OCFS2_HAS_COMPAT_FEATURE(osb->sb, OCFS2_FEATURE_COMPAT_JBD2_SB) && jbd2_journal_check_used_features(osb->journal->j_journal, 0, 0, JBD2_FEATURE_INCOMPAT_64BIT))) { mlog(ML_ERROR, "The journal cannot address the entire volume. " "Enable the 'block64' journal option with tunefs.ocfs2"); status = -EFBIG; goto out; } out: return status; } static int ocfs2_initialize_super(struct super_block *sb, struct buffer_head *bh, int sector_size, struct ocfs2_blockcheck_stats *stats) { int status; int i, cbits, bbits; struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; struct inode *inode = NULL; struct ocfs2_super *osb; u64 total_blocks; osb = kzalloc(sizeof(struct ocfs2_super), GFP_KERNEL); if (!osb) { status = -ENOMEM; mlog_errno(status); goto out; } sb->s_fs_info = osb; sb->s_op = &ocfs2_sops; set_default_d_op(sb, &ocfs2_dentry_ops); sb->s_export_op = &ocfs2_export_ops; sb->s_qcop = &dquot_quotactl_sysfile_ops; sb->dq_op = &ocfs2_quota_operations; sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; sb->s_xattr = ocfs2_xattr_handlers; sb->s_time_gran = 1; sb->s_flags |= SB_NOATIME; /* this is needed to support O_LARGEFILE */ cbits = le32_to_cpu(di->id2.i_super.s_clustersize_bits); bbits = le32_to_cpu(di->id2.i_super.s_blocksize_bits); sb->s_maxbytes = ocfs2_max_file_offset(bbits, cbits); super_set_uuid(sb, di->id2.i_super.s_uuid, sizeof(di->id2.i_super.s_uuid)); osb->osb_dx_mask = (1 << (cbits - bbits)) - 1; for (i = 0; i < 3; i++) osb->osb_dx_seed[i] = le32_to_cpu(di->id2.i_super.s_dx_seed[i]); osb->osb_dx_seed[3] = le32_to_cpu(di->id2.i_super.s_uuid_hash); osb->sb = sb; osb->s_sectsize_bits = blksize_bits(sector_size); BUG_ON(!osb->s_sectsize_bits); spin_lock_init(&osb->dc_task_lock); init_waitqueue_head(&osb->dc_event); osb->dc_work_sequence = 0; osb->dc_wake_sequence = 0; INIT_LIST_HEAD(&osb->blocked_lock_list); osb->blocked_lock_count = 0; spin_lock_init(&osb->osb_lock); spin_lock_init(&osb->osb_xattr_lock); ocfs2_init_steal_slots(osb); mutex_init(&osb->system_file_mutex); atomic_set(&osb->alloc_stats.moves, 0); atomic_set(&osb->alloc_stats.local_data, 0); atomic_set(&osb->alloc_stats.bitmap_data, 0); atomic_set(&osb->alloc_stats.bg_allocs, 0); atomic_set(&osb->alloc_stats.bg_extends, 0); /* Copy the blockcheck stats from the superblock probe */ osb->osb_ecc_stats = *stats; ocfs2_init_node_maps(osb); snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u", MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); osb->max_slots = le16_to_cpu(di->id2.i_super.s_max_slots); if (osb->max_slots > OCFS2_MAX_SLOTS || osb->max_slots == 0) { mlog(ML_ERROR, "Invalid number of node slots (%u)\n", osb->max_slots); status = -EINVAL; goto out; } ocfs2_orphan_scan_init(osb); status = ocfs2_recovery_init(osb); if (status) { mlog(ML_ERROR, "Unable to initialize recovery state\n"); mlog_errno(status); goto out; } init_waitqueue_head(&osb->checkpoint_event); osb->s_atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM; osb->slot_num = OCFS2_INVALID_SLOT; osb->s_xattr_inline_size = le16_to_cpu( di->id2.i_super.s_xattr_inline_size); osb->local_alloc_state = OCFS2_LA_UNUSED; osb->local_alloc_bh = NULL; INIT_DELAYED_WORK(&osb->la_enable_wq, ocfs2_la_enable_worker); init_waitqueue_head(&osb->osb_mount_event); ocfs2_resmap_init(osb, &osb->osb_la_resmap); osb->vol_label = kmalloc(OCFS2_MAX_VOL_LABEL_LEN, GFP_KERNEL); if (!osb->vol_label) { mlog(ML_ERROR, "unable to alloc vol label\n"); status = -ENOMEM; goto out_recovery_map; } osb->slot_recovery_generations = kcalloc(osb->max_slots, sizeof(*osb->slot_recovery_generations), GFP_KERNEL); if (!osb->slot_recovery_generations) { status = -ENOMEM; mlog_errno(status); goto out_vol_label; } init_waitqueue_head(&osb->osb_wipe_event); osb->osb_orphan_wipes = kcalloc(osb->max_slots, sizeof(*osb->osb_orphan_wipes), GFP_KERNEL); if (!osb->osb_orphan_wipes) { status = -ENOMEM; mlog_errno(status); goto out_slot_recovery_gen; } osb->osb_rf_lock_tree = RB_ROOT; osb->s_feature_compat = le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_compat); osb->s_feature_ro_compat = le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_ro_compat); osb->s_feature_incompat = le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_incompat); if ((i = OCFS2_HAS_INCOMPAT_FEATURE(osb->sb, ~OCFS2_FEATURE_INCOMPAT_SUPP))) { mlog(ML_ERROR, "couldn't mount because of unsupported " "optional features (%x).\n", i); status = -EINVAL; goto out_orphan_wipes; } if (!sb_rdonly(osb->sb) && (i = OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, ~OCFS2_FEATURE_RO_COMPAT_SUPP))) { mlog(ML_ERROR, "couldn't mount RDWR because of " "unsupported optional features (%x).\n", i); status = -EINVAL; goto out_orphan_wipes; } if (ocfs2_clusterinfo_valid(osb)) { /* * ci_stack and ci_cluster in ocfs2_cluster_info may not be null * terminated, so make sure no overflow happens here by using * memcpy. Destination strings will always be null terminated * because osb is allocated using kzalloc. */ osb->osb_stackflags = OCFS2_RAW_SB(di)->s_cluster_info.ci_stackflags; memcpy(osb->osb_cluster_stack, OCFS2_RAW_SB(di)->s_cluster_info.ci_stack, OCFS2_STACK_LABEL_LEN); if (strlen(osb->osb_cluster_stack) != OCFS2_STACK_LABEL_LEN) { mlog(ML_ERROR, "couldn't mount because of an invalid " "cluster stack label (%s) \n", osb->osb_cluster_stack); status = -EINVAL; goto out_orphan_wipes; } memcpy(osb->osb_cluster_name, OCFS2_RAW_SB(di)->s_cluster_info.ci_cluster, OCFS2_CLUSTER_NAME_LEN); } else { /* The empty string is identical with classic tools that * don't know about s_cluster_info. */ osb->osb_cluster_stack[0] = '\0'; } get_random_bytes(&osb->s_next_generation, sizeof(u32)); /* * FIXME * This should be done in ocfs2_journal_init(), but any inode * writes back operation will cause the filesystem to crash. */ status = ocfs2_journal_alloc(osb); if (status < 0) goto out_orphan_wipes; INIT_WORK(&osb->dquot_drop_work, ocfs2_drop_dquot_refs); init_llist_head(&osb->dquot_drop_list); /* get some pseudo constants for clustersize bits */ osb->s_clustersize_bits = le32_to_cpu(di->id2.i_super.s_clustersize_bits); osb->s_clustersize = 1 << osb->s_clustersize_bits; if (osb->s_clustersize < OCFS2_MIN_CLUSTERSIZE || osb->s_clustersize > OCFS2_MAX_CLUSTERSIZE) { mlog(ML_ERROR, "Volume has invalid cluster size (%d)\n", osb->s_clustersize); status = -EINVAL; goto out_journal; } total_blocks = ocfs2_clusters_to_blocks(osb->sb, le32_to_cpu(di->i_clusters)); status = generic_check_addressable(osb->sb->s_blocksize_bits, total_blocks); if (status) { mlog(ML_ERROR, "Volume too large " "to mount safely on this system"); status = -EFBIG; goto out_journal; } if (ocfs2_setup_osb_uuid(osb, di->id2.i_super.s_uuid, sizeof(di->id2.i_super.s_uuid))) { mlog(ML_ERROR, "Out of memory trying to setup our uuid.\n"); status = -ENOMEM; goto out_journal; } strscpy(osb->vol_label, di->id2.i_super.s_label, OCFS2_MAX_VOL_LABEL_LEN); osb->root_blkno = le64_to_cpu(di->id2.i_super.s_root_blkno); osb->system_dir_blkno = le64_to_cpu(di->id2.i_super.s_system_dir_blkno); osb->first_cluster_group_blkno = le64_to_cpu(di->id2.i_super.s_first_cluster_group); osb->fs_generation = le32_to_cpu(di->i_fs_generation); osb->uuid_hash = le32_to_cpu(di->id2.i_super.s_uuid_hash); trace_ocfs2_initialize_super(osb->vol_label, osb->uuid_str, (unsigned long long)osb->root_blkno, (unsigned long long)osb->system_dir_blkno, osb->s_clustersize_bits); osb->osb_dlm_debug = ocfs2_new_dlm_debug(); if (!osb->osb_dlm_debug) { status = -ENOMEM; mlog_errno(status); goto out_uuid_str; } atomic_set(&osb->vol_state, VOLUME_INIT); /* load root, system_dir, and all global system inodes */ status = ocfs2_init_global_system_inodes(osb); if (status < 0) { mlog_errno(status); goto out_dlm_out; } /* * global bitmap */ inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE, OCFS2_INVALID_SLOT); if (!inode) { status = -EINVAL; mlog_errno(status); goto out_system_inodes; } osb->bitmap_blkno = OCFS2_I(inode)->ip_blkno; osb->osb_clusters_at_boot = OCFS2_I(inode)->ip_clusters; iput(inode); osb->bitmap_cpg = ocfs2_group_bitmap_size(sb, 0, osb->s_feature_incompat) * 8; status = ocfs2_init_slot_info(osb); if (status < 0) { mlog_errno(status); goto out_system_inodes; } osb->ocfs2_wq = alloc_ordered_workqueue("ocfs2_wq", WQ_MEM_RECLAIM); if (!osb->ocfs2_wq) { status = -ENOMEM; mlog_errno(status); goto out_slot_info; } return status; out_slot_info: ocfs2_free_slot_info(osb); out_system_inodes: ocfs2_release_system_inodes(osb); out_dlm_out: ocfs2_put_dlm_debug(osb->osb_dlm_debug); out_uuid_str: kfree(osb->uuid_str); out_journal: kfree(osb->journal); out_orphan_wipes: kfree(osb->osb_orphan_wipes); out_slot_recovery_gen: kfree(osb->slot_recovery_generations); out_vol_label: kfree(osb->vol_label); out_recovery_map: kfree(osb->recovery_map); out: kfree(osb); sb->s_fs_info = NULL; return status; } /* * will return: -EAGAIN if it is ok to keep searching for superblocks * -EINVAL if there is a bad superblock * 0 on success */ static int ocfs2_verify_volume(struct ocfs2_dinode *di, struct buffer_head *bh, u32 blksz, struct ocfs2_blockcheck_stats *stats) { int status = -EAGAIN; u32 blksz_bits; if (memcmp(di->i_signature, OCFS2_SUPER_BLOCK_SIGNATURE, strlen(OCFS2_SUPER_BLOCK_SIGNATURE)) == 0) { /* We have to do a raw check of the feature here */ if (le32_to_cpu(di->id2.i_super.s_feature_incompat) & OCFS2_FEATURE_INCOMPAT_META_ECC) { status = ocfs2_block_check_validate(bh->b_data, bh->b_size, &di->i_check, stats); if (status) goto out; } status = -EINVAL; /* Acceptable block sizes are 512 bytes, 1K, 2K and 4K. */ blksz_bits = le32_to_cpu(di->id2.i_super.s_blocksize_bits); if (blksz_bits < 9 || blksz_bits > 12) { mlog(ML_ERROR, "found superblock with incorrect block " "size bits: found %u, should be 9, 10, 11, or 12\n", blksz_bits); } else if ((1 << blksz_bits) != blksz) { mlog(ML_ERROR, "found superblock with incorrect block " "size: found %u, should be %u\n", 1 << blksz_bits, blksz); } else if (le16_to_cpu(di->id2.i_super.s_major_rev_level) != OCFS2_MAJOR_REV_LEVEL || le16_to_cpu(di->id2.i_super.s_minor_rev_level) != OCFS2_MINOR_REV_LEVEL) { mlog(ML_ERROR, "found superblock with bad version: " "found %u.%u, should be %u.%u\n", le16_to_cpu(di->id2.i_super.s_major_rev_level), le16_to_cpu(di->id2.i_super.s_minor_rev_level), OCFS2_MAJOR_REV_LEVEL, OCFS2_MINOR_REV_LEVEL); } else if (bh->b_blocknr != le64_to_cpu(di->i_blkno)) { mlog(ML_ERROR, "bad block number on superblock: " "found %llu, should be %llu\n", (unsigned long long)le64_to_cpu(di->i_blkno), (unsigned long long)bh->b_blocknr); } else if (le32_to_cpu(di->id2.i_super.s_clustersize_bits) < 12 || le32_to_cpu(di->id2.i_super.s_clustersize_bits) > 20) { mlog(ML_ERROR, "bad cluster size bit found: %u\n", le32_to_cpu(di->id2.i_super.s_clustersize_bits)); } else if (!le64_to_cpu(di->id2.i_super.s_root_blkno)) { mlog(ML_ERROR, "bad root_blkno: 0\n"); } else if (!le64_to_cpu(di->id2.i_super.s_system_dir_blkno)) { mlog(ML_ERROR, "bad system_dir_blkno: 0\n"); } else if (le16_to_cpu(di->id2.i_super.s_max_slots) > OCFS2_MAX_SLOTS) { mlog(ML_ERROR, "Superblock slots found greater than file system " "maximum: found %u, max %u\n", le16_to_cpu(di->id2.i_super.s_max_slots), OCFS2_MAX_SLOTS); } else { /* found it! */ status = 0; } } out: if (status && status != -EAGAIN) mlog_errno(status); return status; } static int ocfs2_check_volume(struct ocfs2_super *osb) { int status; int dirty; int local; struct ocfs2_dinode *local_alloc = NULL; /* only used if we * recover * ourselves. */ /* Init our journal object. */ status = ocfs2_journal_init(osb, &dirty); if (status < 0) { mlog(ML_ERROR, "Could not initialize journal!\n"); goto finally; } /* Now that journal has been initialized, check to make sure entire volume is addressable. */ status = ocfs2_journal_addressable(osb); if (status) goto finally; /* If the journal was unmounted cleanly then we don't want to * recover anything. Otherwise, journal_load will do that * dirty work for us :) */ if (!dirty) { status = ocfs2_journal_wipe(osb->journal, 0); if (status < 0) { mlog_errno(status); goto finally; } } else { printk(KERN_NOTICE "ocfs2: File system on device (%s) was not " "unmounted cleanly, recovering it.\n", osb->dev_str); } local = ocfs2_mount_local(osb); /* will play back anything left in the journal. */ status = ocfs2_journal_load(osb->journal, local, dirty); if (status < 0) { mlog(ML_ERROR, "ocfs2 journal load failed! %d\n", status); goto finally; } if (osb->s_mount_opt & OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT) jbd2_journal_set_features(osb->journal->j_journal, JBD2_FEATURE_COMPAT_CHECKSUM, 0, JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); else jbd2_journal_clear_features(osb->journal->j_journal, JBD2_FEATURE_COMPAT_CHECKSUM, 0, JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); if (dirty) { /* recover my local alloc if we didn't unmount cleanly. */ status = ocfs2_begin_local_alloc_recovery(osb, osb->slot_num, &local_alloc); if (status < 0) { mlog_errno(status); goto finally; } /* we complete the recovery process after we've marked * ourselves as mounted. */ } status = ocfs2_load_local_alloc(osb); if (status < 0) { mlog_errno(status); goto finally; } if (dirty) { /* Recovery will be completed after we've mounted the * rest of the volume. */ osb->local_alloc_copy = local_alloc; local_alloc = NULL; } /* go through each journal, trylock it and if you get the * lock, and it's marked as dirty, set the bit in the recover * map and launch a recovery thread for it. */ status = ocfs2_mark_dead_nodes(osb); if (status < 0) { mlog_errno(status); goto finally; } status = ocfs2_compute_replay_slots(osb); if (status < 0) mlog_errno(status); finally: kfree(local_alloc); if (status) mlog_errno(status); return status; } /* * The routine gets called from dismount or close whenever a dismount on * volume is requested and the osb open count becomes 1. * It will remove the osb from the global list and also free up all the * initialized resources and fileobject. */ static void ocfs2_delete_osb(struct ocfs2_super *osb) { /* This function assumes that the caller has the main osb resource */ /* ocfs2_initializer_super have already created this workqueue */ if (osb->ocfs2_wq) destroy_workqueue(osb->ocfs2_wq); ocfs2_free_slot_info(osb); kfree(osb->osb_orphan_wipes); kfree(osb->slot_recovery_generations); /* FIXME * This belongs in journal shutdown, but because we have to * allocate osb->journal at the middle of ocfs2_initialize_super(), * we free it here. */ kfree(osb->journal); kfree(osb->local_alloc_copy); kfree(osb->uuid_str); kfree(osb->vol_label); ocfs2_put_dlm_debug(osb->osb_dlm_debug); memset(osb, 0, sizeof(struct ocfs2_super)); } /* Depending on the mount option passed, perform one of the following: * Put OCFS2 into a readonly state (default) * Return EIO so that only the process errs * Fix the error as if fsck.ocfs2 -y * panic */ static int ocfs2_handle_error(struct super_block *sb) { struct ocfs2_super *osb = OCFS2_SB(sb); int rv = 0; ocfs2_set_osb_flag(osb, OCFS2_OSB_ERROR_FS); pr_crit("On-disk corruption discovered. " "Please run fsck.ocfs2 once the filesystem is unmounted.\n"); if (osb->s_mount_opt & OCFS2_MOUNT_ERRORS_PANIC) { panic("OCFS2: (device %s): panic forced after error\n", sb->s_id); } else if (osb->s_mount_opt & OCFS2_MOUNT_ERRORS_CONT) { pr_crit("OCFS2: Returning error to the calling process.\n"); rv = -EIO; } else { /* default option */ rv = -EROFS; if (sb_rdonly(sb) && ocfs2_emergency_state(osb)) return rv; pr_crit("OCFS2: File system is now read-only.\n"); sb->s_flags |= SB_RDONLY; ocfs2_set_ro_flag(osb, 0); } return rv; } int __ocfs2_error(struct super_block *sb, const char *function, const char *fmt, ...) { struct va_format vaf; va_list args; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; /* Not using mlog here because we want to show the actual * function the error came from. */ printk(KERN_CRIT "OCFS2: ERROR (device %s): %s: %pV", sb->s_id, function, &vaf); va_end(args); return ocfs2_handle_error(sb); } /* Handle critical errors. This is intentionally more drastic than * ocfs2_handle_error, so we only use for things like journal errors, * etc. */ void __ocfs2_abort(struct super_block *sb, const char *function, const char *fmt, ...) { struct va_format vaf; va_list args; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; printk(KERN_CRIT "OCFS2: abort (device %s): %s: %pV", sb->s_id, function, &vaf); va_end(args); /* We don't have the cluster support yet to go straight to * hard readonly in here. Until then, we want to keep * ocfs2_abort() so that we can at least mark critical * errors. * * TODO: This should abort the journal and alert other nodes * that our slot needs recovery. */ /* Force a panic(). This stinks, but it's better than letting * things continue without having a proper hard readonly * here. */ if (!ocfs2_mount_local(OCFS2_SB(sb))) OCFS2_SB(sb)->s_mount_opt |= OCFS2_MOUNT_ERRORS_PANIC; ocfs2_handle_error(sb); } /* * Void signal blockers, because in-kernel sigprocmask() only fails * when SIG_* is wrong. */ void ocfs2_block_signals(sigset_t *oldset) { int rc; sigset_t blocked; sigfillset(&blocked); rc = sigprocmask(SIG_BLOCK, &blocked, oldset); BUG_ON(rc); } void ocfs2_unblock_signals(sigset_t *oldset) { int rc = sigprocmask(SIG_SETMASK, oldset, NULL); BUG_ON(rc); } module_init(ocfs2_init); module_exit(ocfs2_exit);
1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 // SPDX-License-Identifier: GPL-2.0-only /* * sys.c * * OCFS2 cluster sysfs interface * * Copyright (C) 2005 Oracle. All rights reserved. */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/kobject.h> #include <linux/sysfs.h> #include <linux/fs.h> #include "ocfs2_nodemanager.h" #include "masklog.h" #include "sys.h" static ssize_t version_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return snprintf(buf, PAGE_SIZE, "%u\n", O2NM_API_VERSION); } static struct kobj_attribute attr_version = __ATTR(interface_revision, S_IRUGO, version_show, NULL); static struct attribute *o2cb_attrs[] = { &attr_version.attr, NULL, }; static struct attribute_group o2cb_attr_group = { .attrs = o2cb_attrs, }; static struct kset *o2cb_kset; void o2cb_sys_shutdown(void) { mlog_sys_shutdown(); kset_unregister(o2cb_kset); } int o2cb_sys_init(void) { int ret; o2cb_kset = kset_create_and_add("o2cb", NULL, fs_kobj); if (!o2cb_kset) return -ENOMEM; ret = sysfs_create_group(&o2cb_kset->kobj, &o2cb_attr_group); if (ret) goto error; ret = mlog_sys_init(o2cb_kset); if (ret) goto error; return 0; error: kset_unregister(o2cb_kset); return ret; }
30 30 5327 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SOCKET_H #define _LINUX_SOCKET_H #include <asm/socket.h> /* arch-dependent defines */ #include <linux/sockios.h> /* the SIOCxxx I/O controls */ #include <linux/uio.h> /* iovec support */ #include <linux/types.h> /* pid_t */ #include <linux/compiler.h> /* __user */ #include <uapi/linux/socket.h> struct file; struct pid; struct cred; struct socket; struct sock; struct sk_buff; struct proto_accept_arg; #define __sockaddr_check_size(size) \ BUILD_BUG_ON(((size) > sizeof(struct __kernel_sockaddr_storage))) #ifdef CONFIG_PROC_FS struct seq_file; extern void socket_seq_show(struct seq_file *seq); #endif typedef __kernel_sa_family_t sa_family_t; /* * 1003.1g requires sa_family_t and that sa_data is char. */ /* Deprecated for in-kernel use. Use struct sockaddr_unsized instead. */ struct sockaddr { sa_family_t sa_family; /* address family, AF_xxx */ char sa_data[14]; /* 14 bytes of protocol address */ }; /** * struct sockaddr_unsized - Unspecified size sockaddr for callbacks * @sa_family: Address family (AF_UNIX, AF_INET, AF_INET6, etc.) * @sa_data: Flexible array for address data * * This structure is designed for callback interfaces where the * total size is known via the sockaddr_len parameter. Unlike struct * sockaddr which has a fixed 14-byte sa_data limit or struct * sockaddr_storage which has a fixed 128-byte sa_data limit, this * structure can accommodate addresses of any size, but must be used * carefully. */ struct sockaddr_unsized { __kernel_sa_family_t sa_family; /* address family, AF_xxx */ char sa_data[]; /* flexible address data */ }; struct linger { int l_onoff; /* Linger active */ int l_linger; /* How long to linger for */ }; #define sockaddr_storage __kernel_sockaddr_storage /* * As we do 4.4BSD message passing we use a 4.4BSD message passing * system, not 4.3. Thus msg_accrights(len) are now missing. They * belong in an obscure libc emulation or the bin. */ struct msghdr { void *msg_name; /* ptr to socket address structure */ int msg_namelen; /* size of socket address structure */ int msg_inq; /* output, data left in socket */ struct iov_iter msg_iter; /* data */ /* * Ancillary data. msg_control_user is the user buffer used for the * recv* side when msg_control_is_user is set, msg_control is the kernel * buffer used for all other cases. */ union { void *msg_control; void __user *msg_control_user; }; bool msg_control_is_user : 1; bool msg_get_inq : 1;/* return INQ after receive */ unsigned int msg_flags; /* flags on received message */ __kernel_size_t msg_controllen; /* ancillary data buffer length */ struct kiocb *msg_iocb; /* ptr to iocb for async requests */ struct ubuf_info *msg_ubuf; int (*sg_from_iter)(struct sk_buff *skb, struct iov_iter *from, size_t length); }; struct user_msghdr { void __user *msg_name; /* ptr to socket address structure */ int msg_namelen; /* size of socket address structure */ struct iovec __user *msg_iov; /* scatter/gather array */ __kernel_size_t msg_iovlen; /* # elements in msg_iov */ void __user *msg_control; /* ancillary data */ __kernel_size_t msg_controllen; /* ancillary data buffer length */ unsigned int msg_flags; /* flags on received message */ }; /* For recvmmsg/sendmmsg */ struct mmsghdr { struct user_msghdr msg_hdr; unsigned int msg_len; }; /* * POSIX 1003.1g - ancillary data object information * Ancillary data consists of a sequence of pairs of * (cmsghdr, cmsg_data[]) */ struct cmsghdr { __kernel_size_t cmsg_len; /* data byte count, including hdr */ int cmsg_level; /* originating protocol */ int cmsg_type; /* protocol-specific type */ }; /* * Ancillary data object information MACROS * Table 5-14 of POSIX 1003.1g */ #define __CMSG_NXTHDR(ctl, len, cmsg) __cmsg_nxthdr((ctl),(len),(cmsg)) #define CMSG_NXTHDR(mhdr, cmsg) cmsg_nxthdr((mhdr), (cmsg)) #define CMSG_ALIGN(len) ( ((len)+sizeof(long)-1) & ~(sizeof(long)-1) ) #define CMSG_DATA(cmsg) \ ((void *)(cmsg) + sizeof(struct cmsghdr)) #define CMSG_USER_DATA(cmsg) \ ((void __user *)(cmsg) + sizeof(struct cmsghdr)) #define CMSG_SPACE(len) (sizeof(struct cmsghdr) + CMSG_ALIGN(len)) #define CMSG_LEN(len) (sizeof(struct cmsghdr) + (len)) #define __CMSG_FIRSTHDR(ctl,len) ((len) >= sizeof(struct cmsghdr) ? \ (struct cmsghdr *)(ctl) : \ (struct cmsghdr *)NULL) #define CMSG_FIRSTHDR(msg) __CMSG_FIRSTHDR((msg)->msg_control, (msg)->msg_controllen) #define CMSG_OK(mhdr, cmsg) ((cmsg)->cmsg_len >= sizeof(struct cmsghdr) && \ (cmsg)->cmsg_len <= (unsigned long) \ ((mhdr)->msg_controllen - \ ((char *)(cmsg) - (char *)(mhdr)->msg_control))) #define for_each_cmsghdr(cmsg, msg) \ for (cmsg = CMSG_FIRSTHDR(msg); \ cmsg; \ cmsg = CMSG_NXTHDR(msg, cmsg)) /* * Get the next cmsg header * * PLEASE, do not touch this function. If you think, that it is * incorrect, grep kernel sources and think about consequences * before trying to improve it. * * Now it always returns valid, not truncated ancillary object * HEADER. But caller still MUST check, that cmsg->cmsg_len is * inside range, given by msg->msg_controllen before using * ancillary object DATA. --ANK (980731) */ static inline struct cmsghdr * __cmsg_nxthdr(void *__ctl, __kernel_size_t __size, struct cmsghdr *__cmsg) { struct cmsghdr * __ptr; __ptr = (struct cmsghdr*)(((unsigned char *) __cmsg) + CMSG_ALIGN(__cmsg->cmsg_len)); if ((unsigned long)((char*)(__ptr+1) - (char *) __ctl) > __size) return (struct cmsghdr *)0; return __ptr; } static inline struct cmsghdr * cmsg_nxthdr (struct msghdr *__msg, struct cmsghdr *__cmsg) { return __cmsg_nxthdr(__msg->msg_control, __msg->msg_controllen, __cmsg); } static inline size_t msg_data_left(const struct msghdr *msg) { return iov_iter_count(&msg->msg_iter); } /* "Socket"-level control message types: */ #define SCM_RIGHTS 0x01 /* rw: access rights (array of int) */ #define SCM_CREDENTIALS 0x02 /* rw: struct ucred */ #define SCM_SECURITY 0x03 /* rw: security label */ #define SCM_PIDFD 0x04 /* ro: pidfd (int) */ struct ucred { __u32 pid; __u32 uid; __u32 gid; }; /* Supported address families. */ #define AF_UNSPEC 0 #define AF_UNIX 1 /* Unix domain sockets */ #define AF_LOCAL 1 /* POSIX name for AF_UNIX */ #define AF_INET 2 /* Internet IP Protocol */ #define AF_AX25 3 /* Amateur Radio AX.25 */ #define AF_IPX 4 /* Novell IPX */ #define AF_APPLETALK 5 /* AppleTalk DDP */ #define AF_NETROM 6 /* Amateur Radio NET/ROM */ #define AF_BRIDGE 7 /* Multiprotocol bridge */ #define AF_ATMPVC 8 /* ATM PVCs */ #define AF_X25 9 /* Reserved for X.25 project */ #define AF_INET6 10 /* IP version 6 */ #define AF_ROSE 11 /* Amateur Radio X.25 PLP */ #define AF_DECnet 12 /* Reserved for DECnet project */ #define AF_NETBEUI 13 /* Reserved for 802.2LLC project*/ #define AF_SECURITY 14 /* Security callback pseudo AF */ #define AF_KEY 15 /* PF_KEY key management API */ #define AF_NETLINK 16 #define AF_ROUTE AF_NETLINK /* Alias to emulate 4.4BSD */ #define AF_PACKET 17 /* Packet family */ #define AF_ASH 18 /* Ash */ #define AF_ECONET 19 /* Acorn Econet */ #define AF_ATMSVC 20 /* ATM SVCs */ #define AF_RDS 21 /* RDS sockets */ #define AF_SNA 22 /* Linux SNA Project (nutters!) */ #define AF_IRDA 23 /* IRDA sockets */ #define AF_PPPOX 24 /* PPPoX sockets */ #define AF_WANPIPE 25 /* Wanpipe API Sockets */ #define AF_LLC 26 /* Linux LLC */ #define AF_IB 27 /* Native InfiniBand address */ #define AF_MPLS 28 /* MPLS */ #define AF_CAN 29 /* Controller Area Network */ #define AF_TIPC 30 /* TIPC sockets */ #define AF_BLUETOOTH 31 /* Bluetooth sockets */ #define AF_IUCV 32 /* IUCV sockets */ #define AF_RXRPC 33 /* RxRPC sockets */ #define AF_ISDN 34 /* mISDN sockets */ #define AF_PHONET 35 /* Phonet sockets */ #define AF_IEEE802154 36 /* IEEE802154 sockets */ #define AF_CAIF 37 /* CAIF sockets */ #define AF_ALG 38 /* Algorithm sockets */ #define AF_NFC 39 /* NFC sockets */ #define AF_VSOCK 40 /* vSockets */ #define AF_KCM 41 /* Kernel Connection Multiplexor*/ #define AF_QIPCRTR 42 /* Qualcomm IPC Router */ #define AF_SMC 43 /* smc sockets: reserve number for * PF_SMC protocol family that * reuses AF_INET address family */ #define AF_XDP 44 /* XDP sockets */ #define AF_MCTP 45 /* Management component * transport protocol */ #define AF_MAX 46 /* For now.. */ /* Protocol families, same as address families. */ #define PF_UNSPEC AF_UNSPEC #define PF_UNIX AF_UNIX #define PF_LOCAL AF_LOCAL #define PF_INET AF_INET #define PF_AX25 AF_AX25 #define PF_IPX AF_IPX #define PF_APPLETALK AF_APPLETALK #define PF_NETROM AF_NETROM #define PF_BRIDGE AF_BRIDGE #define PF_ATMPVC AF_ATMPVC #define PF_X25 AF_X25 #define PF_INET6 AF_INET6 #define PF_ROSE AF_ROSE #define PF_DECnet AF_DECnet #define PF_NETBEUI AF_NETBEUI #define PF_SECURITY AF_SECURITY #define PF_KEY AF_KEY #define PF_NETLINK AF_NETLINK #define PF_ROUTE AF_ROUTE #define PF_PACKET AF_PACKET #define PF_ASH AF_ASH #define PF_ECONET AF_ECONET #define PF_ATMSVC AF_ATMSVC #define PF_RDS AF_RDS #define PF_SNA AF_SNA #define PF_IRDA AF_IRDA #define PF_PPPOX AF_PPPOX #define PF_WANPIPE AF_WANPIPE #define PF_LLC AF_LLC #define PF_IB AF_IB #define PF_MPLS AF_MPLS #define PF_CAN AF_CAN #define PF_TIPC AF_TIPC #define PF_BLUETOOTH AF_BLUETOOTH #define PF_IUCV AF_IUCV #define PF_RXRPC AF_RXRPC #define PF_ISDN AF_ISDN #define PF_PHONET AF_PHONET #define PF_IEEE802154 AF_IEEE802154 #define PF_CAIF AF_CAIF #define PF_ALG AF_ALG #define PF_NFC AF_NFC #define PF_VSOCK AF_VSOCK #define PF_KCM AF_KCM #define PF_QIPCRTR AF_QIPCRTR #define PF_SMC AF_SMC #define PF_XDP AF_XDP #define PF_MCTP AF_MCTP #define PF_MAX AF_MAX /* Maximum queue length specifiable by listen. */ #define SOMAXCONN 4096 /* Flags we can use with send/ and recv. Added those for 1003.1g not all are supported yet */ #define MSG_OOB 1 #define MSG_PEEK 2 #define MSG_DONTROUTE 4 #define MSG_TRYHARD 4 /* Synonym for MSG_DONTROUTE for DECnet */ #define MSG_CTRUNC 8 #define MSG_PROBE 0x10 /* Do not send. Only probe path f.e. for MTU */ #define MSG_TRUNC 0x20 #define MSG_DONTWAIT 0x40 /* Nonblocking io */ #define MSG_EOR 0x80 /* End of record */ #define MSG_WAITALL 0x100 /* Wait for a full request */ #define MSG_FIN 0x200 #define MSG_SYN 0x400 #define MSG_CONFIRM 0x800 /* Confirm path validity */ #define MSG_RST 0x1000 #define MSG_ERRQUEUE 0x2000 /* Fetch message from error queue */ #define MSG_NOSIGNAL 0x4000 /* Do not generate SIGPIPE */ #define MSG_MORE 0x8000 /* Sender will send more */ #define MSG_WAITFORONE 0x10000 /* recvmmsg(): block until 1+ packets avail */ #define MSG_SENDPAGE_NOPOLICY 0x10000 /* sendpage() internal : do no apply policy */ #define MSG_BATCH 0x40000 /* sendmmsg(): more messages coming */ #define MSG_EOF MSG_FIN #define MSG_NO_SHARED_FRAGS 0x80000 /* sendpage() internal : page frags are not shared */ #define MSG_SENDPAGE_DECRYPTED 0x100000 /* sendpage() internal : page may carry * plain text and require encryption */ #define MSG_SOCK_DEVMEM 0x2000000 /* Receive devmem skbs as cmsg */ #define MSG_ZEROCOPY 0x4000000 /* Use user data in kernel path */ #define MSG_SPLICE_PAGES 0x8000000 /* Splice the pages from the iterator in sendmsg() */ #define MSG_FASTOPEN 0x20000000 /* Send data in TCP SYN */ #define MSG_CMSG_CLOEXEC 0x40000000 /* Set close_on_exec for file descriptor received through SCM_RIGHTS */ #if defined(CONFIG_COMPAT) #define MSG_CMSG_COMPAT 0x80000000 /* This message needs 32 bit fixups */ #else #define MSG_CMSG_COMPAT 0 /* We never have 32 bit fixups */ #endif /* Flags to be cleared on entry by sendmsg and sendmmsg syscalls */ #define MSG_INTERNAL_SENDMSG_FLAGS \ (MSG_SPLICE_PAGES | MSG_SENDPAGE_NOPOLICY | MSG_SENDPAGE_DECRYPTED) /* Setsockoptions(2) level. Thanks to BSD these must match IPPROTO_xxx */ #define SOL_IP 0 /* #define SOL_ICMP 1 No-no-no! Due to Linux :-) we cannot use SOL_ICMP=1 */ #define SOL_TCP 6 #define SOL_UDP 17 #define SOL_IPV6 41 #define SOL_ICMPV6 58 #define SOL_SCTP 132 #define SOL_UDPLITE 136 /* UDP-Lite (RFC 3828) */ #define SOL_RAW 255 #define SOL_IPX 256 #define SOL_AX25 257 #define SOL_ATALK 258 #define SOL_NETROM 259 #define SOL_ROSE 260 #define SOL_DECNET 261 #define SOL_X25 262 #define SOL_PACKET 263 #define SOL_ATM 264 /* ATM layer (cell level) */ #define SOL_AAL 265 /* ATM Adaption Layer (packet level) */ #define SOL_IRDA 266 #define SOL_NETBEUI 267 #define SOL_LLC 268 #define SOL_DCCP 269 #define SOL_NETLINK 270 #define SOL_TIPC 271 #define SOL_RXRPC 272 #define SOL_PPPOL2TP 273 #define SOL_BLUETOOTH 274 #define SOL_PNPIPE 275 #define SOL_RDS 276 #define SOL_IUCV 277 #define SOL_CAIF 278 #define SOL_ALG 279 #define SOL_NFC 280 #define SOL_KCM 281 #define SOL_TLS 282 #define SOL_XDP 283 #define SOL_MPTCP 284 #define SOL_MCTP 285 #define SOL_SMC 286 #define SOL_VSOCK 287 /* IPX options */ #define IPX_TYPE 1 extern int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr_storage *kaddr); extern int put_cmsg(struct msghdr*, int level, int type, int len, void *data); extern int put_cmsg_notrunc(struct msghdr *msg, int level, int type, int len, void *data); struct timespec64; struct __kernel_timespec; struct old_timespec32; struct scm_timestamping_internal { struct timespec64 ts[3]; }; extern void put_cmsg_scm_timestamping64(struct msghdr *msg, struct scm_timestamping_internal *tss); extern void put_cmsg_scm_timestamping(struct msghdr *msg, struct scm_timestamping_internal *tss); /* The __sys_...msg variants allow MSG_CMSG_COMPAT iff * forbid_cmsg_compat==false */ extern long __sys_recvmsg(int fd, struct user_msghdr __user *msg, unsigned int flags, bool forbid_cmsg_compat); extern long __sys_sendmsg(int fd, struct user_msghdr __user *msg, unsigned int flags, bool forbid_cmsg_compat); extern int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, unsigned int flags, struct __kernel_timespec __user *timeout, struct old_timespec32 __user *timeout32); extern int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, unsigned int flags, bool forbid_cmsg_compat); extern long __sys_sendmsg_sock(struct socket *sock, struct msghdr *msg, unsigned int flags); extern long __sys_recvmsg_sock(struct socket *sock, struct msghdr *msg, struct user_msghdr __user *umsg, struct sockaddr __user *uaddr, unsigned int flags); extern int __copy_msghdr(struct msghdr *kmsg, struct user_msghdr *umsg, struct sockaddr __user **save_addr); /* helpers which do the actual work for syscalls */ extern int __sys_recvfrom(int fd, void __user *ubuf, size_t size, unsigned int flags, struct sockaddr __user *addr, int __user *addr_len); extern int __sys_sendto(int fd, void __user *buff, size_t len, unsigned int flags, struct sockaddr __user *addr, int addr_len); extern struct file *do_accept(struct file *file, struct proto_accept_arg *arg, struct sockaddr __user *upeer_sockaddr, int __user *upeer_addrlen, int flags); extern int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr, int __user *upeer_addrlen, int flags); extern int __sys_socket(int family, int type, int protocol); extern struct file *__sys_socket_file(int family, int type, int protocol); extern int __sys_bind(int fd, struct sockaddr __user *umyaddr, int addrlen); extern int __sys_bind_socket(struct socket *sock, struct sockaddr_storage *address, int addrlen); extern int __sys_connect_file(struct file *file, struct sockaddr_storage *addr, int addrlen, int file_flags); extern int __sys_connect(int fd, struct sockaddr __user *uservaddr, int addrlen); extern int __sys_listen(int fd, int backlog); extern int __sys_listen_socket(struct socket *sock, int backlog); extern int do_getsockname(struct socket *sock, int peer, struct sockaddr __user *usockaddr, int __user *usockaddr_len); extern int __sys_getsockname(int fd, struct sockaddr __user *usockaddr, int __user *usockaddr_len, int peer); extern int __sys_socketpair(int family, int type, int protocol, int __user *usockvec); extern int __sys_shutdown_sock(struct socket *sock, int how); extern int __sys_shutdown(int fd, int how); #endif /* _LINUX_SOCKET_H */
1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 // SPDX-License-Identifier: GPL-2.0-only /* Helper handling for netfilter. */ /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org> * (C) 2006-2012 Patrick McHardy <kaber@trash.net> */ #include <linux/types.h> #include <linux/netfilter.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/vmalloc.h> #include <linux/stddef.h> #include <linux/random.h> #include <linux/err.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/rculist.h> #include <linux/rtnetlink.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_extend.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_seqadj.h> #include <net/netfilter/nf_log.h> #include <net/ip.h> static DEFINE_MUTEX(nf_ct_helper_mutex); struct hlist_head *nf_ct_helper_hash __read_mostly; EXPORT_SYMBOL_GPL(nf_ct_helper_hash); unsigned int nf_ct_helper_hsize __read_mostly; EXPORT_SYMBOL_GPL(nf_ct_helper_hsize); static unsigned int nf_ct_helper_count __read_mostly; static DEFINE_MUTEX(nf_ct_nat_helpers_mutex); static struct list_head nf_ct_nat_helpers __read_mostly; /* Stupid hash, but collision free for the default registrations of the * helpers currently in the kernel. */ static unsigned int helper_hash(const struct nf_conntrack_tuple *tuple) { return (((tuple->src.l3num << 8) | tuple->dst.protonum) ^ (__force __u16)tuple->src.u.all) % nf_ct_helper_hsize; } struct nf_conntrack_helper * __nf_conntrack_helper_find(const char *name, u16 l3num, u8 protonum) { struct nf_conntrack_helper *h; unsigned int i; for (i = 0; i < nf_ct_helper_hsize; i++) { hlist_for_each_entry_rcu(h, &nf_ct_helper_hash[i], hnode) { if (strcmp(h->name, name)) continue; if (h->tuple.src.l3num != NFPROTO_UNSPEC && h->tuple.src.l3num != l3num) continue; if (h->tuple.dst.protonum == protonum) return h; } } return NULL; } EXPORT_SYMBOL_GPL(__nf_conntrack_helper_find); struct nf_conntrack_helper * nf_conntrack_helper_try_module_get(const char *name, u16 l3num, u8 protonum) { struct nf_conntrack_helper *h; rcu_read_lock(); h = __nf_conntrack_helper_find(name, l3num, protonum); #ifdef CONFIG_MODULES if (h == NULL) { rcu_read_unlock(); if (request_module("nfct-helper-%s", name) == 0) { rcu_read_lock(); h = __nf_conntrack_helper_find(name, l3num, protonum); } else { return h; } } #endif if (h != NULL && !try_module_get(h->me)) h = NULL; if (h != NULL && !refcount_inc_not_zero(&h->refcnt)) { module_put(h->me); h = NULL; } rcu_read_unlock(); return h; } EXPORT_SYMBOL_GPL(nf_conntrack_helper_try_module_get); void nf_conntrack_helper_put(struct nf_conntrack_helper *helper) { refcount_dec(&helper->refcnt); module_put(helper->me); } EXPORT_SYMBOL_GPL(nf_conntrack_helper_put); static struct nf_conntrack_nat_helper * nf_conntrack_nat_helper_find(const char *mod_name) { struct nf_conntrack_nat_helper *cur; bool found = false; list_for_each_entry_rcu(cur, &nf_ct_nat_helpers, list) { if (!strcmp(cur->mod_name, mod_name)) { found = true; break; } } return found ? cur : NULL; } int nf_nat_helper_try_module_get(const char *name, u16 l3num, u8 protonum) { struct nf_conntrack_helper *h; struct nf_conntrack_nat_helper *nat; char mod_name[NF_CT_HELPER_NAME_LEN]; int ret = 0; rcu_read_lock(); h = __nf_conntrack_helper_find(name, l3num, protonum); if (!h) { rcu_read_unlock(); return -ENOENT; } nat = nf_conntrack_nat_helper_find(h->nat_mod_name); if (!nat) { snprintf(mod_name, sizeof(mod_name), "%s", h->nat_mod_name); rcu_read_unlock(); request_module("%s", mod_name); rcu_read_lock(); nat = nf_conntrack_nat_helper_find(mod_name); if (!nat) { rcu_read_unlock(); return -ENOENT; } } if (!try_module_get(nat->module)) ret = -ENOENT; rcu_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(nf_nat_helper_try_module_get); void nf_nat_helper_put(struct nf_conntrack_helper *helper) { struct nf_conntrack_nat_helper *nat; nat = nf_conntrack_nat_helper_find(helper->nat_mod_name); if (WARN_ON_ONCE(!nat)) return; module_put(nat->module); } EXPORT_SYMBOL_GPL(nf_nat_helper_put); struct nf_conn_help * nf_ct_helper_ext_add(struct nf_conn *ct, gfp_t gfp) { struct nf_conn_help *help; help = nf_ct_ext_add(ct, NF_CT_EXT_HELPER, gfp); if (help) INIT_HLIST_HEAD(&help->expectations); else pr_debug("failed to add helper extension area"); return help; } EXPORT_SYMBOL_GPL(nf_ct_helper_ext_add); int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl, gfp_t flags) { struct nf_conntrack_helper *helper = NULL; struct nf_conn_help *help; /* We already got a helper explicitly attached (e.g. nft_ct) */ if (test_bit(IPS_HELPER_BIT, &ct->status)) return 0; if (WARN_ON_ONCE(!tmpl)) return 0; help = nfct_help(tmpl); if (help != NULL) { helper = rcu_dereference(help->helper); set_bit(IPS_HELPER_BIT, &ct->status); } help = nfct_help(ct); if (helper == NULL) { if (help) RCU_INIT_POINTER(help->helper, NULL); return 0; } if (help == NULL) { help = nf_ct_helper_ext_add(ct, flags); if (help == NULL) return -ENOMEM; } else { /* We only allow helper re-assignment of the same sort since * we cannot reallocate the helper extension area. */ struct nf_conntrack_helper *tmp = rcu_dereference(help->helper); if (tmp && tmp->help != helper->help) { RCU_INIT_POINTER(help->helper, NULL); return 0; } } rcu_assign_pointer(help->helper, helper); return 0; } EXPORT_SYMBOL_GPL(__nf_ct_try_assign_helper); /* appropriate ct lock protecting must be taken by caller */ static int unhelp(struct nf_conn *ct, void *me) { struct nf_conn_help *help = nfct_help(ct); if (help && rcu_dereference_raw(help->helper) == me) { nf_conntrack_event(IPCT_HELPER, ct); RCU_INIT_POINTER(help->helper, NULL); } /* We are not intended to delete this conntrack. */ return 0; } void nf_ct_helper_destroy(struct nf_conn *ct) { struct nf_conn_help *help = nfct_help(ct); struct nf_conntrack_helper *helper; if (help) { rcu_read_lock(); helper = rcu_dereference(help->helper); if (helper && helper->destroy) helper->destroy(ct); rcu_read_unlock(); } } static LIST_HEAD(nf_ct_helper_expectfn_list); void nf_ct_helper_expectfn_register(struct nf_ct_helper_expectfn *n) { spin_lock_bh(&nf_conntrack_expect_lock); list_add_rcu(&n->head, &nf_ct_helper_expectfn_list); spin_unlock_bh(&nf_conntrack_expect_lock); } EXPORT_SYMBOL_GPL(nf_ct_helper_expectfn_register); void nf_ct_helper_expectfn_unregister(struct nf_ct_helper_expectfn *n) { spin_lock_bh(&nf_conntrack_expect_lock); list_del_rcu(&n->head); spin_unlock_bh(&nf_conntrack_expect_lock); } EXPORT_SYMBOL_GPL(nf_ct_helper_expectfn_unregister); /* Caller should hold the rcu lock */ struct nf_ct_helper_expectfn * nf_ct_helper_expectfn_find_by_name(const char *name) { struct nf_ct_helper_expectfn *cur; bool found = false; list_for_each_entry_rcu(cur, &nf_ct_helper_expectfn_list, head) { if (!strcmp(cur->name, name)) { found = true; break; } } return found ? cur : NULL; } EXPORT_SYMBOL_GPL(nf_ct_helper_expectfn_find_by_name); /* Caller should hold the rcu lock */ struct nf_ct_helper_expectfn * nf_ct_helper_expectfn_find_by_symbol(const void *symbol) { struct nf_ct_helper_expectfn *cur; bool found = false; list_for_each_entry_rcu(cur, &nf_ct_helper_expectfn_list, head) { if (cur->expectfn == symbol) { found = true; break; } } return found ? cur : NULL; } EXPORT_SYMBOL_GPL(nf_ct_helper_expectfn_find_by_symbol); __printf(3, 4) void nf_ct_helper_log(struct sk_buff *skb, const struct nf_conn *ct, const char *fmt, ...) { const struct nf_conn_help *help; const struct nf_conntrack_helper *helper; struct va_format vaf; va_list args; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; /* Called from the helper function, this call never fails */ help = nfct_help(ct); /* rcu_read_lock()ed by nf_hook_thresh */ helper = rcu_dereference(help->helper); nf_log_packet(nf_ct_net(ct), nf_ct_l3num(ct), 0, skb, NULL, NULL, NULL, "nf_ct_%s: dropping packet: %pV ", helper->name, &vaf); va_end(args); } EXPORT_SYMBOL_GPL(nf_ct_helper_log); int nf_conntrack_helper_register(struct nf_conntrack_helper *me) { struct nf_conntrack_tuple_mask mask = { .src.u.all = htons(0xFFFF) }; unsigned int h = helper_hash(&me->tuple); struct nf_conntrack_helper *cur; int ret = 0, i; BUG_ON(me->expect_policy == NULL); BUG_ON(me->expect_class_max >= NF_CT_MAX_EXPECT_CLASSES); BUG_ON(strlen(me->name) > NF_CT_HELPER_NAME_LEN - 1); if (!nf_ct_helper_hash) return -ENOENT; if (me->expect_policy->max_expected > NF_CT_EXPECT_MAX_CNT) return -EINVAL; mutex_lock(&nf_ct_helper_mutex); for (i = 0; i < nf_ct_helper_hsize; i++) { hlist_for_each_entry(cur, &nf_ct_helper_hash[i], hnode) { if (!strcmp(cur->name, me->name) && (cur->tuple.src.l3num == NFPROTO_UNSPEC || cur->tuple.src.l3num == me->tuple.src.l3num) && cur->tuple.dst.protonum == me->tuple.dst.protonum) { ret = -EBUSY; goto out; } } } /* avoid unpredictable behaviour for auto_assign_helper */ if (!(me->flags & NF_CT_HELPER_F_USERSPACE)) { hlist_for_each_entry(cur, &nf_ct_helper_hash[h], hnode) { if (nf_ct_tuple_src_mask_cmp(&cur->tuple, &me->tuple, &mask)) { ret = -EBUSY; goto out; } } } refcount_set(&me->refcnt, 1); hlist_add_head_rcu(&me->hnode, &nf_ct_helper_hash[h]); nf_ct_helper_count++; out: mutex_unlock(&nf_ct_helper_mutex); return ret; } EXPORT_SYMBOL_GPL(nf_conntrack_helper_register); static bool expect_iter_me(struct nf_conntrack_expect *exp, void *data) { struct nf_conn_help *help = nfct_help(exp->master); const struct nf_conntrack_helper *me = data; const struct nf_conntrack_helper *this; if (exp->helper == me) return true; this = rcu_dereference_protected(help->helper, lockdep_is_held(&nf_conntrack_expect_lock)); return this == me; } void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me) { mutex_lock(&nf_ct_helper_mutex); hlist_del_rcu(&me->hnode); nf_ct_helper_count--; mutex_unlock(&nf_ct_helper_mutex); /* Make sure every nothing is still using the helper unless its a * connection in the hash. */ synchronize_rcu(); nf_ct_expect_iterate_destroy(expect_iter_me, NULL); nf_ct_iterate_destroy(unhelp, me); } EXPORT_SYMBOL_GPL(nf_conntrack_helper_unregister); void nf_ct_helper_init(struct nf_conntrack_helper *helper, u16 l3num, u16 protonum, const char *name, u16 default_port, u16 spec_port, u32 id, const struct nf_conntrack_expect_policy *exp_pol, u32 expect_class_max, int (*help)(struct sk_buff *skb, unsigned int protoff, struct nf_conn *ct, enum ip_conntrack_info ctinfo), int (*from_nlattr)(struct nlattr *attr, struct nf_conn *ct), struct module *module) { helper->tuple.src.l3num = l3num; helper->tuple.dst.protonum = protonum; helper->tuple.src.u.all = htons(spec_port); helper->expect_policy = exp_pol; helper->expect_class_max = expect_class_max; helper->help = help; helper->from_nlattr = from_nlattr; helper->me = module; snprintf(helper->nat_mod_name, sizeof(helper->nat_mod_name), NF_NAT_HELPER_PREFIX "%s", name); if (spec_port == default_port) snprintf(helper->name, sizeof(helper->name), "%s", name); else snprintf(helper->name, sizeof(helper->name), "%s-%u", name, id); } EXPORT_SYMBOL_GPL(nf_ct_helper_init); int nf_conntrack_helpers_register(struct nf_conntrack_helper *helper, unsigned int n) { unsigned int i; int err = 0; for (i = 0; i < n; i++) { err = nf_conntrack_helper_register(&helper[i]); if (err < 0) goto err; } return err; err: if (i > 0) nf_conntrack_helpers_unregister(helper, i); return err; } EXPORT_SYMBOL_GPL(nf_conntrack_helpers_register); void nf_conntrack_helpers_unregister(struct nf_conntrack_helper *helper, unsigned int n) { while (n-- > 0) nf_conntrack_helper_unregister(&helper[n]); } EXPORT_SYMBOL_GPL(nf_conntrack_helpers_unregister); void nf_nat_helper_register(struct nf_conntrack_nat_helper *nat) { mutex_lock(&nf_ct_nat_helpers_mutex); list_add_rcu(&nat->list, &nf_ct_nat_helpers); mutex_unlock(&nf_ct_nat_helpers_mutex); } EXPORT_SYMBOL_GPL(nf_nat_helper_register); void nf_nat_helper_unregister(struct nf_conntrack_nat_helper *nat) { mutex_lock(&nf_ct_nat_helpers_mutex); list_del_rcu(&nat->list); mutex_unlock(&nf_ct_nat_helpers_mutex); } EXPORT_SYMBOL_GPL(nf_nat_helper_unregister); int nf_conntrack_helper_init(void) { nf_ct_helper_hsize = 1; /* gets rounded up to use one page */ nf_ct_helper_hash = nf_ct_alloc_hashtable(&nf_ct_helper_hsize, 0); if (!nf_ct_helper_hash) return -ENOMEM; INIT_LIST_HEAD(&nf_ct_nat_helpers); return 0; } void nf_conntrack_helper_fini(void) { kvfree(nf_ct_helper_hash); nf_ct_helper_hash = NULL; }
482 18 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) ST-Ericsson AB 2010 * Author: Sjur Brendeland */ #define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__ #include <linux/kernel.h> #include <linux/types.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/pkt_sched.h> #include <net/caif/caif_layer.h> #include <net/caif/cfsrvl.h> #include <net/caif/cfpkt.h> #include <net/caif/caif_dev.h> #define SRVL_CTRL_PKT_SIZE 1 #define SRVL_FLOW_OFF 0x81 #define SRVL_FLOW_ON 0x80 #define SRVL_SET_PIN 0x82 #define container_obj(layr) container_of(layr, struct cfsrvl, layer) static void cfservl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl, int phyid) { struct cfsrvl *service = container_obj(layr); if (layr->up == NULL || layr->up->ctrlcmd == NULL) return; switch (ctrl) { case CAIF_CTRLCMD_INIT_RSP: service->open = true; layr->up->ctrlcmd(layr->up, ctrl, phyid); break; case CAIF_CTRLCMD_DEINIT_RSP: case CAIF_CTRLCMD_INIT_FAIL_RSP: service->open = false; layr->up->ctrlcmd(layr->up, ctrl, phyid); break; case _CAIF_CTRLCMD_PHYIF_FLOW_OFF_IND: if (phyid != service->dev_info.id) break; if (service->modem_flow_on) layr->up->ctrlcmd(layr->up, CAIF_CTRLCMD_FLOW_OFF_IND, phyid); service->phy_flow_on = false; break; case _CAIF_CTRLCMD_PHYIF_FLOW_ON_IND: if (phyid != service->dev_info.id) return; if (service->modem_flow_on) { layr->up->ctrlcmd(layr->up, CAIF_CTRLCMD_FLOW_ON_IND, phyid); } service->phy_flow_on = true; break; case CAIF_CTRLCMD_FLOW_OFF_IND: if (service->phy_flow_on) { layr->up->ctrlcmd(layr->up, CAIF_CTRLCMD_FLOW_OFF_IND, phyid); } service->modem_flow_on = false; break; case CAIF_CTRLCMD_FLOW_ON_IND: if (service->phy_flow_on) { layr->up->ctrlcmd(layr->up, CAIF_CTRLCMD_FLOW_ON_IND, phyid); } service->modem_flow_on = true; break; case _CAIF_CTRLCMD_PHYIF_DOWN_IND: /* In case interface is down, let's fake a remove shutdown */ layr->up->ctrlcmd(layr->up, CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND, phyid); break; case CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND: layr->up->ctrlcmd(layr->up, ctrl, phyid); break; default: pr_warn("Unexpected ctrl in cfsrvl (%d)\n", ctrl); /* We have both modem and phy flow on, send flow on */ layr->up->ctrlcmd(layr->up, ctrl, phyid); service->phy_flow_on = true; break; } } static int cfservl_modemcmd(struct cflayer *layr, enum caif_modemcmd ctrl) { struct cfsrvl *service = container_obj(layr); caif_assert(layr != NULL); caif_assert(layr->dn != NULL); caif_assert(layr->dn->transmit != NULL); if (!service->supports_flowctrl) return 0; switch (ctrl) { case CAIF_MODEMCMD_FLOW_ON_REQ: { struct cfpkt *pkt; struct caif_payload_info *info; u8 flow_on = SRVL_FLOW_ON; pkt = cfpkt_create(SRVL_CTRL_PKT_SIZE); if (!pkt) return -ENOMEM; if (cfpkt_add_head(pkt, &flow_on, 1) < 0) { pr_err("Packet is erroneous!\n"); cfpkt_destroy(pkt); return -EPROTO; } info = cfpkt_info(pkt); info->channel_id = service->layer.id; info->hdr_len = 1; info->dev_info = &service->dev_info; cfpkt_set_prio(pkt, TC_PRIO_CONTROL); return layr->dn->transmit(layr->dn, pkt); } case CAIF_MODEMCMD_FLOW_OFF_REQ: { struct cfpkt *pkt; struct caif_payload_info *info; u8 flow_off = SRVL_FLOW_OFF; pkt = cfpkt_create(SRVL_CTRL_PKT_SIZE); if (!pkt) return -ENOMEM; if (cfpkt_add_head(pkt, &flow_off, 1) < 0) { pr_err("Packet is erroneous!\n"); cfpkt_destroy(pkt); return -EPROTO; } info = cfpkt_info(pkt); info->channel_id = service->layer.id; info->hdr_len = 1; info->dev_info = &service->dev_info; cfpkt_set_prio(pkt, TC_PRIO_CONTROL); return layr->dn->transmit(layr->dn, pkt); } default: break; } return -EINVAL; } static void cfsrvl_release(struct cflayer *layer) { struct cfsrvl *service = container_of(layer, struct cfsrvl, layer); kfree(service); } void cfsrvl_init(struct cfsrvl *service, u8 channel_id, struct dev_info *dev_info, bool supports_flowctrl) { caif_assert(offsetof(struct cfsrvl, layer) == 0); service->open = false; service->modem_flow_on = true; service->phy_flow_on = true; service->layer.id = channel_id; service->layer.ctrlcmd = cfservl_ctrlcmd; service->layer.modemcmd = cfservl_modemcmd; service->dev_info = *dev_info; service->supports_flowctrl = supports_flowctrl; service->release = cfsrvl_release; } bool cfsrvl_ready(struct cfsrvl *service, int *err) { if (!service->open) { *err = -ENOTCONN; return false; } return true; } bool cfsrvl_phyid_match(struct cflayer *layer, int phyid) { struct cfsrvl *servl = container_obj(layer); return servl->dev_info.id == phyid; } void caif_free_client(struct cflayer *adap_layer) { struct cfsrvl *servl; if (adap_layer == NULL || adap_layer->dn == NULL) return; servl = container_obj(adap_layer->dn); servl->release(&servl->layer); } EXPORT_SYMBOL(caif_free_client); void caif_client_register_refcnt(struct cflayer *adapt_layer, void (*hold)(struct cflayer *lyr), void (*put)(struct cflayer *lyr)) { struct cfsrvl *service; if (WARN_ON(adapt_layer == NULL || adapt_layer->dn == NULL)) return; service = container_of(adapt_layer->dn, struct cfsrvl, layer); service->hold = hold; service->put = put; } EXPORT_SYMBOL(caif_client_register_refcnt);
4 2 2 9 9 9 9 9 5 9 9 9 9 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 8 9 1 8 8 2 6 6 6 6 7 9 4 9 5 4 5 4 4 6 6 3 2 2 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2005,2006,2007,2008 IBM Corporation * * Authors: * Kylene Hall <kjhall@us.ibm.com> * Reiner Sailer <sailer@us.ibm.com> * Mimi Zohar <zohar@us.ibm.com> * * File: ima_fs.c * implemenents security file system for reporting * current measurement list and IMA statistics */ #include <linux/fcntl.h> #include <linux/kernel_read_file.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/seq_file.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/parser.h> #include <linux/vmalloc.h> #include "ima.h" static DEFINE_MUTEX(ima_write_mutex); bool ima_canonical_fmt; static int __init default_canonical_fmt_setup(char *str) { #ifdef __BIG_ENDIAN ima_canonical_fmt = true; #endif return 1; } __setup("ima_canonical_fmt", default_canonical_fmt_setup); static int valid_policy = 1; static ssize_t ima_show_htable_value(char __user *buf, size_t count, loff_t *ppos, atomic_long_t *val) { char tmpbuf[32]; /* greater than largest 'long' string value */ ssize_t len; len = scnprintf(tmpbuf, sizeof(tmpbuf), "%li\n", atomic_long_read(val)); return simple_read_from_buffer(buf, count, ppos, tmpbuf, len); } static ssize_t ima_show_htable_violations(struct file *filp, char __user *buf, size_t count, loff_t *ppos) { return ima_show_htable_value(buf, count, ppos, &ima_htable.violations); } static const struct file_operations ima_htable_violations_ops = { .read = ima_show_htable_violations, .llseek = generic_file_llseek, }; static ssize_t ima_show_measurements_count(struct file *filp, char __user *buf, size_t count, loff_t *ppos) { return ima_show_htable_value(buf, count, ppos, &ima_htable.len); } static const struct file_operations ima_measurements_count_ops = { .read = ima_show_measurements_count, .llseek = generic_file_llseek, }; /* returns pointer to hlist_node */ static void *ima_measurements_start(struct seq_file *m, loff_t *pos) { loff_t l = *pos; struct ima_queue_entry *qe; /* we need a lock since pos could point beyond last element */ rcu_read_lock(); list_for_each_entry_rcu(qe, &ima_measurements, later) { if (!l--) { rcu_read_unlock(); return qe; } } rcu_read_unlock(); return NULL; } static void *ima_measurements_next(struct seq_file *m, void *v, loff_t *pos) { struct ima_queue_entry *qe = v; /* lock protects when reading beyond last element * against concurrent list-extension */ rcu_read_lock(); qe = list_entry_rcu(qe->later.next, struct ima_queue_entry, later); rcu_read_unlock(); (*pos)++; return (&qe->later == &ima_measurements) ? NULL : qe; } static void ima_measurements_stop(struct seq_file *m, void *v) { } void ima_putc(struct seq_file *m, void *data, int datalen) { while (datalen--) seq_putc(m, *(char *)data++); } /* print format: * 32bit-le=pcr# * char[n]=template digest * 32bit-le=template name size * char[n]=template name * [eventdata length] * eventdata[n]=template specific data */ int ima_measurements_show(struct seq_file *m, void *v) { /* the list never shrinks, so we don't need a lock here */ struct ima_queue_entry *qe = v; struct ima_template_entry *e; char *template_name; u32 pcr, namelen, template_data_len; /* temporary fields */ bool is_ima_template = false; enum hash_algo algo; int i, algo_idx; algo_idx = ima_sha1_idx; algo = HASH_ALGO_SHA1; if (m->file != NULL) { algo_idx = (unsigned long)file_inode(m->file)->i_private; algo = ima_algo_array[algo_idx].algo; } /* get entry */ e = qe->entry; if (e == NULL) return -1; template_name = (e->template_desc->name[0] != '\0') ? e->template_desc->name : e->template_desc->fmt; /* * 1st: PCRIndex * PCR used defaults to the same (config option) in * little-endian format, unless set in policy */ pcr = !ima_canonical_fmt ? e->pcr : (__force u32)cpu_to_le32(e->pcr); ima_putc(m, &pcr, sizeof(e->pcr)); /* 2nd: template digest */ ima_putc(m, e->digests[algo_idx].digest, hash_digest_size[algo]); /* 3rd: template name size */ namelen = !ima_canonical_fmt ? strlen(template_name) : (__force u32)cpu_to_le32(strlen(template_name)); ima_putc(m, &namelen, sizeof(namelen)); /* 4th: template name */ ima_putc(m, template_name, strlen(template_name)); /* 5th: template length (except for 'ima' template) */ if (strcmp(template_name, IMA_TEMPLATE_IMA_NAME) == 0) is_ima_template = true; if (!is_ima_template) { template_data_len = !ima_canonical_fmt ? e->template_data_len : (__force u32)cpu_to_le32(e->template_data_len); ima_putc(m, &template_data_len, sizeof(e->template_data_len)); } /* 6th: template specific data */ for (i = 0; i < e->template_desc->num_fields; i++) { enum ima_show_type show = IMA_SHOW_BINARY; const struct ima_template_field *field = e->template_desc->fields[i]; if (is_ima_template && strcmp(field->field_id, "d") == 0) show = IMA_SHOW_BINARY_NO_FIELD_LEN; if (is_ima_template && strcmp(field->field_id, "n") == 0) show = IMA_SHOW_BINARY_OLD_STRING_FMT; field->field_show(m, show, &e->template_data[i]); } return 0; } static const struct seq_operations ima_measurments_seqops = { .start = ima_measurements_start, .next = ima_measurements_next, .stop = ima_measurements_stop, .show = ima_measurements_show }; static int ima_measurements_open(struct inode *inode, struct file *file) { return seq_open(file, &ima_measurments_seqops); } static const struct file_operations ima_measurements_ops = { .open = ima_measurements_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release, }; void ima_print_digest(struct seq_file *m, u8 *digest, u32 size) { u32 i; for (i = 0; i < size; i++) seq_printf(m, "%02x", *(digest + i)); } /* print in ascii */ static int ima_ascii_measurements_show(struct seq_file *m, void *v) { /* the list never shrinks, so we don't need a lock here */ struct ima_queue_entry *qe = v; struct ima_template_entry *e; char *template_name; enum hash_algo algo; int i, algo_idx; algo_idx = ima_sha1_idx; algo = HASH_ALGO_SHA1; if (m->file != NULL) { algo_idx = (unsigned long)file_inode(m->file)->i_private; algo = ima_algo_array[algo_idx].algo; } /* get entry */ e = qe->entry; if (e == NULL) return -1; template_name = (e->template_desc->name[0] != '\0') ? e->template_desc->name : e->template_desc->fmt; /* 1st: PCR used (config option) */ seq_printf(m, "%2d ", e->pcr); /* 2nd: template hash */ ima_print_digest(m, e->digests[algo_idx].digest, hash_digest_size[algo]); /* 3th: template name */ seq_printf(m, " %s", template_name); /* 4th: template specific data */ for (i = 0; i < e->template_desc->num_fields; i++) { seq_puts(m, " "); if (e->template_data[i].len == 0) continue; e->template_desc->fields[i]->field_show(m, IMA_SHOW_ASCII, &e->template_data[i]); } seq_puts(m, "\n"); return 0; } static const struct seq_operations ima_ascii_measurements_seqops = { .start = ima_measurements_start, .next = ima_measurements_next, .stop = ima_measurements_stop, .show = ima_ascii_measurements_show }; static int ima_ascii_measurements_open(struct inode *inode, struct file *file) { return seq_open(file, &ima_ascii_measurements_seqops); } static const struct file_operations ima_ascii_measurements_ops = { .open = ima_ascii_measurements_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release, }; static ssize_t ima_read_policy(char *path) { void *data = NULL; char *datap; size_t size; int rc, pathlen = strlen(path); char *p; /* remove \n */ datap = path; strsep(&datap, "\n"); rc = kernel_read_file_from_path(path, 0, &data, INT_MAX, NULL, READING_POLICY); if (rc < 0) { pr_err("Unable to open file: %s (%d)", path, rc); return rc; } size = rc; rc = 0; datap = data; while (size > 0 && (p = strsep(&datap, "\n"))) { pr_debug("rule: %s\n", p); rc = ima_parse_add_rule(p); if (rc < 0) break; size -= rc; } vfree(data); if (rc < 0) return rc; else if (size) return -EINVAL; else return pathlen; } static ssize_t ima_write_policy(struct file *file, const char __user *buf, size_t datalen, loff_t *ppos) { char *data; ssize_t result; if (datalen >= PAGE_SIZE) datalen = PAGE_SIZE - 1; /* No partial writes. */ result = -EINVAL; if (*ppos != 0) goto out; data = memdup_user_nul(buf, datalen); if (IS_ERR(data)) { result = PTR_ERR(data); goto out; } result = mutex_lock_interruptible(&ima_write_mutex); if (result < 0) goto out_free; if (data[0] == '/') { result = ima_read_policy(data); } else if (ima_appraise & IMA_APPRAISE_POLICY) { pr_err("signed policy file (specified as an absolute pathname) required\n"); integrity_audit_msg(AUDIT_INTEGRITY_STATUS, NULL, NULL, "policy_update", "signed policy required", 1, 0); result = -EACCES; } else { result = ima_parse_add_rule(data); } mutex_unlock(&ima_write_mutex); out_free: kfree(data); out: if (result < 0) valid_policy = 0; return result; } static struct dentry *ima_dir; static struct dentry *ima_symlink; enum ima_fs_flags { IMA_FS_BUSY, }; static unsigned long ima_fs_flags; #ifdef CONFIG_IMA_READ_POLICY static const struct seq_operations ima_policy_seqops = { .start = ima_policy_start, .next = ima_policy_next, .stop = ima_policy_stop, .show = ima_policy_show, }; #endif static int __init create_securityfs_measurement_lists(void) { int count = NR_BANKS(ima_tpm_chip); if (ima_sha1_idx >= NR_BANKS(ima_tpm_chip)) count++; for (int i = 0; i < count; i++) { u16 algo = ima_algo_array[i].algo; char file_name[NAME_MAX + 1]; struct dentry *dentry; sprintf(file_name, "ascii_runtime_measurements_%s", hash_algo_name[algo]); dentry = securityfs_create_file(file_name, S_IRUSR | S_IRGRP, ima_dir, (void *)(uintptr_t)i, &ima_ascii_measurements_ops); if (IS_ERR(dentry)) return PTR_ERR(dentry); sprintf(file_name, "binary_runtime_measurements_%s", hash_algo_name[algo]); dentry = securityfs_create_file(file_name, S_IRUSR | S_IRGRP, ima_dir, (void *)(uintptr_t)i, &ima_measurements_ops); if (IS_ERR(dentry)) return PTR_ERR(dentry); } return 0; } /* * ima_open_policy: sequentialize access to the policy file */ static int ima_open_policy(struct inode *inode, struct file *filp) { if (!(filp->f_flags & O_WRONLY)) { #ifndef CONFIG_IMA_READ_POLICY return -EACCES; #else if ((filp->f_flags & O_ACCMODE) != O_RDONLY) return -EACCES; if (!capable(CAP_SYS_ADMIN)) return -EPERM; return seq_open(filp, &ima_policy_seqops); #endif } if (test_and_set_bit(IMA_FS_BUSY, &ima_fs_flags)) return -EBUSY; return 0; } /* * ima_release_policy - start using the new measure policy rules. * * Initially, ima_measure points to the default policy rules, now * point to the new policy rules, and remove the securityfs policy file, * assuming a valid policy. */ static int ima_release_policy(struct inode *inode, struct file *file) { const char *cause = valid_policy ? "completed" : "failed"; if ((file->f_flags & O_ACCMODE) == O_RDONLY) return seq_release(inode, file); if (valid_policy && ima_check_policy() < 0) { cause = "failed"; valid_policy = 0; } pr_info("policy update %s\n", cause); integrity_audit_msg(AUDIT_INTEGRITY_STATUS, NULL, NULL, "policy_update", cause, !valid_policy, 0); if (!valid_policy) { ima_delete_rules(); valid_policy = 1; clear_bit(IMA_FS_BUSY, &ima_fs_flags); return 0; } ima_update_policy(); #if !defined(CONFIG_IMA_WRITE_POLICY) && !defined(CONFIG_IMA_READ_POLICY) securityfs_remove(file->f_path.dentry); #elif defined(CONFIG_IMA_WRITE_POLICY) clear_bit(IMA_FS_BUSY, &ima_fs_flags); #elif defined(CONFIG_IMA_READ_POLICY) inode->i_mode &= ~S_IWUSR; #endif return 0; } static const struct file_operations ima_measure_policy_ops = { .open = ima_open_policy, .write = ima_write_policy, .read = seq_read, .release = ima_release_policy, .llseek = generic_file_llseek, }; int __init ima_fs_init(void) { struct dentry *dentry; int ret; ret = integrity_fs_init(); if (ret < 0) return ret; ima_dir = securityfs_create_dir("ima", integrity_dir); if (IS_ERR(ima_dir)) { ret = PTR_ERR(ima_dir); goto out; } ima_symlink = securityfs_create_symlink("ima", NULL, "integrity/ima", NULL); if (IS_ERR(ima_symlink)) { ret = PTR_ERR(ima_symlink); goto out; } ret = create_securityfs_measurement_lists(); if (ret != 0) goto out; dentry = securityfs_create_symlink("binary_runtime_measurements", ima_dir, "binary_runtime_measurements_sha1", NULL); if (IS_ERR(dentry)) { ret = PTR_ERR(dentry); goto out; } dentry = securityfs_create_symlink("ascii_runtime_measurements", ima_dir, "ascii_runtime_measurements_sha1", NULL); if (IS_ERR(dentry)) { ret = PTR_ERR(dentry); goto out; } dentry = securityfs_create_file("runtime_measurements_count", S_IRUSR | S_IRGRP, ima_dir, NULL, &ima_measurements_count_ops); if (IS_ERR(dentry)) { ret = PTR_ERR(dentry); goto out; } dentry = securityfs_create_file("violations", S_IRUSR | S_IRGRP, ima_dir, NULL, &ima_htable_violations_ops); if (IS_ERR(dentry)) { ret = PTR_ERR(dentry); goto out; } dentry = securityfs_create_file("policy", POLICY_FILE_FLAGS, ima_dir, NULL, &ima_measure_policy_ops); if (IS_ERR(dentry)) { ret = PTR_ERR(dentry); goto out; } return 0; out: securityfs_remove(ima_symlink); securityfs_remove(ima_dir); integrity_fs_fini(); return ret; }
15 15 15 4 4 4 3 3 3 3 3 1 4 4 4 4 3 3 3 1 2 2 2 2 2 2 2 2 2 2 2 279 279 279 279 279 242 60 240 242 240 242 241 242 201 242 239 242 139 140 83 84 83 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 // SPDX-License-Identifier: GPL-2.0-or-later /* * Initialization routines * Copyright (c) by Jaroslav Kysela <perex@perex.cz> */ #include <linux/init.h> #include <linux/sched.h> #include <linux/module.h> #include <linux/device.h> #include <linux/file.h> #include <linux/slab.h> #include <linux/time.h> #include <linux/ctype.h> #include <linux/pm.h> #include <linux/debugfs.h> #include <linux/completion.h> #include <linux/interrupt.h> #include <sound/core.h> #include <sound/control.h> #include <sound/info.h> /* monitor files for graceful shutdown (hotplug) */ struct snd_monitor_file { struct file *file; const struct file_operations *disconnected_f_op; struct list_head shutdown_list; /* still need to shutdown */ struct list_head list; /* link of monitor files */ }; static DEFINE_SPINLOCK(shutdown_lock); static LIST_HEAD(shutdown_files); static const struct file_operations snd_shutdown_f_ops; /* locked for registering/using */ static DECLARE_BITMAP(snd_cards_lock, SNDRV_CARDS); static struct snd_card *snd_cards[SNDRV_CARDS]; static DEFINE_MUTEX(snd_card_mutex); static char *slots[SNDRV_CARDS]; module_param_array(slots, charp, NULL, 0444); MODULE_PARM_DESC(slots, "Module names assigned to the slots."); /* return non-zero if the given index is reserved for the given * module via slots option */ static int module_slot_match(struct module *module, int idx) { int match = 1; #ifdef CONFIG_MODULES const char *s1, *s2; if (!module || !*module->name || !slots[idx]) return 0; s1 = module->name; s2 = slots[idx]; if (*s2 == '!') { match = 0; /* negative match */ s2++; } /* compare module name strings * hyphens are handled as equivalent with underscore */ for (;;) { char c1 = *s1++; char c2 = *s2++; if (c1 == '-') c1 = '_'; if (c2 == '-') c2 = '_'; if (c1 != c2) return !match; if (!c1) break; } #endif /* CONFIG_MODULES */ return match; } #if IS_ENABLED(CONFIG_SND_MIXER_OSS) int (*snd_mixer_oss_notify_callback)(struct snd_card *card, int free_flag); EXPORT_SYMBOL(snd_mixer_oss_notify_callback); #endif static int check_empty_slot(struct module *module, int slot) { return !slots[slot] || !*slots[slot]; } /* return an empty slot number (>= 0) found in the given bitmask @mask. * @mask == -1 == 0xffffffff means: take any free slot up to 32 * when no slot is available, return the original @mask as is. */ static int get_slot_from_bitmask(int mask, int (*check)(struct module *, int), struct module *module) { int slot; for (slot = 0; slot < SNDRV_CARDS; slot++) { if (slot < 32 && !(mask & (1U << slot))) continue; if (!test_bit(slot, snd_cards_lock)) { if (check(module, slot)) return slot; /* found */ } } return mask; /* unchanged */ } /* the default release callback set in snd_device_alloc() */ static void default_release_alloc(struct device *dev) { kfree(dev); } /** * snd_device_alloc - Allocate and initialize struct device for sound devices * @dev_p: pointer to store the allocated device * @card: card to assign, optional * * For releasing the allocated device, call put_device(). */ int snd_device_alloc(struct device **dev_p, struct snd_card *card) { struct device *dev; *dev_p = NULL; dev = kzalloc(sizeof(*dev), GFP_KERNEL); if (!dev) return -ENOMEM; device_initialize(dev); if (card) dev->parent = &card->card_dev; dev->class = &sound_class; dev->release = default_release_alloc; *dev_p = dev; return 0; } EXPORT_SYMBOL_GPL(snd_device_alloc); static int snd_card_init(struct snd_card *card, struct device *parent, int idx, const char *xid, struct module *module, size_t extra_size); static int snd_card_do_free(struct snd_card *card); static const struct attribute_group card_dev_attr_group; static void release_card_device(struct device *dev) { snd_card_do_free(dev_to_snd_card(dev)); } /** * snd_card_new - create and initialize a soundcard structure * @parent: the parent device object * @idx: card index (address) [0 ... (SNDRV_CARDS-1)] * @xid: card identification (ASCII string) * @module: top level module for locking * @extra_size: allocate this extra size after the main soundcard structure * @card_ret: the pointer to store the created card instance * * The function allocates snd_card instance via kzalloc with the given * space for the driver to use freely. The allocated struct is stored * in the given card_ret pointer. * * Return: Zero if successful or a negative error code. */ int snd_card_new(struct device *parent, int idx, const char *xid, struct module *module, int extra_size, struct snd_card **card_ret) { struct snd_card *card; int err; if (snd_BUG_ON(!card_ret)) return -EINVAL; *card_ret = NULL; if (extra_size < 0) extra_size = 0; card = kzalloc(sizeof(*card) + extra_size, GFP_KERNEL); if (!card) return -ENOMEM; err = snd_card_init(card, parent, idx, xid, module, extra_size); if (err < 0) return err; /* card is freed by error handler */ *card_ret = card; return 0; } EXPORT_SYMBOL(snd_card_new); static void __snd_card_release(struct device *dev, void *data) { snd_card_free(data); } /** * snd_devm_card_new - managed snd_card object creation * @parent: the parent device object * @idx: card index (address) [0 ... (SNDRV_CARDS-1)] * @xid: card identification (ASCII string) * @module: top level module for locking * @extra_size: allocate this extra size after the main soundcard structure * @card_ret: the pointer to store the created card instance * * This function works like snd_card_new() but manages the allocated resource * via devres, i.e. you don't need to free explicitly. * * When a snd_card object is created with this function and registered via * snd_card_register(), the very first devres action to call snd_card_free() * is added automatically. In that way, the resource disconnection is assured * at first, then released in the expected order. * * If an error happens at the probe before snd_card_register() is called and * there have been other devres resources, you'd need to free the card manually * via snd_card_free() call in the error; otherwise it may lead to UAF due to * devres call orders. You can use snd_card_free_on_error() helper for * handling it more easily. * * Return: zero if successful, or a negative error code */ int snd_devm_card_new(struct device *parent, int idx, const char *xid, struct module *module, size_t extra_size, struct snd_card **card_ret) { struct snd_card *card; int err; *card_ret = NULL; card = devres_alloc(__snd_card_release, sizeof(*card) + extra_size, GFP_KERNEL); if (!card) return -ENOMEM; card->managed = true; err = snd_card_init(card, parent, idx, xid, module, extra_size); if (err < 0) { devres_free(card); /* in managed mode, we need to free manually */ return err; } devres_add(parent, card); *card_ret = card; return 0; } EXPORT_SYMBOL_GPL(snd_devm_card_new); /** * snd_card_free_on_error - a small helper for handling devm probe errors * @dev: the managed device object * @ret: the return code from the probe callback * * This function handles the explicit snd_card_free() call at the error from * the probe callback. It's just a small helper for simplifying the error * handling with the managed devices. * * Return: zero if successful, or a negative error code */ int snd_card_free_on_error(struct device *dev, int ret) { struct snd_card *card; if (!ret) return 0; card = devres_find(dev, __snd_card_release, NULL, NULL); if (card) snd_card_free(card); return ret; } EXPORT_SYMBOL_GPL(snd_card_free_on_error); static int snd_card_init(struct snd_card *card, struct device *parent, int idx, const char *xid, struct module *module, size_t extra_size) { int err; if (extra_size > 0) card->private_data = (char *)card + sizeof(struct snd_card); if (xid) strscpy(card->id, xid, sizeof(card->id)); err = 0; scoped_guard(mutex, &snd_card_mutex) { if (idx < 0) /* first check the matching module-name slot */ idx = get_slot_from_bitmask(idx, module_slot_match, module); if (idx < 0) /* if not matched, assign an empty slot */ idx = get_slot_from_bitmask(idx, check_empty_slot, module); if (idx < 0) err = -ENODEV; else if (idx < snd_ecards_limit) { if (test_bit(idx, snd_cards_lock)) err = -EBUSY; /* invalid */ } else if (idx >= SNDRV_CARDS) err = -ENODEV; if (!err) { set_bit(idx, snd_cards_lock); /* lock it */ if (idx >= snd_ecards_limit) snd_ecards_limit = idx + 1; /* increase the limit */ } } if (err < 0) { dev_err(parent, "cannot find the slot for index %d (range 0-%i), error: %d\n", idx, snd_ecards_limit - 1, err); if (!card->managed) kfree(card); /* manually free here, as no destructor called */ return err; } card->dev = parent; card->number = idx; WARN_ON(IS_MODULE(CONFIG_SND) && !module); card->module = module; INIT_LIST_HEAD(&card->devices); init_rwsem(&card->controls_rwsem); rwlock_init(&card->controls_rwlock); INIT_LIST_HEAD(&card->controls); INIT_LIST_HEAD(&card->ctl_files); #ifdef CONFIG_SND_CTL_FAST_LOOKUP xa_init(&card->ctl_numids); xa_init(&card->ctl_hash); #endif spin_lock_init(&card->files_lock); INIT_LIST_HEAD(&card->files_list); mutex_init(&card->memory_mutex); #ifdef CONFIG_PM init_waitqueue_head(&card->power_sleep); init_waitqueue_head(&card->power_ref_sleep); atomic_set(&card->power_ref, 0); #endif init_waitqueue_head(&card->remove_sleep); card->sync_irq = -1; device_initialize(&card->card_dev); card->card_dev.parent = parent; card->card_dev.class = &sound_class; card->card_dev.release = release_card_device; card->card_dev.groups = card->dev_groups; card->dev_groups[0] = &card_dev_attr_group; err = kobject_set_name(&card->card_dev.kobj, "card%d", idx); if (err < 0) goto __error; snprintf(card->irq_descr, sizeof(card->irq_descr), "%s:%s", dev_driver_string(card->dev), dev_name(&card->card_dev)); /* the control interface cannot be accessed from the user space until */ /* snd_cards_bitmask and snd_cards are set with snd_card_register */ err = snd_ctl_create(card); if (err < 0) { dev_err(parent, "unable to register control minors\n"); goto __error; } err = snd_info_card_create(card); if (err < 0) { dev_err(parent, "unable to create card info\n"); goto __error_ctl; } #ifdef CONFIG_SND_DEBUG card->debugfs_root = debugfs_create_dir(dev_name(&card->card_dev), sound_debugfs_root); #endif return 0; __error_ctl: snd_device_free_all(card); __error: put_device(&card->card_dev); return err; } /** * snd_card_ref - Get the card object from the index * @idx: the card index * * Returns a card object corresponding to the given index or NULL if not found. * Release the object via snd_card_unref(). * * Return: a card object or NULL */ struct snd_card *snd_card_ref(int idx) { struct snd_card *card; guard(mutex)(&snd_card_mutex); card = snd_cards[idx]; if (card) get_device(&card->card_dev); return card; } EXPORT_SYMBOL_GPL(snd_card_ref); /* return non-zero if a card is already locked */ int snd_card_locked(int card) { guard(mutex)(&snd_card_mutex); return test_bit(card, snd_cards_lock); } static loff_t snd_disconnect_llseek(struct file *file, loff_t offset, int orig) { return -ENODEV; } static ssize_t snd_disconnect_read(struct file *file, char __user *buf, size_t count, loff_t *offset) { return -ENODEV; } static ssize_t snd_disconnect_write(struct file *file, const char __user *buf, size_t count, loff_t *offset) { return -ENODEV; } static int snd_disconnect_release(struct inode *inode, struct file *file) { struct snd_monitor_file *df = NULL, *_df; scoped_guard(spinlock, &shutdown_lock) { list_for_each_entry(_df, &shutdown_files, shutdown_list) { if (_df->file == file) { df = _df; list_del_init(&df->shutdown_list); break; } } } if (likely(df)) { if ((file->f_flags & FASYNC) && df->disconnected_f_op->fasync) df->disconnected_f_op->fasync(-1, file, 0); return df->disconnected_f_op->release(inode, file); } panic("%s(%p, %p) failed!", __func__, inode, file); } static __poll_t snd_disconnect_poll(struct file * file, poll_table * wait) { return EPOLLERR | EPOLLNVAL; } static long snd_disconnect_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { return -ENODEV; } static int snd_disconnect_mmap(struct file *file, struct vm_area_struct *vma) { return -ENODEV; } static int snd_disconnect_fasync(int fd, struct file *file, int on) { return -ENODEV; } static const struct file_operations snd_shutdown_f_ops = { .owner = THIS_MODULE, .llseek = snd_disconnect_llseek, .read = snd_disconnect_read, .write = snd_disconnect_write, .release = snd_disconnect_release, .poll = snd_disconnect_poll, .unlocked_ioctl = snd_disconnect_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = snd_disconnect_ioctl, #endif .mmap = snd_disconnect_mmap, .fasync = snd_disconnect_fasync }; /** * snd_card_disconnect - disconnect all APIs from the file-operations (user space) * @card: soundcard structure * * Disconnects all APIs from the file-operations (user space). * * Return: Zero, otherwise a negative error code. * * Note: The current implementation replaces all active file->f_op with special * dummy file operations (they do nothing except release). */ void snd_card_disconnect(struct snd_card *card) { struct snd_monitor_file *mfile; if (!card) return; scoped_guard(spinlock, &card->files_lock) { if (card->shutdown) return; card->shutdown = 1; /* replace file->f_op with special dummy operations */ list_for_each_entry(mfile, &card->files_list, list) { /* it's critical part, use endless loop */ /* we have no room to fail */ mfile->disconnected_f_op = mfile->file->f_op; scoped_guard(spinlock, &shutdown_lock) list_add(&mfile->shutdown_list, &shutdown_files); mfile->file->f_op = &snd_shutdown_f_ops; fops_get(mfile->file->f_op); } } #ifdef CONFIG_PM /* wake up sleepers here before other callbacks for avoiding potential * deadlocks with other locks (e.g. in kctls); * then this notifies the shutdown and sleepers would abort immediately */ wake_up_all(&card->power_sleep); #endif /* notify all connected devices about disconnection */ /* at this point, they cannot respond to any calls except release() */ #if IS_ENABLED(CONFIG_SND_MIXER_OSS) if (snd_mixer_oss_notify_callback) snd_mixer_oss_notify_callback(card, SND_MIXER_OSS_NOTIFY_DISCONNECT); #endif /* notify all devices that we are disconnected */ snd_device_disconnect_all(card); if (card->sync_irq > 0) synchronize_irq(card->sync_irq); snd_info_card_disconnect(card); #ifdef CONFIG_SND_DEBUG debugfs_remove(card->debugfs_root); card->debugfs_root = NULL; #endif if (card->registered) { device_del(&card->card_dev); card->registered = false; } /* disable fops (user space) operations for ALSA API */ scoped_guard(mutex, &snd_card_mutex) { snd_cards[card->number] = NULL; clear_bit(card->number, snd_cards_lock); } snd_power_sync_ref(card); } EXPORT_SYMBOL(snd_card_disconnect); /** * snd_card_disconnect_sync - disconnect card and wait until files get closed * @card: card object to disconnect * * This calls snd_card_disconnect() for disconnecting all belonging components * and waits until all pending files get closed. * It assures that all accesses from user-space finished so that the driver * can release its resources gracefully. */ void snd_card_disconnect_sync(struct snd_card *card) { snd_card_disconnect(card); guard(spinlock_irq)(&card->files_lock); wait_event_lock_irq(card->remove_sleep, list_empty(&card->files_list), card->files_lock); } EXPORT_SYMBOL_GPL(snd_card_disconnect_sync); static int snd_card_do_free(struct snd_card *card) { card->releasing = true; #if IS_ENABLED(CONFIG_SND_MIXER_OSS) if (snd_mixer_oss_notify_callback) snd_mixer_oss_notify_callback(card, SND_MIXER_OSS_NOTIFY_FREE); #endif snd_device_free_all(card); if (card->private_free) card->private_free(card); if (snd_info_card_free(card) < 0) { dev_warn(card->dev, "unable to free card info\n"); /* Not fatal error */ } if (card->release_completion) complete(card->release_completion); if (!card->managed) kfree(card); return 0; } /** * snd_card_free_when_closed - Disconnect the card, free it later eventually * @card: soundcard structure * * Unlike snd_card_free(), this function doesn't try to release the card * resource immediately, but tries to disconnect at first. When the card * is still in use, the function returns before freeing the resources. * The card resources will be freed when the refcount gets to zero. * * Return: zero if successful, or a negative error code */ void snd_card_free_when_closed(struct snd_card *card) { if (!card) return; snd_card_disconnect(card); put_device(&card->card_dev); return; } EXPORT_SYMBOL(snd_card_free_when_closed); /** * snd_card_free - frees given soundcard structure * @card: soundcard structure * * This function releases the soundcard structure and the all assigned * devices automatically. That is, you don't have to release the devices * by yourself. * * This function waits until the all resources are properly released. * * Return: Zero. Frees all associated devices and frees the control * interface associated to given soundcard. */ void snd_card_free(struct snd_card *card) { DECLARE_COMPLETION_ONSTACK(released); /* The call of snd_card_free() is allowed from various code paths; * a manual call from the driver and the call via devres_free, and * we need to avoid double-free. Moreover, the release via devres * may call snd_card_free() twice due to its nature, we need to have * the check here at the beginning. */ if (card->releasing) return; card->release_completion = &released; snd_card_free_when_closed(card); /* wait, until all devices are ready for the free operation */ wait_for_completion(&released); } EXPORT_SYMBOL(snd_card_free); /* check, if the character is in the valid ASCII range */ static inline bool safe_ascii_char(char c) { return isascii(c) && isalnum(c); } /* retrieve the last word of shortname or longname */ static const char *retrieve_id_from_card_name(const char *name) { const char *spos = name; while (*name) { if (isspace(*name) && safe_ascii_char(name[1])) spos = name + 1; name++; } return spos; } /* return true if the given id string doesn't conflict any other card ids */ static bool card_id_ok(struct snd_card *card, const char *id) { int i; if (!snd_info_check_reserved_words(id)) return false; for (i = 0; i < snd_ecards_limit; i++) { if (snd_cards[i] && snd_cards[i] != card && !strcmp(snd_cards[i]->id, id)) return false; } return true; } /* copy to card->id only with valid letters from nid */ static void copy_valid_id_string(struct snd_card *card, const char *src, const char *nid) { char *id = card->id; while (*nid && !safe_ascii_char(*nid)) nid++; if (isdigit(*nid)) *id++ = isalpha(*src) ? *src : 'D'; while (*nid && (size_t)(id - card->id) < sizeof(card->id) - 1) { if (safe_ascii_char(*nid)) *id++ = *nid; nid++; } *id = 0; } /* Set card->id from the given string * If the string conflicts with other ids, add a suffix to make it unique. */ static void snd_card_set_id_no_lock(struct snd_card *card, const char *src, const char *nid) { int len, loops; bool is_default = false; char *id; copy_valid_id_string(card, src, nid); id = card->id; again: /* use "Default" for obviously invalid strings * ("card" conflicts with proc directories) */ if (!*id || !strncmp(id, "card", 4)) { strscpy(card->id, "Default"); is_default = true; } len = strlen(id); for (loops = 0; loops < SNDRV_CARDS; loops++) { char sfxstr[5]; /* "_012" */ int sfxlen, slen; if (card_id_ok(card, id)) return; /* OK */ /* Add _XYZ suffix */ sfxlen = scnprintf(sfxstr, sizeof(sfxstr), "_%X", loops + 1); if (len + sfxlen >= sizeof(card->id)) slen = sizeof(card->id) - sfxlen - 1; else slen = len; strscpy(id + slen, sfxstr, sizeof(card->id) - slen); } /* fallback to the default id */ if (!is_default) { *id = 0; goto again; } /* last resort... */ dev_err(card->dev, "unable to set card id (%s)\n", id); if (card->proc_root->name) strscpy(card->id, card->proc_root->name, sizeof(card->id)); } /** * snd_card_set_id - set card identification name * @card: soundcard structure * @nid: new identification string * * This function sets the card identification and checks for name * collisions. */ void snd_card_set_id(struct snd_card *card, const char *nid) { /* check if user specified own card->id */ if (card->id[0] != '\0') return; guard(mutex)(&snd_card_mutex); snd_card_set_id_no_lock(card, nid, nid); } EXPORT_SYMBOL(snd_card_set_id); static ssize_t id_show(struct device *dev, struct device_attribute *attr, char *buf) { struct snd_card *card = container_of(dev, struct snd_card, card_dev); return sysfs_emit(buf, "%s\n", card->id); } static ssize_t id_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct snd_card *card = container_of(dev, struct snd_card, card_dev); char buf1[sizeof(card->id)]; size_t copy = count > sizeof(card->id) - 1 ? sizeof(card->id) - 1 : count; size_t idx; int c; for (idx = 0; idx < copy; idx++) { c = buf[idx]; if (!safe_ascii_char(c) && c != '_' && c != '-') return -EINVAL; } memcpy(buf1, buf, copy); buf1[copy] = '\0'; guard(mutex)(&snd_card_mutex); if (!card_id_ok(NULL, buf1)) return -EEXIST; strscpy(card->id, buf1); snd_info_card_id_change(card); return count; } static DEVICE_ATTR_RW(id); static ssize_t number_show(struct device *dev, struct device_attribute *attr, char *buf) { struct snd_card *card = container_of(dev, struct snd_card, card_dev); return sysfs_emit(buf, "%i\n", card->number); } static DEVICE_ATTR_RO(number); static struct attribute *card_dev_attrs[] = { &dev_attr_id.attr, &dev_attr_number.attr, NULL }; static const struct attribute_group card_dev_attr_group = { .attrs = card_dev_attrs, }; /** * snd_card_add_dev_attr - Append a new sysfs attribute group to card * @card: card instance * @group: attribute group to append * * Return: zero if successful, or a negative error code */ int snd_card_add_dev_attr(struct snd_card *card, const struct attribute_group *group) { int i; /* loop for (arraysize-1) here to keep NULL at the last entry */ for (i = 0; i < ARRAY_SIZE(card->dev_groups) - 1; i++) { if (!card->dev_groups[i]) { card->dev_groups[i] = group; return 0; } } dev_err(card->dev, "Too many groups assigned\n"); return -ENOSPC; } EXPORT_SYMBOL_GPL(snd_card_add_dev_attr); static void trigger_card_free(void *data) { snd_card_free(data); } /** * snd_card_register - register the soundcard * @card: soundcard structure * * This function registers all the devices assigned to the soundcard. * Until calling this, the ALSA control interface is blocked from the * external accesses. Thus, you should call this function at the end * of the initialization of the card. * * Return: Zero otherwise a negative error code if the registration failed. */ int snd_card_register(struct snd_card *card) { int err; if (snd_BUG_ON(!card)) return -EINVAL; if (!card->registered) { err = device_add(&card->card_dev); if (err < 0) return err; card->registered = true; } else { if (card->managed) devm_remove_action(card->dev, trigger_card_free, card); } if (card->managed) { err = devm_add_action(card->dev, trigger_card_free, card); if (err < 0) return err; } err = snd_device_register_all(card); if (err < 0) return err; scoped_guard(mutex, &snd_card_mutex) { if (snd_cards[card->number]) { /* already registered */ return snd_info_card_register(card); /* register pending info */ } if (*card->id) { /* make a unique id name from the given string */ char tmpid[sizeof(card->id)]; memcpy(tmpid, card->id, sizeof(card->id)); snd_card_set_id_no_lock(card, tmpid, tmpid); } else { /* create an id from either shortname or longname */ const char *src; src = *card->shortname ? card->shortname : card->longname; snd_card_set_id_no_lock(card, src, retrieve_id_from_card_name(src)); } snd_cards[card->number] = card; } err = snd_info_card_register(card); if (err < 0) return err; #if IS_ENABLED(CONFIG_SND_MIXER_OSS) if (snd_mixer_oss_notify_callback) snd_mixer_oss_notify_callback(card, SND_MIXER_OSS_NOTIFY_REGISTER); #endif return 0; } EXPORT_SYMBOL(snd_card_register); #ifdef CONFIG_SND_PROC_FS static void snd_card_info_read(struct snd_info_entry *entry, struct snd_info_buffer *buffer) { int idx, count; struct snd_card *card; for (idx = count = 0; idx < SNDRV_CARDS; idx++) { guard(mutex)(&snd_card_mutex); card = snd_cards[idx]; if (card) { count++; snd_iprintf(buffer, "%2i [%-15s]: %s - %s\n", idx, card->id, card->driver, card->shortname); snd_iprintf(buffer, " %s\n", card->longname); } } if (!count) snd_iprintf(buffer, "--- no soundcards ---\n"); } #ifdef CONFIG_SND_OSSEMUL void snd_card_info_read_oss(struct snd_info_buffer *buffer) { int idx, count; struct snd_card *card; for (idx = count = 0; idx < SNDRV_CARDS; idx++) { guard(mutex)(&snd_card_mutex); card = snd_cards[idx]; if (card) { count++; snd_iprintf(buffer, "%s\n", card->longname); } } if (!count) { snd_iprintf(buffer, "--- no soundcards ---\n"); } } #endif #ifdef CONFIG_MODULES static void snd_card_module_info_read(struct snd_info_entry *entry, struct snd_info_buffer *buffer) { int idx; struct snd_card *card; for (idx = 0; idx < SNDRV_CARDS; idx++) { guard(mutex)(&snd_card_mutex); card = snd_cards[idx]; if (card) snd_iprintf(buffer, "%2i %s\n", idx, card->module->name); } } #endif int __init snd_card_info_init(void) { struct snd_info_entry *entry; entry = snd_info_create_module_entry(THIS_MODULE, "cards", NULL); if (! entry) return -ENOMEM; entry->c.text.read = snd_card_info_read; if (snd_info_register(entry) < 0) return -ENOMEM; /* freed in error path */ #ifdef CONFIG_MODULES entry = snd_info_create_module_entry(THIS_MODULE, "modules", NULL); if (!entry) return -ENOMEM; entry->c.text.read = snd_card_module_info_read; if (snd_info_register(entry) < 0) return -ENOMEM; /* freed in error path */ #endif return 0; } #endif /* CONFIG_SND_PROC_FS */ /** * snd_component_add - add a component string * @card: soundcard structure * @component: the component id string * * This function adds the component id string to the supported list. * The component can be referred from the alsa-lib. * * Return: Zero otherwise a negative error code. */ int snd_component_add(struct snd_card *card, const char *component) { char *ptr; int len = strlen(component); ptr = strstr(card->components, component); if (ptr != NULL) { if (ptr[len] == '\0' || ptr[len] == ' ') /* already there */ return 1; } if (strlen(card->components) + 1 + len + 1 > sizeof(card->components)) { snd_BUG(); return -ENOMEM; } if (card->components[0] != '\0') strcat(card->components, " "); strcat(card->components, component); return 0; } EXPORT_SYMBOL(snd_component_add); /** * snd_card_file_add - add the file to the file list of the card * @card: soundcard structure * @file: file pointer * * This function adds the file to the file linked-list of the card. * This linked-list is used to keep tracking the connection state, * and to avoid the release of busy resources by hotplug. * * Return: zero or a negative error code. */ int snd_card_file_add(struct snd_card *card, struct file *file) { struct snd_monitor_file *mfile; mfile = kmalloc(sizeof(*mfile), GFP_KERNEL); if (mfile == NULL) return -ENOMEM; mfile->file = file; mfile->disconnected_f_op = NULL; INIT_LIST_HEAD(&mfile->shutdown_list); guard(spinlock)(&card->files_lock); if (card->shutdown) { kfree(mfile); return -ENODEV; } list_add(&mfile->list, &card->files_list); get_device(&card->card_dev); return 0; } EXPORT_SYMBOL(snd_card_file_add); /** * snd_card_file_remove - remove the file from the file list * @card: soundcard structure * @file: file pointer * * This function removes the file formerly added to the card via * snd_card_file_add() function. * If all files are removed and snd_card_free_when_closed() was * called beforehand, it processes the pending release of * resources. * * Return: Zero or a negative error code. */ int snd_card_file_remove(struct snd_card *card, struct file *file) { struct snd_monitor_file *mfile, *found = NULL; scoped_guard(spinlock, &card->files_lock) { list_for_each_entry(mfile, &card->files_list, list) { if (mfile->file == file) { list_del(&mfile->list); scoped_guard(spinlock, &shutdown_lock) list_del(&mfile->shutdown_list); if (mfile->disconnected_f_op) fops_put(mfile->disconnected_f_op); found = mfile; break; } } if (list_empty(&card->files_list)) wake_up_all(&card->remove_sleep); } if (!found) { dev_err(card->dev, "card file remove problem (%p)\n", file); return -ENOENT; } kfree(found); put_device(&card->card_dev); return 0; } EXPORT_SYMBOL(snd_card_file_remove); #ifdef CONFIG_PM /** * snd_power_ref_and_wait - wait until the card gets powered up * @card: soundcard structure * * Take the power_ref reference count of the given card, and * wait until the card gets powered up to SNDRV_CTL_POWER_D0 state. * The refcount is down again while sleeping until power-up, hence this * function can be used for syncing the floating control ops accesses, * typically around calling control ops. * * The caller needs to pull down the refcount via snd_power_unref() later * no matter whether the error is returned from this function or not. * * Return: Zero if successful, or a negative error code. */ int snd_power_ref_and_wait(struct snd_card *card) { snd_power_ref(card); if (snd_power_get_state(card) == SNDRV_CTL_POWER_D0) return 0; wait_event_cmd(card->power_sleep, card->shutdown || snd_power_get_state(card) == SNDRV_CTL_POWER_D0, snd_power_unref(card), snd_power_ref(card)); return card->shutdown ? -ENODEV : 0; } EXPORT_SYMBOL_GPL(snd_power_ref_and_wait); /** * snd_power_wait - wait until the card gets powered up (old form) * @card: soundcard structure * * Wait until the card gets powered up to SNDRV_CTL_POWER_D0 state. * * Return: Zero if successful, or a negative error code. */ int snd_power_wait(struct snd_card *card) { int ret; ret = snd_power_ref_and_wait(card); snd_power_unref(card); return ret; } EXPORT_SYMBOL(snd_power_wait); #endif /* CONFIG_PM */
105 41 34 31 31 5101 31 4204 30 4 92 348 32 31 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 /* SPDX-License-Identifier: GPL-2.0 */ /* * Definitions for diskquota-operations. When diskquota is configured these * macros expand to the right source-code. * * Author: Marco van Wieringen <mvw@planets.elm.net> */ #ifndef _LINUX_QUOTAOPS_ #define _LINUX_QUOTAOPS_ #include <linux/fs.h> #define DQUOT_SPACE_WARN 0x1 #define DQUOT_SPACE_RESERVE 0x2 #define DQUOT_SPACE_NOFAIL 0x4 static inline struct quota_info *sb_dqopt(struct super_block *sb) { return &sb->s_dquot; } /* i_rwsem must being held */ static inline bool is_quota_modification(struct mnt_idmap *idmap, struct inode *inode, struct iattr *ia) { return ((ia->ia_valid & ATTR_SIZE) || i_uid_needs_update(idmap, ia, inode) || i_gid_needs_update(idmap, ia, inode)); } #if defined(CONFIG_QUOTA) #define quota_error(sb, fmt, args...) \ __quota_error((sb), __func__, fmt , ## args) extern __printf(3, 4) void __quota_error(struct super_block *sb, const char *func, const char *fmt, ...); /* * declaration of quota_function calls in kernel. */ int dquot_initialize(struct inode *inode); bool dquot_initialize_needed(struct inode *inode); void dquot_drop(struct inode *inode); struct dquot *dqget(struct super_block *sb, struct kqid qid); static inline struct dquot *dqgrab(struct dquot *dquot) { /* Make sure someone else has active reference to dquot */ WARN_ON_ONCE(!atomic_read(&dquot->dq_count)); WARN_ON_ONCE(!test_bit(DQ_ACTIVE_B, &dquot->dq_flags)); atomic_inc(&dquot->dq_count); return dquot; } static inline bool dquot_is_busy(struct dquot *dquot) { if (test_bit(DQ_MOD_B, &dquot->dq_flags)) return true; if (atomic_read(&dquot->dq_count) > 0) return true; return false; } void dqput(struct dquot *dquot); int dquot_scan_active(struct super_block *sb, int (*fn)(struct dquot *dquot, unsigned long priv), unsigned long priv); struct dquot *dquot_alloc(struct super_block *sb, int type); void dquot_destroy(struct dquot *dquot); int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags); void __dquot_free_space(struct inode *inode, qsize_t number, int flags); int dquot_alloc_inode(struct inode *inode); void dquot_claim_space_nodirty(struct inode *inode, qsize_t number); void dquot_free_inode(struct inode *inode); void dquot_reclaim_space_nodirty(struct inode *inode, qsize_t number); int dquot_disable(struct super_block *sb, int type, unsigned int flags); /* Suspend quotas on remount RO */ static inline int dquot_suspend(struct super_block *sb, int type) { return dquot_disable(sb, type, DQUOT_SUSPENDED); } int dquot_resume(struct super_block *sb, int type); int dquot_commit(struct dquot *dquot); int dquot_acquire(struct dquot *dquot); int dquot_release(struct dquot *dquot); int dquot_commit_info(struct super_block *sb, int type); int dquot_get_next_id(struct super_block *sb, struct kqid *qid); int dquot_mark_dquot_dirty(struct dquot *dquot); int dquot_file_open(struct inode *inode, struct file *file); int dquot_load_quota_sb(struct super_block *sb, int type, int format_id, unsigned int flags); int dquot_load_quota_inode(struct inode *inode, int type, int format_id, unsigned int flags); int dquot_quota_on(struct super_block *sb, int type, int format_id, const struct path *path); int dquot_quota_on_mount(struct super_block *sb, char *qf_name, int format_id, int type); int dquot_quota_off(struct super_block *sb, int type); int dquot_writeback_dquots(struct super_block *sb, int type); int dquot_quota_sync(struct super_block *sb, int type); int dquot_get_state(struct super_block *sb, struct qc_state *state); int dquot_set_dqinfo(struct super_block *sb, int type, struct qc_info *ii); int dquot_get_dqblk(struct super_block *sb, struct kqid id, struct qc_dqblk *di); int dquot_get_next_dqblk(struct super_block *sb, struct kqid *id, struct qc_dqblk *di); int dquot_set_dqblk(struct super_block *sb, struct kqid id, struct qc_dqblk *di); int __dquot_transfer(struct inode *inode, struct dquot **transfer_to); int dquot_transfer(struct mnt_idmap *idmap, struct inode *inode, struct iattr *iattr); static inline struct mem_dqinfo *sb_dqinfo(struct super_block *sb, int type) { return sb_dqopt(sb)->info + type; } /* * Functions for checking status of quota */ static inline bool sb_has_quota_usage_enabled(struct super_block *sb, int type) { return sb_dqopt(sb)->flags & dquot_state_flag(DQUOT_USAGE_ENABLED, type); } static inline bool sb_has_quota_limits_enabled(struct super_block *sb, int type) { return sb_dqopt(sb)->flags & dquot_state_flag(DQUOT_LIMITS_ENABLED, type); } static inline bool sb_has_quota_suspended(struct super_block *sb, int type) { return sb_dqopt(sb)->flags & dquot_state_flag(DQUOT_SUSPENDED, type); } static inline unsigned sb_any_quota_suspended(struct super_block *sb) { return dquot_state_types(sb_dqopt(sb)->flags, DQUOT_SUSPENDED); } /* Does kernel know about any quota information for given sb + type? */ static inline bool sb_has_quota_loaded(struct super_block *sb, int type) { /* Currently if anything is on, then quota usage is on as well */ return sb_has_quota_usage_enabled(sb, type); } static inline unsigned sb_any_quota_loaded(struct super_block *sb) { return dquot_state_types(sb_dqopt(sb)->flags, DQUOT_USAGE_ENABLED); } static inline bool sb_has_quota_active(struct super_block *sb, int type) { return sb_has_quota_loaded(sb, type) && !sb_has_quota_suspended(sb, type); } /* * Operations supported for diskquotas. */ extern const struct dquot_operations dquot_operations; extern const struct quotactl_ops dquot_quotactl_sysfile_ops; #else static inline int sb_has_quota_usage_enabled(struct super_block *sb, int type) { return 0; } static inline int sb_has_quota_limits_enabled(struct super_block *sb, int type) { return 0; } static inline int sb_has_quota_suspended(struct super_block *sb, int type) { return 0; } static inline int sb_any_quota_suspended(struct super_block *sb) { return 0; } /* Does kernel know about any quota information for given sb + type? */ static inline int sb_has_quota_loaded(struct super_block *sb, int type) { return 0; } static inline int sb_any_quota_loaded(struct super_block *sb) { return 0; } static inline int sb_has_quota_active(struct super_block *sb, int type) { return 0; } static inline int dquot_initialize(struct inode *inode) { return 0; } static inline bool dquot_initialize_needed(struct inode *inode) { return false; } static inline void dquot_drop(struct inode *inode) { } static inline int dquot_alloc_inode(struct inode *inode) { return 0; } static inline void dquot_free_inode(struct inode *inode) { } static inline int dquot_transfer(struct mnt_idmap *idmap, struct inode *inode, struct iattr *iattr) { return 0; } static inline int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags) { if (!(flags & DQUOT_SPACE_RESERVE)) inode_add_bytes(inode, number); return 0; } static inline void __dquot_free_space(struct inode *inode, qsize_t number, int flags) { if (!(flags & DQUOT_SPACE_RESERVE)) inode_sub_bytes(inode, number); } static inline void dquot_claim_space_nodirty(struct inode *inode, qsize_t number) { inode_add_bytes(inode, number); } static inline int dquot_reclaim_space_nodirty(struct inode *inode, qsize_t number) { inode_sub_bytes(inode, number); return 0; } static inline int dquot_disable(struct super_block *sb, int type, unsigned int flags) { return 0; } static inline int dquot_suspend(struct super_block *sb, int type) { return 0; } static inline int dquot_resume(struct super_block *sb, int type) { return 0; } #define dquot_file_open generic_file_open static inline int dquot_writeback_dquots(struct super_block *sb, int type) { return 0; } #endif /* CONFIG_QUOTA */ static inline int dquot_alloc_space_nodirty(struct inode *inode, qsize_t nr) { return __dquot_alloc_space(inode, nr, DQUOT_SPACE_WARN); } static inline void dquot_alloc_space_nofail(struct inode *inode, qsize_t nr) { __dquot_alloc_space(inode, nr, DQUOT_SPACE_WARN|DQUOT_SPACE_NOFAIL); mark_inode_dirty_sync(inode); } static inline int dquot_alloc_space(struct inode *inode, qsize_t nr) { int ret; ret = dquot_alloc_space_nodirty(inode, nr); if (!ret) { /* * Mark inode fully dirty. Since we are allocating blocks, inode * would become fully dirty soon anyway and it reportedly * reduces lock contention. */ mark_inode_dirty(inode); } return ret; } static inline int dquot_alloc_block_nodirty(struct inode *inode, qsize_t nr) { return dquot_alloc_space_nodirty(inode, nr << inode->i_blkbits); } static inline void dquot_alloc_block_nofail(struct inode *inode, qsize_t nr) { dquot_alloc_space_nofail(inode, nr << inode->i_blkbits); } static inline int dquot_alloc_block(struct inode *inode, qsize_t nr) { return dquot_alloc_space(inode, nr << inode->i_blkbits); } static inline int dquot_prealloc_block_nodirty(struct inode *inode, qsize_t nr) { return __dquot_alloc_space(inode, nr << inode->i_blkbits, 0); } static inline int dquot_prealloc_block(struct inode *inode, qsize_t nr) { int ret; ret = dquot_prealloc_block_nodirty(inode, nr); if (!ret) mark_inode_dirty_sync(inode); return ret; } static inline int dquot_reserve_block(struct inode *inode, qsize_t nr) { return __dquot_alloc_space(inode, nr << inode->i_blkbits, DQUOT_SPACE_WARN|DQUOT_SPACE_RESERVE); } static inline void dquot_claim_block(struct inode *inode, qsize_t nr) { dquot_claim_space_nodirty(inode, nr << inode->i_blkbits); mark_inode_dirty_sync(inode); } static inline void dquot_reclaim_block(struct inode *inode, qsize_t nr) { dquot_reclaim_space_nodirty(inode, nr << inode->i_blkbits); mark_inode_dirty_sync(inode); } static inline void dquot_free_space_nodirty(struct inode *inode, qsize_t nr) { __dquot_free_space(inode, nr, 0); } static inline void dquot_free_space(struct inode *inode, qsize_t nr) { dquot_free_space_nodirty(inode, nr); mark_inode_dirty_sync(inode); } static inline void dquot_free_block_nodirty(struct inode *inode, qsize_t nr) { dquot_free_space_nodirty(inode, nr << inode->i_blkbits); } static inline void dquot_free_block(struct inode *inode, qsize_t nr) { dquot_free_space(inode, nr << inode->i_blkbits); } static inline void dquot_release_reservation_block(struct inode *inode, qsize_t nr) { __dquot_free_space(inode, nr << inode->i_blkbits, DQUOT_SPACE_RESERVE); } unsigned int qtype_enforce_flag(int type); #endif /* _LINUX_QUOTAOPS_ */
1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 // SPDX-License-Identifier: GPL-2.0-or-later /* * * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) */ #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/slab.h> #include <linux/in.h> #include <linux/kernel.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/spinlock.h> #include <net/ax25.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <net/sock.h> #include <linux/uaccess.h> #include <linux/fcntl.h> #include <linux/list.h> #include <linux/mm.h> #include <linux/interrupt.h> #include <linux/init.h> static LIST_HEAD(ax25_dev_list); DEFINE_SPINLOCK(ax25_dev_lock); ax25_dev *ax25_addr_ax25dev(ax25_address *addr) { ax25_dev *ax25_dev, *res = NULL; spin_lock_bh(&ax25_dev_lock); list_for_each_entry(ax25_dev, &ax25_dev_list, list) if (ax25cmp(addr, (const ax25_address *)ax25_dev->dev->dev_addr) == 0) { res = ax25_dev; ax25_dev_hold(ax25_dev); break; } spin_unlock_bh(&ax25_dev_lock); return res; } /* * This is called when an interface is brought up. These are * reasonable defaults. */ void ax25_dev_device_up(struct net_device *dev) { ax25_dev *ax25_dev; ax25_dev = kzalloc(sizeof(*ax25_dev), GFP_KERNEL); if (!ax25_dev) { printk(KERN_ERR "AX.25: ax25_dev_device_up - out of memory\n"); return; } refcount_set(&ax25_dev->refcount, 1); ax25_dev->dev = dev; netdev_hold(dev, &ax25_dev->dev_tracker, GFP_KERNEL); ax25_dev->forward = NULL; ax25_dev->device_up = true; ax25_dev->values[AX25_VALUES_IPDEFMODE] = AX25_DEF_IPDEFMODE; ax25_dev->values[AX25_VALUES_AXDEFMODE] = AX25_DEF_AXDEFMODE; ax25_dev->values[AX25_VALUES_BACKOFF] = AX25_DEF_BACKOFF; ax25_dev->values[AX25_VALUES_CONMODE] = AX25_DEF_CONMODE; ax25_dev->values[AX25_VALUES_WINDOW] = AX25_DEF_WINDOW; ax25_dev->values[AX25_VALUES_EWINDOW] = AX25_DEF_EWINDOW; ax25_dev->values[AX25_VALUES_T1] = AX25_DEF_T1; ax25_dev->values[AX25_VALUES_T2] = AX25_DEF_T2; ax25_dev->values[AX25_VALUES_T3] = AX25_DEF_T3; ax25_dev->values[AX25_VALUES_IDLE] = AX25_DEF_IDLE; ax25_dev->values[AX25_VALUES_N2] = AX25_DEF_N2; ax25_dev->values[AX25_VALUES_PACLEN] = AX25_DEF_PACLEN; ax25_dev->values[AX25_VALUES_PROTOCOL] = AX25_DEF_PROTOCOL; #ifdef CONFIG_AX25_DAMA_SLAVE ax25_dev->values[AX25_VALUES_DS_TIMEOUT]= AX25_DEF_DS_TIMEOUT; #endif #if defined(CONFIG_AX25_DAMA_SLAVE) || defined(CONFIG_AX25_DAMA_MASTER) ax25_ds_setup_timer(ax25_dev); #endif spin_lock_bh(&ax25_dev_lock); list_add(&ax25_dev->list, &ax25_dev_list); rcu_assign_pointer(dev->ax25_ptr, ax25_dev); spin_unlock_bh(&ax25_dev_lock); ax25_register_dev_sysctl(ax25_dev); } void ax25_dev_device_down(struct net_device *dev) { ax25_dev *s, *ax25_dev; if ((ax25_dev = ax25_dev_ax25dev(dev)) == NULL) return; ax25_unregister_dev_sysctl(ax25_dev); spin_lock_bh(&ax25_dev_lock); #ifdef CONFIG_AX25_DAMA_SLAVE timer_shutdown_sync(&ax25_dev->dama.slave_timer); #endif /* * Remove any packet forwarding that points to this device. */ list_for_each_entry(s, &ax25_dev_list, list) if (s->forward == dev) s->forward = NULL; list_for_each_entry(s, &ax25_dev_list, list) { if (s == ax25_dev) { list_del(&s->list); break; } } RCU_INIT_POINTER(dev->ax25_ptr, NULL); spin_unlock_bh(&ax25_dev_lock); netdev_put(dev, &ax25_dev->dev_tracker); ax25_dev_put(ax25_dev); } int ax25_fwd_ioctl(unsigned int cmd, struct ax25_fwd_struct *fwd) { ax25_dev *ax25_dev, *fwd_dev; if ((ax25_dev = ax25_addr_ax25dev(&fwd->port_from)) == NULL) return -EINVAL; switch (cmd) { case SIOCAX25ADDFWD: fwd_dev = ax25_addr_ax25dev(&fwd->port_to); if (!fwd_dev) { ax25_dev_put(ax25_dev); return -EINVAL; } if (ax25_dev->forward) { ax25_dev_put(fwd_dev); ax25_dev_put(ax25_dev); return -EINVAL; } ax25_dev->forward = fwd_dev->dev; ax25_dev_put(fwd_dev); ax25_dev_put(ax25_dev); break; case SIOCAX25DELFWD: if (!ax25_dev->forward) { ax25_dev_put(ax25_dev); return -EINVAL; } ax25_dev->forward = NULL; ax25_dev_put(ax25_dev); break; default: ax25_dev_put(ax25_dev); return -EINVAL; } return 0; } struct net_device *ax25_fwd_dev(struct net_device *dev) { ax25_dev *ax25_dev; if ((ax25_dev = ax25_dev_ax25dev(dev)) == NULL) return dev; if (ax25_dev->forward == NULL) return dev; return ax25_dev->forward; } /* * Free all memory associated with device structures. */ void __exit ax25_dev_free(void) { ax25_dev *s, *n; spin_lock_bh(&ax25_dev_lock); list_for_each_entry_safe(s, n, &ax25_dev_list, list) { netdev_put(s->dev, &s->dev_tracker); list_del(&s->list); ax25_dev_put(s); } spin_unlock_bh(&ax25_dev_lock); }
8 8 7 1 8 7 8 1 8 7 1 1 126 127 127 127 8 8 7 7 7 7 130 130 44 8 8 8 8 5 8 8 8 8 8 1 1 1 127 127 87 86 87 87 46 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 // SPDX-License-Identifier: GPL-2.0 /* Multipath TCP * * Copyright (c) 2019, Intel Corporation. */ #define pr_fmt(fmt) "MPTCP: " fmt #include <linux/rculist.h> #include <linux/spinlock.h> #include "protocol.h" #include "mib.h" #define ADD_ADDR_RETRANS_MAX 3 struct mptcp_pm_add_entry { struct list_head list; struct mptcp_addr_info addr; u8 retrans_times; struct timer_list add_timer; struct mptcp_sock *sock; struct rcu_head rcu; }; static DEFINE_SPINLOCK(mptcp_pm_list_lock); static LIST_HEAD(mptcp_pm_list); /* path manager helpers */ /* if sk is ipv4 or ipv6_only allows only same-family local and remote addresses, * otherwise allow any matching local/remote pair */ bool mptcp_pm_addr_families_match(const struct sock *sk, const struct mptcp_addr_info *loc, const struct mptcp_addr_info *rem) { bool mptcp_is_v4 = sk->sk_family == AF_INET; #if IS_ENABLED(CONFIG_MPTCP_IPV6) bool loc_is_v4 = loc->family == AF_INET || ipv6_addr_v4mapped(&loc->addr6); bool rem_is_v4 = rem->family == AF_INET || ipv6_addr_v4mapped(&rem->addr6); if (mptcp_is_v4) return loc_is_v4 && rem_is_v4; if (ipv6_only_sock(sk)) return !loc_is_v4 && !rem_is_v4; return loc_is_v4 == rem_is_v4; #else return mptcp_is_v4 && loc->family == AF_INET && rem->family == AF_INET; #endif } bool mptcp_addresses_equal(const struct mptcp_addr_info *a, const struct mptcp_addr_info *b, bool use_port) { bool addr_equals = false; if (a->family == b->family) { if (a->family == AF_INET) addr_equals = a->addr.s_addr == b->addr.s_addr; #if IS_ENABLED(CONFIG_MPTCP_IPV6) else addr_equals = ipv6_addr_equal(&a->addr6, &b->addr6); } else if (a->family == AF_INET) { if (ipv6_addr_v4mapped(&b->addr6)) addr_equals = a->addr.s_addr == b->addr6.s6_addr32[3]; } else if (b->family == AF_INET) { if (ipv6_addr_v4mapped(&a->addr6)) addr_equals = a->addr6.s6_addr32[3] == b->addr.s_addr; #endif } if (!addr_equals) return false; if (!use_port) return true; return a->port == b->port; } void mptcp_local_address(const struct sock_common *skc, struct mptcp_addr_info *addr) { addr->family = skc->skc_family; addr->port = htons(skc->skc_num); if (addr->family == AF_INET) addr->addr.s_addr = skc->skc_rcv_saddr; #if IS_ENABLED(CONFIG_MPTCP_IPV6) else if (addr->family == AF_INET6) addr->addr6 = skc->skc_v6_rcv_saddr; #endif } void mptcp_remote_address(const struct sock_common *skc, struct mptcp_addr_info *addr) { addr->family = skc->skc_family; addr->port = skc->skc_dport; if (addr->family == AF_INET) addr->addr.s_addr = skc->skc_daddr; #if IS_ENABLED(CONFIG_MPTCP_IPV6) else if (addr->family == AF_INET6) addr->addr6 = skc->skc_v6_daddr; #endif } static bool mptcp_pm_is_init_remote_addr(struct mptcp_sock *msk, const struct mptcp_addr_info *remote) { struct mptcp_addr_info mpc_remote; mptcp_remote_address((struct sock_common *)msk, &mpc_remote); return mptcp_addresses_equal(&mpc_remote, remote, remote->port); } bool mptcp_lookup_subflow_by_saddr(const struct list_head *list, const struct mptcp_addr_info *saddr) { struct mptcp_subflow_context *subflow; struct mptcp_addr_info cur; struct sock_common *skc; list_for_each_entry(subflow, list, node) { skc = (struct sock_common *)mptcp_subflow_tcp_sock(subflow); mptcp_local_address(skc, &cur); if (mptcp_addresses_equal(&cur, saddr, saddr->port)) return true; } return false; } static struct mptcp_pm_add_entry * mptcp_lookup_anno_list_by_saddr(const struct mptcp_sock *msk, const struct mptcp_addr_info *addr) { struct mptcp_pm_add_entry *entry; lockdep_assert_held(&msk->pm.lock); list_for_each_entry(entry, &msk->pm.anno_list, list) { if (mptcp_addresses_equal(&entry->addr, addr, true)) return entry; } return NULL; } bool mptcp_remove_anno_list_by_saddr(struct mptcp_sock *msk, const struct mptcp_addr_info *addr) { struct mptcp_pm_add_entry *entry; bool ret; entry = mptcp_pm_del_add_timer(msk, addr, false); ret = entry; kfree_rcu(entry, rcu); return ret; } bool mptcp_pm_sport_in_anno_list(struct mptcp_sock *msk, const struct sock *sk) { struct mptcp_pm_add_entry *entry; struct mptcp_addr_info saddr; bool ret = false; mptcp_local_address((struct sock_common *)sk, &saddr); spin_lock_bh(&msk->pm.lock); list_for_each_entry(entry, &msk->pm.anno_list, list) { if (mptcp_addresses_equal(&entry->addr, &saddr, true)) { ret = true; goto out; } } out: spin_unlock_bh(&msk->pm.lock); return ret; } static void __mptcp_pm_send_ack(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow, bool prio, bool backup) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); bool slow; pr_debug("send ack for %s\n", prio ? "mp_prio" : (mptcp_pm_should_add_signal(msk) ? "add_addr" : "rm_addr")); slow = lock_sock_fast(ssk); if (prio) { subflow->send_mp_prio = 1; subflow->request_bkup = backup; } __mptcp_subflow_send_ack(ssk); unlock_sock_fast(ssk, slow); } void mptcp_pm_send_ack(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow, bool prio, bool backup) { spin_unlock_bh(&msk->pm.lock); __mptcp_pm_send_ack(msk, subflow, prio, backup); spin_lock_bh(&msk->pm.lock); } void mptcp_pm_addr_send_ack(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow, *alt = NULL; msk_owned_by_me(msk); lockdep_assert_held(&msk->pm.lock); if (!mptcp_pm_should_add_signal(msk) && !mptcp_pm_should_rm_signal(msk)) return; mptcp_for_each_subflow(msk, subflow) { if (__mptcp_subflow_active(subflow)) { if (!subflow->stale) { mptcp_pm_send_ack(msk, subflow, false, false); return; } if (!alt) alt = subflow; } } if (alt) mptcp_pm_send_ack(msk, alt, false, false); } int mptcp_pm_mp_prio_send_ack(struct mptcp_sock *msk, struct mptcp_addr_info *addr, struct mptcp_addr_info *rem, u8 bkup) { struct mptcp_subflow_context *subflow; pr_debug("bkup=%d\n", bkup); mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); struct mptcp_addr_info local, remote; mptcp_local_address((struct sock_common *)ssk, &local); if (!mptcp_addresses_equal(&local, addr, addr->port)) continue; if (rem && rem->family != AF_UNSPEC) { mptcp_remote_address((struct sock_common *)ssk, &remote); if (!mptcp_addresses_equal(&remote, rem, rem->port)) continue; } __mptcp_pm_send_ack(msk, subflow, true, bkup); return 0; } return -EINVAL; } static unsigned int mptcp_adjust_add_addr_timeout(struct mptcp_sock *msk) { const struct net *net = sock_net((struct sock *)msk); unsigned int rto = mptcp_get_add_addr_timeout(net); struct mptcp_subflow_context *subflow; unsigned int max = 0; mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); struct inet_connection_sock *icsk = inet_csk(ssk); if (icsk->icsk_rto > max) max = icsk->icsk_rto; } if (max && max < rto) rto = max; return rto; } static void mptcp_pm_add_timer(struct timer_list *timer) { struct mptcp_pm_add_entry *entry = timer_container_of(entry, timer, add_timer); struct mptcp_sock *msk = entry->sock; struct sock *sk = (struct sock *)msk; unsigned int timeout; pr_debug("msk=%p\n", msk); if (!msk) return; if (inet_sk_state_load(sk) == TCP_CLOSE) return; if (!entry->addr.id) return; if (mptcp_pm_should_add_signal_addr(msk)) { sk_reset_timer(sk, timer, jiffies + TCP_RTO_MAX / 8); goto out; } timeout = mptcp_adjust_add_addr_timeout(msk); if (!timeout) goto out; spin_lock_bh(&msk->pm.lock); if (!mptcp_pm_should_add_signal_addr(msk)) { pr_debug("retransmit ADD_ADDR id=%d\n", entry->addr.id); mptcp_pm_announce_addr(msk, &entry->addr, false); mptcp_pm_add_addr_send_ack(msk); entry->retrans_times++; } if (entry->retrans_times < ADD_ADDR_RETRANS_MAX) sk_reset_timer(sk, timer, jiffies + (timeout << entry->retrans_times)); spin_unlock_bh(&msk->pm.lock); if (entry->retrans_times == ADD_ADDR_RETRANS_MAX) mptcp_pm_subflow_established(msk); out: __sock_put(sk); } struct mptcp_pm_add_entry * mptcp_pm_del_add_timer(struct mptcp_sock *msk, const struct mptcp_addr_info *addr, bool check_id) { struct mptcp_pm_add_entry *entry; struct sock *sk = (struct sock *)msk; bool stop_timer = false; rcu_read_lock(); spin_lock_bh(&msk->pm.lock); entry = mptcp_lookup_anno_list_by_saddr(msk, addr); if (entry && (!check_id || entry->addr.id == addr->id)) { entry->retrans_times = ADD_ADDR_RETRANS_MAX; stop_timer = true; } if (!check_id && entry) list_del(&entry->list); spin_unlock_bh(&msk->pm.lock); /* Note: entry might have been removed by another thread. * We hold rcu_read_lock() to ensure it is not freed under us. */ if (stop_timer) sk_stop_timer_sync(sk, &entry->add_timer); rcu_read_unlock(); return entry; } bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, const struct mptcp_addr_info *addr) { struct mptcp_pm_add_entry *add_entry = NULL; struct sock *sk = (struct sock *)msk; unsigned int timeout; lockdep_assert_held(&msk->pm.lock); add_entry = mptcp_lookup_anno_list_by_saddr(msk, addr); if (add_entry) { if (WARN_ON_ONCE(mptcp_pm_is_kernel(msk))) return false; goto reset_timer; } add_entry = kmalloc(sizeof(*add_entry), GFP_ATOMIC); if (!add_entry) return false; list_add(&add_entry->list, &msk->pm.anno_list); add_entry->addr = *addr; add_entry->sock = msk; add_entry->retrans_times = 0; timer_setup(&add_entry->add_timer, mptcp_pm_add_timer, 0); reset_timer: timeout = mptcp_adjust_add_addr_timeout(msk); if (timeout) sk_reset_timer(sk, &add_entry->add_timer, jiffies + timeout); return true; } static void mptcp_pm_free_anno_list(struct mptcp_sock *msk) { struct mptcp_pm_add_entry *entry, *tmp; struct sock *sk = (struct sock *)msk; LIST_HEAD(free_list); pr_debug("msk=%p\n", msk); spin_lock_bh(&msk->pm.lock); list_splice_init(&msk->pm.anno_list, &free_list); spin_unlock_bh(&msk->pm.lock); list_for_each_entry_safe(entry, tmp, &free_list, list) { sk_stop_timer_sync(sk, &entry->add_timer); kfree_rcu(entry, rcu); } } /* path manager command handlers */ int mptcp_pm_announce_addr(struct mptcp_sock *msk, const struct mptcp_addr_info *addr, bool echo) { u8 add_addr = READ_ONCE(msk->pm.addr_signal); pr_debug("msk=%p, local_id=%d, echo=%d\n", msk, addr->id, echo); lockdep_assert_held(&msk->pm.lock); if (add_addr & (echo ? BIT(MPTCP_ADD_ADDR_ECHO) : BIT(MPTCP_ADD_ADDR_SIGNAL))) { MPTCP_INC_STATS(sock_net((struct sock *)msk), echo ? MPTCP_MIB_ECHOADDTXDROP : MPTCP_MIB_ADDADDRTXDROP); return -EINVAL; } if (echo) { msk->pm.remote = *addr; add_addr |= BIT(MPTCP_ADD_ADDR_ECHO); } else { msk->pm.local = *addr; add_addr |= BIT(MPTCP_ADD_ADDR_SIGNAL); } WRITE_ONCE(msk->pm.addr_signal, add_addr); return 0; } int mptcp_pm_remove_addr(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list) { u8 rm_addr = READ_ONCE(msk->pm.addr_signal); pr_debug("msk=%p, rm_list_nr=%d\n", msk, rm_list->nr); if (rm_addr) { MPTCP_ADD_STATS(sock_net((struct sock *)msk), MPTCP_MIB_RMADDRTXDROP, rm_list->nr); return -EINVAL; } msk->pm.rm_list_tx = *rm_list; rm_addr |= BIT(MPTCP_RM_ADDR_SIGNAL); WRITE_ONCE(msk->pm.addr_signal, rm_addr); mptcp_pm_addr_send_ack(msk); return 0; } /* path manager event handlers */ void mptcp_pm_new_connection(struct mptcp_sock *msk, const struct sock *ssk, int server_side) { struct mptcp_pm_data *pm = &msk->pm; pr_debug("msk=%p, token=%u side=%d\n", msk, READ_ONCE(msk->token), server_side); WRITE_ONCE(pm->server_side, server_side); mptcp_event(MPTCP_EVENT_CREATED, msk, ssk, GFP_ATOMIC); } bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk) { struct mptcp_pm_data *pm = &msk->pm; unsigned int limit_extra_subflows; int ret = 0; if (mptcp_pm_is_userspace(msk)) { if (mptcp_userspace_pm_active(msk)) { spin_lock_bh(&pm->lock); pm->extra_subflows++; spin_unlock_bh(&pm->lock); return true; } return false; } limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk); pr_debug("msk=%p subflows=%d max=%d allow=%d\n", msk, pm->extra_subflows, limit_extra_subflows, READ_ONCE(pm->accept_subflow)); /* try to avoid acquiring the lock below */ if (!READ_ONCE(pm->accept_subflow)) return false; spin_lock_bh(&pm->lock); if (READ_ONCE(pm->accept_subflow)) { ret = pm->extra_subflows < limit_extra_subflows; if (ret && ++pm->extra_subflows == limit_extra_subflows) WRITE_ONCE(pm->accept_subflow, false); } spin_unlock_bh(&pm->lock); return ret; } /* return true if the new status bit is currently cleared, that is, this event * can be server, eventually by an already scheduled work */ static bool mptcp_pm_schedule_work(struct mptcp_sock *msk, enum mptcp_pm_status new_status) { pr_debug("msk=%p status=%x new=%lx\n", msk, msk->pm.status, BIT(new_status)); if (msk->pm.status & BIT(new_status)) return false; msk->pm.status |= BIT(new_status); mptcp_schedule_work((struct sock *)msk); return true; } void mptcp_pm_fully_established(struct mptcp_sock *msk, const struct sock *ssk) { struct mptcp_pm_data *pm = &msk->pm; bool announce = false; pr_debug("msk=%p\n", msk); spin_lock_bh(&pm->lock); /* mptcp_pm_fully_established() can be invoked by multiple * racing paths - accept() and check_fully_established() * be sure to serve this event only once. */ if (READ_ONCE(pm->work_pending) && !(pm->status & BIT(MPTCP_PM_ALREADY_ESTABLISHED))) mptcp_pm_schedule_work(msk, MPTCP_PM_ESTABLISHED); if ((pm->status & BIT(MPTCP_PM_ALREADY_ESTABLISHED)) == 0) announce = true; pm->status |= BIT(MPTCP_PM_ALREADY_ESTABLISHED); spin_unlock_bh(&pm->lock); if (announce) mptcp_event(MPTCP_EVENT_ESTABLISHED, msk, ssk, GFP_ATOMIC); } void mptcp_pm_connection_closed(struct mptcp_sock *msk) { pr_debug("msk=%p\n", msk); if (msk->token) mptcp_event(MPTCP_EVENT_CLOSED, msk, NULL, GFP_KERNEL); } void mptcp_pm_subflow_established(struct mptcp_sock *msk) { struct mptcp_pm_data *pm = &msk->pm; pr_debug("msk=%p\n", msk); if (!READ_ONCE(pm->work_pending)) return; spin_lock_bh(&pm->lock); if (READ_ONCE(pm->work_pending)) mptcp_pm_schedule_work(msk, MPTCP_PM_SUBFLOW_ESTABLISHED); spin_unlock_bh(&pm->lock); } void mptcp_pm_subflow_check_next(struct mptcp_sock *msk, const struct mptcp_subflow_context *subflow) { struct sock *sk = (struct sock *)msk; struct mptcp_pm_data *pm = &msk->pm; bool update_subflows; update_subflows = subflow->request_join || subflow->mp_join; if (mptcp_pm_is_userspace(msk)) { if (update_subflows) { spin_lock_bh(&pm->lock); pm->extra_subflows--; spin_unlock_bh(&pm->lock); } return; } if (!READ_ONCE(pm->work_pending) && !update_subflows) return; spin_lock_bh(&pm->lock); if (update_subflows) __mptcp_pm_close_subflow(msk); /* Even if this subflow is not really established, tell the PM to try * to pick the next ones, if possible. */ if (mptcp_is_fully_established(sk) && mptcp_pm_nl_check_work_pending(msk)) mptcp_pm_schedule_work(msk, MPTCP_PM_SUBFLOW_ESTABLISHED); spin_unlock_bh(&pm->lock); } void mptcp_pm_add_addr_received(const struct sock *ssk, const struct mptcp_addr_info *addr) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); struct mptcp_pm_data *pm = &msk->pm; pr_debug("msk=%p remote_id=%d accept=%d\n", msk, addr->id, READ_ONCE(pm->accept_addr)); mptcp_event_addr_announced(ssk, addr); spin_lock_bh(&pm->lock); if (mptcp_pm_is_userspace(msk)) { if (mptcp_userspace_pm_active(msk)) { mptcp_pm_announce_addr(msk, addr, true); mptcp_pm_add_addr_send_ack(msk); } else { __MPTCP_INC_STATS(sock_net((struct sock *)msk), MPTCP_MIB_ADDADDRDROP); } /* - id0 should not have a different address * - special case for C-flag: linked to fill_local_addresses_vec() */ } else if ((addr->id == 0 && !mptcp_pm_is_init_remote_addr(msk, addr)) || (addr->id > 0 && !READ_ONCE(pm->accept_addr) && !mptcp_pm_add_addr_c_flag_case(msk))) { mptcp_pm_announce_addr(msk, addr, true); mptcp_pm_add_addr_send_ack(msk); } else if (mptcp_pm_schedule_work(msk, MPTCP_PM_ADD_ADDR_RECEIVED)) { pm->remote = *addr; } else { __MPTCP_INC_STATS(sock_net((struct sock *)msk), MPTCP_MIB_ADDADDRDROP); } spin_unlock_bh(&pm->lock); } void mptcp_pm_add_addr_echoed(struct mptcp_sock *msk, const struct mptcp_addr_info *addr) { struct mptcp_pm_data *pm = &msk->pm; pr_debug("msk=%p\n", msk); if (!READ_ONCE(pm->work_pending)) return; spin_lock_bh(&pm->lock); if (mptcp_lookup_anno_list_by_saddr(msk, addr) && READ_ONCE(pm->work_pending)) mptcp_pm_schedule_work(msk, MPTCP_PM_SUBFLOW_ESTABLISHED); spin_unlock_bh(&pm->lock); } void mptcp_pm_add_addr_send_ack(struct mptcp_sock *msk) { if (!mptcp_pm_should_add_signal(msk)) return; mptcp_pm_schedule_work(msk, MPTCP_PM_ADD_ADDR_SEND_ACK); } static void mptcp_pm_rm_addr_or_subflow(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list, enum linux_mptcp_mib_field rm_type) { struct mptcp_subflow_context *subflow, *tmp; struct sock *sk = (struct sock *)msk; u8 i; pr_debug("%s rm_list_nr %d\n", rm_type == MPTCP_MIB_RMADDR ? "address" : "subflow", rm_list->nr); msk_owned_by_me(msk); if (sk->sk_state == TCP_LISTEN) return; if (!rm_list->nr) return; if (list_empty(&msk->conn_list)) return; for (i = 0; i < rm_list->nr; i++) { u8 rm_id = rm_list->ids[i]; bool removed = false; mptcp_for_each_subflow_safe(msk, subflow, tmp) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); u8 remote_id = READ_ONCE(subflow->remote_id); int how = RCV_SHUTDOWN | SEND_SHUTDOWN; u8 id = subflow_get_local_id(subflow); if ((1 << inet_sk_state_load(ssk)) & (TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | TCPF_CLOSING | TCPF_CLOSE)) continue; if (rm_type == MPTCP_MIB_RMADDR && remote_id != rm_id) continue; if (rm_type == MPTCP_MIB_RMSUBFLOW && id != rm_id) continue; pr_debug(" -> %s rm_list_ids[%d]=%u local_id=%u remote_id=%u mpc_id=%u\n", rm_type == MPTCP_MIB_RMADDR ? "address" : "subflow", i, rm_id, id, remote_id, msk->mpc_endpoint_id); spin_unlock_bh(&msk->pm.lock); mptcp_subflow_shutdown(sk, ssk, how); removed |= subflow->request_join; /* the following takes care of updating the subflows counter */ mptcp_close_ssk(sk, ssk, subflow); spin_lock_bh(&msk->pm.lock); if (rm_type == MPTCP_MIB_RMSUBFLOW) __MPTCP_INC_STATS(sock_net(sk), rm_type); } if (rm_type == MPTCP_MIB_RMADDR) { __MPTCP_INC_STATS(sock_net(sk), rm_type); if (removed && mptcp_pm_is_kernel(msk)) mptcp_pm_nl_rm_addr(msk, rm_id); } } } static void mptcp_pm_rm_addr_recv(struct mptcp_sock *msk) { mptcp_pm_rm_addr_or_subflow(msk, &msk->pm.rm_list_rx, MPTCP_MIB_RMADDR); } void mptcp_pm_rm_subflow(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list) { mptcp_pm_rm_addr_or_subflow(msk, rm_list, MPTCP_MIB_RMSUBFLOW); } void mptcp_pm_rm_addr_received(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list) { struct mptcp_pm_data *pm = &msk->pm; u8 i; pr_debug("msk=%p remote_ids_nr=%d\n", msk, rm_list->nr); for (i = 0; i < rm_list->nr; i++) mptcp_event_addr_removed(msk, rm_list->ids[i]); spin_lock_bh(&pm->lock); if (mptcp_pm_schedule_work(msk, MPTCP_PM_RM_ADDR_RECEIVED)) pm->rm_list_rx = *rm_list; else __MPTCP_INC_STATS(sock_net((struct sock *)msk), MPTCP_MIB_RMADDRDROP); spin_unlock_bh(&pm->lock); } void mptcp_pm_mp_prio_received(struct sock *ssk, u8 bkup) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct sock *sk = subflow->conn; struct mptcp_sock *msk; pr_debug("subflow->backup=%d, bkup=%d\n", subflow->backup, bkup); msk = mptcp_sk(sk); if (subflow->backup != bkup) subflow->backup = bkup; mptcp_event(MPTCP_EVENT_SUB_PRIORITY, msk, ssk, GFP_ATOMIC); } void mptcp_pm_mp_fail_received(struct sock *sk, u64 fail_seq) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); pr_debug("fail_seq=%llu\n", fail_seq); /* After accepting the fail, we can't create any other subflows */ spin_lock_bh(&msk->fallback_lock); if (!msk->allow_infinite_fallback) { spin_unlock_bh(&msk->fallback_lock); return; } msk->allow_subflows = false; spin_unlock_bh(&msk->fallback_lock); if (!subflow->fail_tout) { pr_debug("send MP_FAIL response and infinite map\n"); subflow->send_mp_fail = 1; subflow->send_infinite_map = 1; tcp_send_ack(sk); } else { pr_debug("MP_FAIL response received\n"); WRITE_ONCE(subflow->fail_tout, 0); } } bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, const struct sk_buff *skb, unsigned int opt_size, unsigned int remaining, struct mptcp_addr_info *addr, bool *echo, bool *drop_other_suboptions) { int ret = false; u8 add_addr; u8 family; bool port; spin_lock_bh(&msk->pm.lock); /* double check after the lock is acquired */ if (!mptcp_pm_should_add_signal(msk)) goto out_unlock; /* always drop every other options for pure ack ADD_ADDR; this is a * plain dup-ack from TCP perspective. The other MPTCP-relevant info, * if any, will be carried by the 'original' TCP ack */ if (skb && skb_is_tcp_pure_ack(skb)) { remaining += opt_size; *drop_other_suboptions = true; } *echo = mptcp_pm_should_add_signal_echo(msk); port = !!(*echo ? msk->pm.remote.port : msk->pm.local.port); family = *echo ? msk->pm.remote.family : msk->pm.local.family; if (remaining < mptcp_add_addr_len(family, *echo, port)) goto out_unlock; if (*echo) { *addr = msk->pm.remote; add_addr = msk->pm.addr_signal & ~BIT(MPTCP_ADD_ADDR_ECHO); } else { *addr = msk->pm.local; add_addr = msk->pm.addr_signal & ~BIT(MPTCP_ADD_ADDR_SIGNAL); } WRITE_ONCE(msk->pm.addr_signal, add_addr); ret = true; out_unlock: spin_unlock_bh(&msk->pm.lock); return ret; } bool mptcp_pm_rm_addr_signal(struct mptcp_sock *msk, unsigned int remaining, struct mptcp_rm_list *rm_list) { int ret = false, len; u8 rm_addr; spin_lock_bh(&msk->pm.lock); /* double check after the lock is acquired */ if (!mptcp_pm_should_rm_signal(msk)) goto out_unlock; rm_addr = msk->pm.addr_signal & ~BIT(MPTCP_RM_ADDR_SIGNAL); len = mptcp_rm_addr_len(&msk->pm.rm_list_tx); if (len < 0) { WRITE_ONCE(msk->pm.addr_signal, rm_addr); goto out_unlock; } if (remaining < len) goto out_unlock; *rm_list = msk->pm.rm_list_tx; WRITE_ONCE(msk->pm.addr_signal, rm_addr); ret = true; out_unlock: spin_unlock_bh(&msk->pm.lock); return ret; } int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc) { struct mptcp_pm_addr_entry skc_local = { 0 }; struct mptcp_addr_info msk_local; if (WARN_ON_ONCE(!msk)) return -1; /* The 0 ID mapping is defined by the first subflow, copied into the msk * addr */ mptcp_local_address((struct sock_common *)msk, &msk_local); mptcp_local_address((struct sock_common *)skc, &skc_local.addr); if (mptcp_addresses_equal(&msk_local, &skc_local.addr, false)) return 0; skc_local.addr.id = 0; skc_local.flags = MPTCP_PM_ADDR_FLAG_IMPLICIT; if (mptcp_pm_is_userspace(msk)) return mptcp_userspace_pm_get_local_id(msk, &skc_local); return mptcp_pm_nl_get_local_id(msk, &skc_local); } bool mptcp_pm_is_backup(struct mptcp_sock *msk, struct sock_common *skc) { struct mptcp_addr_info skc_local; mptcp_local_address((struct sock_common *)skc, &skc_local); if (mptcp_pm_is_userspace(msk)) return mptcp_userspace_pm_is_backup(msk, &skc_local); return mptcp_pm_nl_is_backup(msk, &skc_local); } static void mptcp_pm_subflows_chk_stale(const struct mptcp_sock *msk, struct sock *ssk) { struct mptcp_subflow_context *iter, *subflow = mptcp_subflow_ctx(ssk); struct sock *sk = (struct sock *)msk; unsigned int active_max_loss_cnt; struct net *net = sock_net(sk); unsigned int stale_loss_cnt; bool slow; stale_loss_cnt = mptcp_stale_loss_cnt(net); if (subflow->stale || !stale_loss_cnt || subflow->stale_count <= stale_loss_cnt) return; /* look for another available subflow not in loss state */ active_max_loss_cnt = max_t(int, stale_loss_cnt - 1, 1); mptcp_for_each_subflow(msk, iter) { if (iter != subflow && mptcp_subflow_active(iter) && iter->stale_count < active_max_loss_cnt) { /* we have some alternatives, try to mark this subflow as idle ...*/ slow = lock_sock_fast(ssk); if (!tcp_rtx_and_write_queues_empty(ssk)) { subflow->stale = 1; __mptcp_retransmit_pending_data(sk); MPTCP_INC_STATS(net, MPTCP_MIB_SUBFLOWSTALE); } unlock_sock_fast(ssk, slow); /* always try to push the pending data regardless of re-injections: * we can possibly use backup subflows now, and subflow selection * is cheap under the msk socket lock */ __mptcp_push_pending(sk, 0); return; } } } void mptcp_pm_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); u32 rcv_tstamp = READ_ONCE(tcp_sk(ssk)->rcv_tstamp); /* keep track of rtx periods with no progress */ if (!subflow->stale_count) { subflow->stale_rcv_tstamp = rcv_tstamp; subflow->stale_count++; } else if (subflow->stale_rcv_tstamp == rcv_tstamp) { if (subflow->stale_count < U8_MAX) subflow->stale_count++; mptcp_pm_subflows_chk_stale(msk, ssk); } else { subflow->stale_count = 0; mptcp_subflow_set_active(subflow); } } void mptcp_pm_worker(struct mptcp_sock *msk) { struct mptcp_pm_data *pm = &msk->pm; msk_owned_by_me(msk); if (!(pm->status & MPTCP_PM_WORK_MASK)) return; spin_lock_bh(&msk->pm.lock); pr_debug("msk=%p status=%x\n", msk, pm->status); if (pm->status & BIT(MPTCP_PM_ADD_ADDR_SEND_ACK)) { pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_SEND_ACK); mptcp_pm_addr_send_ack(msk); } if (pm->status & BIT(MPTCP_PM_RM_ADDR_RECEIVED)) { pm->status &= ~BIT(MPTCP_PM_RM_ADDR_RECEIVED); mptcp_pm_rm_addr_recv(msk); } __mptcp_pm_kernel_worker(msk); spin_unlock_bh(&msk->pm.lock); } void mptcp_pm_destroy(struct mptcp_sock *msk) { mptcp_pm_free_anno_list(msk); if (mptcp_pm_is_userspace(msk)) mptcp_userspace_pm_free_local_addr_list(msk); } void mptcp_pm_data_reset(struct mptcp_sock *msk) { u8 pm_type = mptcp_get_pm_type(sock_net((struct sock *)msk)); struct mptcp_pm_data *pm = &msk->pm; memset(&pm->reset, 0, sizeof(pm->reset)); pm->rm_list_tx.nr = 0; pm->rm_list_rx.nr = 0; WRITE_ONCE(pm->pm_type, pm_type); if (pm_type == MPTCP_PM_TYPE_KERNEL) { bool subflows_allowed = !!mptcp_pm_get_limit_extra_subflows(msk); /* pm->work_pending must be only be set to 'true' when * pm->pm_type is set to MPTCP_PM_TYPE_KERNEL */ WRITE_ONCE(pm->work_pending, (!!mptcp_pm_get_endp_subflow_max(msk) && subflows_allowed) || !!mptcp_pm_get_endp_signal_max(msk)); WRITE_ONCE(pm->accept_addr, !!mptcp_pm_get_limit_add_addr_accepted(msk) && subflows_allowed); WRITE_ONCE(pm->accept_subflow, subflows_allowed); bitmap_fill(pm->id_avail_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); } } void mptcp_pm_data_init(struct mptcp_sock *msk) { spin_lock_init(&msk->pm.lock); INIT_LIST_HEAD(&msk->pm.anno_list); INIT_LIST_HEAD(&msk->pm.userspace_pm_local_addr_list); mptcp_pm_data_reset(msk); } void __init mptcp_pm_init(void) { mptcp_pm_kernel_register(); mptcp_pm_userspace_register(); mptcp_pm_nl_init(); } /* Must be called with rcu read lock held */ struct mptcp_pm_ops *mptcp_pm_find(const char *name) { struct mptcp_pm_ops *pm_ops; list_for_each_entry_rcu(pm_ops, &mptcp_pm_list, list) { if (!strcmp(pm_ops->name, name)) return pm_ops; } return NULL; } int mptcp_pm_validate(struct mptcp_pm_ops *pm_ops) { return 0; } int mptcp_pm_register(struct mptcp_pm_ops *pm_ops) { int ret; ret = mptcp_pm_validate(pm_ops); if (ret) return ret; spin_lock(&mptcp_pm_list_lock); if (mptcp_pm_find(pm_ops->name)) { spin_unlock(&mptcp_pm_list_lock); return -EEXIST; } list_add_tail_rcu(&pm_ops->list, &mptcp_pm_list); spin_unlock(&mptcp_pm_list_lock); pr_debug("%s registered\n", pm_ops->name); return 0; } void mptcp_pm_unregister(struct mptcp_pm_ops *pm_ops) { /* skip unregistering the default path manager */ if (WARN_ON_ONCE(pm_ops == &mptcp_pm_kernel)) return; spin_lock(&mptcp_pm_list_lock); list_del_rcu(&pm_ops->list); spin_unlock(&mptcp_pm_list_lock); } /* Build string with list of available path manager values. * Similar to tcp_get_available_congestion_control() */ void mptcp_pm_get_available(char *buf, size_t maxlen) { struct mptcp_pm_ops *pm_ops; size_t offs = 0; rcu_read_lock(); list_for_each_entry_rcu(pm_ops, &mptcp_pm_list, list) { offs += snprintf(buf + offs, maxlen - offs, "%s%s", offs == 0 ? "" : " ", pm_ops->name); if (WARN_ON_ONCE(offs >= maxlen)) break; } rcu_read_unlock(); }
10 10 10 10 980 3 3 981 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2007, 2008, 2009 Siemens AG */ #include <linux/slab.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/device.h> #include <net/cfg802154.h> #include <net/rtnetlink.h> #include "ieee802154.h" #include "nl802154.h" #include "sysfs.h" #include "core.h" /* name for sysfs, %d is appended */ #define PHY_NAME "phy" /* RCU-protected (and RTNL for writers) */ LIST_HEAD(cfg802154_rdev_list); int cfg802154_rdev_list_generation; struct wpan_phy *wpan_phy_find(const char *str) { struct device *dev; if (WARN_ON(!str)) return NULL; dev = class_find_device_by_name(&wpan_phy_class, str); if (!dev) return NULL; return container_of(dev, struct wpan_phy, dev); } EXPORT_SYMBOL(wpan_phy_find); struct wpan_phy_iter_data { int (*fn)(struct wpan_phy *phy, void *data); void *data; }; static int wpan_phy_iter(struct device *dev, void *_data) { struct wpan_phy_iter_data *wpid = _data; struct wpan_phy *phy = container_of(dev, struct wpan_phy, dev); return wpid->fn(phy, wpid->data); } int wpan_phy_for_each(int (*fn)(struct wpan_phy *phy, void *data), void *data) { struct wpan_phy_iter_data wpid = { .fn = fn, .data = data, }; return class_for_each_device(&wpan_phy_class, NULL, &wpid, wpan_phy_iter); } EXPORT_SYMBOL(wpan_phy_for_each); struct cfg802154_registered_device * cfg802154_rdev_by_wpan_phy_idx(int wpan_phy_idx) { struct cfg802154_registered_device *result = NULL, *rdev; ASSERT_RTNL(); list_for_each_entry(rdev, &cfg802154_rdev_list, list) { if (rdev->wpan_phy_idx == wpan_phy_idx) { result = rdev; break; } } return result; } struct wpan_phy *wpan_phy_idx_to_wpan_phy(int wpan_phy_idx) { struct cfg802154_registered_device *rdev; ASSERT_RTNL(); rdev = cfg802154_rdev_by_wpan_phy_idx(wpan_phy_idx); if (!rdev) return NULL; return &rdev->wpan_phy; } struct wpan_phy * wpan_phy_new(const struct cfg802154_ops *ops, size_t priv_size) { static atomic_t wpan_phy_counter = ATOMIC_INIT(0); struct cfg802154_registered_device *rdev; size_t alloc_size; alloc_size = sizeof(*rdev) + priv_size; rdev = kzalloc(alloc_size, GFP_KERNEL); if (!rdev) return NULL; rdev->ops = ops; rdev->wpan_phy_idx = atomic_inc_return(&wpan_phy_counter); if (unlikely(rdev->wpan_phy_idx < 0)) { /* ugh, wrapped! */ atomic_dec(&wpan_phy_counter); kfree(rdev); return NULL; } /* atomic_inc_return makes it start at 1, make it start at 0 */ rdev->wpan_phy_idx--; INIT_LIST_HEAD(&rdev->wpan_dev_list); device_initialize(&rdev->wpan_phy.dev); dev_set_name(&rdev->wpan_phy.dev, PHY_NAME "%d", rdev->wpan_phy_idx); rdev->wpan_phy.dev.class = &wpan_phy_class; rdev->wpan_phy.dev.platform_data = rdev; wpan_phy_net_set(&rdev->wpan_phy, &init_net); init_waitqueue_head(&rdev->dev_wait); init_waitqueue_head(&rdev->wpan_phy.sync_txq); spin_lock_init(&rdev->wpan_phy.queue_lock); return &rdev->wpan_phy; } EXPORT_SYMBOL(wpan_phy_new); int wpan_phy_register(struct wpan_phy *phy) { struct cfg802154_registered_device *rdev = wpan_phy_to_rdev(phy); int ret; rtnl_lock(); ret = device_add(&phy->dev); if (ret) { rtnl_unlock(); return ret; } list_add_rcu(&rdev->list, &cfg802154_rdev_list); cfg802154_rdev_list_generation++; /* TODO phy registered lock */ rtnl_unlock(); /* TODO nl802154 phy notify */ return 0; } EXPORT_SYMBOL(wpan_phy_register); void wpan_phy_unregister(struct wpan_phy *phy) { struct cfg802154_registered_device *rdev = wpan_phy_to_rdev(phy); wait_event(rdev->dev_wait, ({ int __count; rtnl_lock(); __count = rdev->opencount; rtnl_unlock(); __count == 0; })); rtnl_lock(); /* TODO nl802154 phy notify */ /* TODO phy registered lock */ WARN_ON(!list_empty(&rdev->wpan_dev_list)); /* First remove the hardware from everywhere, this makes * it impossible to find from userspace. */ list_del_rcu(&rdev->list); synchronize_rcu(); cfg802154_rdev_list_generation++; device_del(&phy->dev); rtnl_unlock(); } EXPORT_SYMBOL(wpan_phy_unregister); void wpan_phy_free(struct wpan_phy *phy) { put_device(&phy->dev); } EXPORT_SYMBOL(wpan_phy_free); static void cfg802154_free_peer_structures(struct wpan_dev *wpan_dev) { struct ieee802154_pan_device *child, *tmp; mutex_lock(&wpan_dev->association_lock); kfree(wpan_dev->parent); wpan_dev->parent = NULL; list_for_each_entry_safe(child, tmp, &wpan_dev->children, node) { list_del(&child->node); kfree(child); } wpan_dev->nchildren = 0; mutex_unlock(&wpan_dev->association_lock); } int cfg802154_switch_netns(struct cfg802154_registered_device *rdev, struct net *net) { struct wpan_dev *wpan_dev; int err = 0; list_for_each_entry(wpan_dev, &rdev->wpan_dev_list, list) { if (!wpan_dev->netdev) continue; wpan_dev->netdev->netns_immutable = false; err = dev_change_net_namespace(wpan_dev->netdev, net, "wpan%d"); if (err) break; wpan_dev->netdev->netns_immutable = true; } if (err) { /* failed -- clean up to old netns */ net = wpan_phy_net(&rdev->wpan_phy); list_for_each_entry_continue_reverse(wpan_dev, &rdev->wpan_dev_list, list) { if (!wpan_dev->netdev) continue; wpan_dev->netdev->netns_immutable = false; err = dev_change_net_namespace(wpan_dev->netdev, net, "wpan%d"); WARN_ON(err); wpan_dev->netdev->netns_immutable = true; } return err; } wpan_phy_net_set(&rdev->wpan_phy, net); err = device_rename(&rdev->wpan_phy.dev, dev_name(&rdev->wpan_phy.dev)); WARN_ON(err); return 0; } void cfg802154_dev_free(struct cfg802154_registered_device *rdev) { kfree(rdev); } static void cfg802154_update_iface_num(struct cfg802154_registered_device *rdev, int iftype, int num) { ASSERT_RTNL(); rdev->num_running_ifaces += num; } static int cfg802154_netdev_notifier_call(struct notifier_block *nb, unsigned long state, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct wpan_dev *wpan_dev = dev->ieee802154_ptr; struct cfg802154_registered_device *rdev; if (!wpan_dev) return NOTIFY_DONE; rdev = wpan_phy_to_rdev(wpan_dev->wpan_phy); /* TODO WARN_ON unspec type */ switch (state) { /* TODO NETDEV_DEVTYPE */ case NETDEV_REGISTER: dev->netns_immutable = true; wpan_dev->identifier = ++rdev->wpan_dev_id; list_add_rcu(&wpan_dev->list, &rdev->wpan_dev_list); rdev->devlist_generation++; mutex_init(&wpan_dev->association_lock); INIT_LIST_HEAD(&wpan_dev->children); wpan_dev->max_associations = SZ_16K; wpan_dev->netdev = dev; break; case NETDEV_DOWN: cfg802154_update_iface_num(rdev, wpan_dev->iftype, -1); rdev->opencount--; wake_up(&rdev->dev_wait); break; case NETDEV_UP: cfg802154_update_iface_num(rdev, wpan_dev->iftype, 1); rdev->opencount++; break; case NETDEV_UNREGISTER: cfg802154_free_peer_structures(wpan_dev); /* It is possible to get NETDEV_UNREGISTER * multiple times. To detect that, check * that the interface is still on the list * of registered interfaces, and only then * remove and clean it up. */ if (!list_empty(&wpan_dev->list)) { list_del_rcu(&wpan_dev->list); rdev->devlist_generation++; } /* synchronize (so that we won't find this netdev * from other code any more) and then clear the list * head so that the above code can safely check for * !list_empty() to avoid double-cleanup. */ synchronize_rcu(); INIT_LIST_HEAD(&wpan_dev->list); break; default: return NOTIFY_DONE; } return NOTIFY_OK; } static struct notifier_block cfg802154_netdev_notifier = { .notifier_call = cfg802154_netdev_notifier_call, }; static void __net_exit cfg802154_pernet_exit(struct net *net) { struct cfg802154_registered_device *rdev; rtnl_lock(); list_for_each_entry(rdev, &cfg802154_rdev_list, list) { if (net_eq(wpan_phy_net(&rdev->wpan_phy), net)) WARN_ON(cfg802154_switch_netns(rdev, &init_net)); } rtnl_unlock(); } static struct pernet_operations cfg802154_pernet_ops = { .exit = cfg802154_pernet_exit, }; static int __init wpan_phy_class_init(void) { int rc; rc = register_pernet_device(&cfg802154_pernet_ops); if (rc) goto err; rc = wpan_phy_sysfs_init(); if (rc) goto err_sysfs; rc = register_netdevice_notifier(&cfg802154_netdev_notifier); if (rc) goto err_nl; rc = ieee802154_nl_init(); if (rc) goto err_notifier; rc = nl802154_init(); if (rc) goto err_ieee802154_nl; return 0; err_ieee802154_nl: ieee802154_nl_exit(); err_notifier: unregister_netdevice_notifier(&cfg802154_netdev_notifier); err_nl: wpan_phy_sysfs_exit(); err_sysfs: unregister_pernet_device(&cfg802154_pernet_ops); err: return rc; } subsys_initcall(wpan_phy_class_init); static void __exit wpan_phy_class_exit(void) { nl802154_exit(); ieee802154_nl_exit(); unregister_netdevice_notifier(&cfg802154_netdev_notifier); wpan_phy_sysfs_exit(); unregister_pernet_device(&cfg802154_pernet_ops); } module_exit(wpan_phy_class_exit); MODULE_LICENSE("GPL v2"); MODULE_DESCRIPTION("IEEE 802.15.4 configuration interface"); MODULE_AUTHOR("Dmitry Eremin-Solenikov");
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef BTRFS_FS_H #define BTRFS_FS_H #include <linux/blkdev.h> #include <linux/sizes.h> #include <linux/time64.h> #include <linux/compiler.h> #include <linux/math.h> #include <linux/atomic.h> #include <linux/percpu_counter.h> #include <linux/completion.h> #include <linux/lockdep.h> #include <linux/spinlock.h> #include <linux/mutex.h> #include <linux/rwsem.h> #include <linux/semaphore.h> #include <linux/list.h> #include <linux/pagemap.h> #include <linux/radix-tree.h> #include <linux/workqueue.h> #include <linux/wait.h> #include <linux/wait_bit.h> #include <linux/sched.h> #include <linux/rbtree.h> #include <uapi/linux/btrfs.h> #include <uapi/linux/btrfs_tree.h> #include "extent-io-tree.h" #include "async-thread.h" #include "block-rsv.h" #include "messages.h" struct inode; struct super_block; struct kobject; struct reloc_control; struct crypto_shash; struct ulist; struct btrfs_device; struct btrfs_block_group; struct btrfs_root; struct btrfs_fs_devices; struct btrfs_transaction; struct btrfs_delayed_root; struct btrfs_balance_control; struct btrfs_subpage_info; struct btrfs_stripe_hash_table; struct btrfs_space_info; /* * Minimum data and metadata block size. * * Normally it's 4K, but for testing subpage block size on 4K page systems, we * allow DEBUG builds to accept 2K page size. */ #ifdef CONFIG_BTRFS_DEBUG #define BTRFS_MIN_BLOCKSIZE (SZ_2K) #else #define BTRFS_MIN_BLOCKSIZE (SZ_4K) #endif #define BTRFS_MAX_BLOCKSIZE (SZ_64K) #define BTRFS_MAX_EXTENT_SIZE SZ_128M #define BTRFS_OLDEST_GENERATION 0ULL #define BTRFS_EMPTY_DIR_SIZE 0 #define BTRFS_DIRTY_METADATA_THRESH SZ_32M #define BTRFS_SUPER_INFO_OFFSET SZ_64K #define BTRFS_SUPER_INFO_SIZE 4096 static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE); /* Array of bytes with variable length, hexadecimal format 0x1234 */ #define BTRFS_CSUM_FMT "0x%*phN" #define BTRFS_CSUM_FMT_VALUE(size, bytes) size, bytes #define BTRFS_KEY_FMT "(%llu %u %llu)" #define BTRFS_KEY_FMT_VALUE(key) (key)->objectid, (key)->type, (key)->offset /* * Number of metadata items necessary for an unlink operation: * * 1 for the possible orphan item * 1 for the dir item * 1 for the dir index * 1 for the inode ref * 1 for the inode * 1 for the parent inode */ #define BTRFS_UNLINK_METADATA_UNITS 6 /* * The reserved space at the beginning of each device. It covers the primary * super block and leaves space for potential use by other tools like * bootloaders or to lower potential damage of accidental overwrite. */ #define BTRFS_DEVICE_RANGE_RESERVED (SZ_1M) /* * Runtime (in-memory) states of filesystem */ enum { /* * Filesystem is being remounted, allow to skip some operations, like * defrag */ BTRFS_FS_STATE_REMOUNTING, /* Filesystem in RO mode */ BTRFS_FS_STATE_RO, /* Track if a transaction abort has been reported on this filesystem */ BTRFS_FS_STATE_TRANS_ABORTED, /* Track if log replay has failed. */ BTRFS_FS_STATE_LOG_REPLAY_ABORTED, /* * Bio operations should be blocked on this filesystem because a source * or target device is being destroyed as part of a device replace */ BTRFS_FS_STATE_DEV_REPLACING, /* The btrfs_fs_info created for self-tests */ BTRFS_FS_STATE_DUMMY_FS_INFO, /* Checksum errors are ignored. */ BTRFS_FS_STATE_NO_DATA_CSUMS, BTRFS_FS_STATE_SKIP_META_CSUMS, /* Indicates there was an error cleaning up a log tree. */ BTRFS_FS_STATE_LOG_CLEANUP_ERROR, /* No more delayed iput can be queued. */ BTRFS_FS_STATE_NO_DELAYED_IPUT, /* * Emergency shutdown, a step further than transaction aborted by * rejecting all operations. */ BTRFS_FS_STATE_EMERGENCY_SHUTDOWN, BTRFS_FS_STATE_COUNT }; enum { BTRFS_FS_CLOSING_START, BTRFS_FS_CLOSING_DONE, BTRFS_FS_LOG_RECOVERING, BTRFS_FS_OPEN, BTRFS_FS_QUOTA_ENABLED, BTRFS_FS_UPDATE_UUID_TREE_GEN, BTRFS_FS_CREATING_FREE_SPACE_TREE, BTRFS_FS_BTREE_ERR, BTRFS_FS_LOG1_ERR, BTRFS_FS_LOG2_ERR, BTRFS_FS_QUOTA_OVERRIDE, /* Used to record internally whether fs has been frozen */ BTRFS_FS_FROZEN, /* * Indicate that balance has been set up from the ioctl and is in the * main phase. The fs_info::balance_ctl is initialized. */ BTRFS_FS_BALANCE_RUNNING, /* * Indicate that relocation of a chunk has started, it's set per chunk * and is toggled between chunks. */ BTRFS_FS_RELOC_RUNNING, /* Indicate that the cleaner thread is awake and doing something. */ BTRFS_FS_CLEANER_RUNNING, /* * The checksumming has an optimized version and is considered fast, * so we don't need to offload checksums to workqueues. */ BTRFS_FS_CSUM_IMPL_FAST, /* Indicate that the discard workqueue can service discards. */ BTRFS_FS_DISCARD_RUNNING, /* Indicate that we need to cleanup space cache v1 */ BTRFS_FS_CLEANUP_SPACE_CACHE_V1, /* Indicate that we can't trust the free space tree for caching yet */ BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, /* Indicate whether there are any tree modification log users */ BTRFS_FS_TREE_MOD_LOG_USERS, /* Indicate that we want the transaction kthread to commit right now. */ BTRFS_FS_COMMIT_TRANS, /* Indicate we have half completed snapshot deletions pending. */ BTRFS_FS_UNFINISHED_DROPS, /* Indicate we have to finish a zone to do next allocation. */ BTRFS_FS_NEED_ZONE_FINISH, /* Indicate that we want to commit the transaction. */ BTRFS_FS_NEED_TRANS_COMMIT, /* This is set when active zone tracking is needed. */ BTRFS_FS_ACTIVE_ZONE_TRACKING, /* * Indicate if we have some features changed, this is mostly for * cleaner thread to update the sysfs interface. */ BTRFS_FS_FEATURE_CHANGED, /* * Indicate that we have found a tree block which is only aligned to * sectorsize, but not to nodesize. This should be rare nowadays. */ BTRFS_FS_UNALIGNED_TREE_BLOCK, #if BITS_PER_LONG == 32 /* Indicate if we have error/warn message printed on 32bit systems */ BTRFS_FS_32BIT_ERROR, BTRFS_FS_32BIT_WARN, #endif }; /* * Flags for mount options. * * Note: don't forget to add new options to btrfs_show_options() */ enum { BTRFS_MOUNT_NODATASUM = (1ULL << 0), BTRFS_MOUNT_NODATACOW = (1ULL << 1), BTRFS_MOUNT_NOBARRIER = (1ULL << 2), BTRFS_MOUNT_SSD = (1ULL << 3), BTRFS_MOUNT_DEGRADED = (1ULL << 4), BTRFS_MOUNT_COMPRESS = (1ULL << 5), BTRFS_MOUNT_NOTREELOG = (1ULL << 6), BTRFS_MOUNT_FLUSHONCOMMIT = (1ULL << 7), BTRFS_MOUNT_SSD_SPREAD = (1ULL << 8), BTRFS_MOUNT_NOSSD = (1ULL << 9), BTRFS_MOUNT_DISCARD_SYNC = (1ULL << 10), BTRFS_MOUNT_FORCE_COMPRESS = (1ULL << 11), BTRFS_MOUNT_SPACE_CACHE = (1ULL << 12), BTRFS_MOUNT_CLEAR_CACHE = (1ULL << 13), BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED = (1ULL << 14), BTRFS_MOUNT_ENOSPC_DEBUG = (1ULL << 15), BTRFS_MOUNT_AUTO_DEFRAG = (1ULL << 16), BTRFS_MOUNT_USEBACKUPROOT = (1ULL << 17), BTRFS_MOUNT_SKIP_BALANCE = (1ULL << 18), BTRFS_MOUNT_PANIC_ON_FATAL_ERROR = (1ULL << 19), BTRFS_MOUNT_RESCAN_UUID_TREE = (1ULL << 20), BTRFS_MOUNT_FRAGMENT_DATA = (1ULL << 21), BTRFS_MOUNT_FRAGMENT_METADATA = (1ULL << 22), BTRFS_MOUNT_FREE_SPACE_TREE = (1ULL << 23), BTRFS_MOUNT_NOLOGREPLAY = (1ULL << 24), BTRFS_MOUNT_REF_VERIFY = (1ULL << 25), BTRFS_MOUNT_DISCARD_ASYNC = (1ULL << 26), BTRFS_MOUNT_IGNOREBADROOTS = (1ULL << 27), BTRFS_MOUNT_IGNOREDATACSUMS = (1ULL << 28), BTRFS_MOUNT_NODISCARD = (1ULL << 29), BTRFS_MOUNT_NOSPACECACHE = (1ULL << 30), BTRFS_MOUNT_IGNOREMETACSUMS = (1ULL << 31), BTRFS_MOUNT_IGNORESUPERFLAGS = (1ULL << 32), BTRFS_MOUNT_REF_TRACKER = (1ULL << 33), }; /* * Compat flags that we support. If any incompat flags are set other than the * ones specified below then we will fail to mount */ #define BTRFS_FEATURE_COMPAT_SUPP 0ULL #define BTRFS_FEATURE_COMPAT_SAFE_SET 0ULL #define BTRFS_FEATURE_COMPAT_SAFE_CLEAR 0ULL #define BTRFS_FEATURE_COMPAT_RO_SUPP \ (BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE | \ BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE_VALID | \ BTRFS_FEATURE_COMPAT_RO_VERITY | \ BTRFS_FEATURE_COMPAT_RO_BLOCK_GROUP_TREE) #define BTRFS_FEATURE_COMPAT_RO_SAFE_SET 0ULL #define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR 0ULL #define BTRFS_FEATURE_INCOMPAT_SUPP_STABLE \ (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \ BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \ BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \ BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD | \ BTRFS_FEATURE_INCOMPAT_RAID56 | \ BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \ BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \ BTRFS_FEATURE_INCOMPAT_NO_HOLES | \ BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \ BTRFS_FEATURE_INCOMPAT_RAID1C34 | \ BTRFS_FEATURE_INCOMPAT_ZONED | \ BTRFS_FEATURE_INCOMPAT_SIMPLE_QUOTA) #ifdef CONFIG_BTRFS_EXPERIMENTAL /* * Features under development like Extent tree v2 support is enabled * only under CONFIG_BTRFS_EXPERIMENTAL */ #define BTRFS_FEATURE_INCOMPAT_SUPP \ (BTRFS_FEATURE_INCOMPAT_SUPP_STABLE | \ BTRFS_FEATURE_INCOMPAT_RAID_STRIPE_TREE | \ BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2) #else #define BTRFS_FEATURE_INCOMPAT_SUPP \ (BTRFS_FEATURE_INCOMPAT_SUPP_STABLE) #endif #define BTRFS_FEATURE_INCOMPAT_SAFE_SET \ (BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF) #define BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR 0ULL #define BTRFS_DEFAULT_COMMIT_INTERVAL (30) #define BTRFS_WARNING_COMMIT_INTERVAL (300) #define BTRFS_DEFAULT_MAX_INLINE (2048) enum btrfs_compression_type { BTRFS_COMPRESS_NONE = 0, BTRFS_COMPRESS_ZLIB = 1, BTRFS_COMPRESS_LZO = 2, BTRFS_COMPRESS_ZSTD = 3, BTRFS_NR_COMPRESS_TYPES = 4, BTRFS_DEFRAG_DONT_COMPRESS, }; struct btrfs_dev_replace { /* See #define above */ u64 replace_state; /* Seconds since 1-Jan-1970 */ time64_t time_started; /* Seconds since 1-Jan-1970 */ time64_t time_stopped; atomic64_t num_write_errors; atomic64_t num_uncorrectable_read_errors; u64 cursor_left; u64 committed_cursor_left; u64 cursor_left_last_write_of_item; u64 cursor_right; /* See #define above */ u64 cont_reading_from_srcdev_mode; int is_valid; int item_needs_writeback; struct btrfs_device *srcdev; struct btrfs_device *tgtdev; struct mutex lock_finishing_cancel_unmount; struct rw_semaphore rwsem; struct btrfs_scrub_progress scrub_progress; struct percpu_counter bio_counter; wait_queue_head_t replace_wait; struct task_struct *replace_task; }; /* * Free clusters are used to claim free space in relatively large chunks, * allowing us to do less seeky writes. They are used for all metadata * allocations. In ssd_spread mode they are also used for data allocations. */ struct btrfs_free_cluster { spinlock_t lock; spinlock_t refill_lock; struct rb_root root; /* Largest extent in this cluster */ u64 max_size; /* First extent starting offset */ u64 window_start; /* We did a full search and couldn't create a cluster */ bool fragmented; struct btrfs_block_group *block_group; /* * When a cluster is allocated from a block group, we put the cluster * onto a list in the block group so that it can be freed before the * block group is freed. */ struct list_head block_group_list; }; /* Discard control. */ /* * Async discard uses multiple lists to differentiate the discard filter * parameters. Index 0 is for completely free block groups where we need to * ensure the entire block group is trimmed without being lossy. Indices * afterwards represent monotonically decreasing discard filter sizes to * prioritize what should be discarded next. */ #define BTRFS_NR_DISCARD_LISTS 3 #define BTRFS_DISCARD_INDEX_UNUSED 0 #define BTRFS_DISCARD_INDEX_START 1 struct btrfs_discard_ctl { struct workqueue_struct *discard_workers; struct delayed_work work; spinlock_t lock; struct btrfs_block_group *block_group; struct list_head discard_list[BTRFS_NR_DISCARD_LISTS]; u64 prev_discard; u64 prev_discard_time; atomic_t discardable_extents; atomic64_t discardable_bytes; u64 max_discard_size; u64 delay_ms; u32 iops_limit; u32 kbps_limit; u64 discard_extent_bytes; u64 discard_bitmap_bytes; atomic64_t discard_bytes_saved; }; /* * Exclusive operations (device replace, resize, device add/remove, balance) */ enum btrfs_exclusive_operation { BTRFS_EXCLOP_NONE, BTRFS_EXCLOP_BALANCE_PAUSED, BTRFS_EXCLOP_BALANCE, BTRFS_EXCLOP_DEV_ADD, BTRFS_EXCLOP_DEV_REMOVE, BTRFS_EXCLOP_DEV_REPLACE, BTRFS_EXCLOP_RESIZE, BTRFS_EXCLOP_SWAP_ACTIVATE, }; /* Store data about transaction commits, exported via sysfs. */ struct btrfs_commit_stats { /* Total number of commits */ u64 commit_count; /* The maximum commit duration so far in ns */ u64 max_commit_dur; /* The last commit duration in ns */ u64 last_commit_dur; /* The total commit duration in ns */ u64 total_commit_dur; /* Start of the last critical section in ns. */ u64 critical_section_start_time; }; struct btrfs_fs_info { u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; unsigned long flags; struct btrfs_root *tree_root; struct btrfs_root *chunk_root; struct btrfs_root *dev_root; struct btrfs_root *fs_root; struct btrfs_root *quota_root; struct btrfs_root *uuid_root; struct btrfs_root *data_reloc_root; struct btrfs_root *block_group_root; struct btrfs_root *stripe_root; /* The log root tree is a directory of all the other log roots */ struct btrfs_root *log_root_tree; /* The tree that holds the global roots (csum, extent, etc) */ rwlock_t global_root_lock; struct rb_root global_root_tree; spinlock_t fs_roots_radix_lock; struct radix_tree_root fs_roots_radix; /* Block group cache stuff */ rwlock_t block_group_cache_lock; struct rb_root_cached block_group_cache_tree; /* Keep track of unallocated space */ atomic64_t free_chunk_space; /* Track ranges which are used by log trees blocks/logged data extents */ struct extent_io_tree excluded_extents; /* logical->physical extent mapping */ struct rb_root_cached mapping_tree; rwlock_t mapping_tree_lock; /* * Block reservation for extent, checksum, root tree and delayed dir * index item. */ struct btrfs_block_rsv global_block_rsv; /* Block reservation for metadata operations */ struct btrfs_block_rsv trans_block_rsv; /* Block reservation for chunk tree */ struct btrfs_block_rsv chunk_block_rsv; /* Block reservation for delayed operations */ struct btrfs_block_rsv delayed_block_rsv; /* Block reservation for delayed refs */ struct btrfs_block_rsv delayed_refs_rsv; /* Block reservation for treelog tree */ struct btrfs_block_rsv treelog_rsv; struct btrfs_block_rsv empty_block_rsv; /* * Updated while holding the lock 'trans_lock'. Due to the life cycle of * a transaction, it can be directly read while holding a transaction * handle, everywhere else must be read with btrfs_get_fs_generation(). * Should always be updated using btrfs_set_fs_generation(). */ u64 generation; /* * Always use btrfs_get_last_trans_committed() and * btrfs_set_last_trans_committed() to read and update this field. */ u64 last_trans_committed; /* * Generation of the last transaction used for block group relocation * since the filesystem was last mounted (or 0 if none happened yet). * Must be written and read while holding btrfs_fs_info::commit_root_sem. */ u64 last_reloc_trans; /* * This is updated to the current trans every time a full commit is * required instead of the faster short fsync log commits */ u64 last_trans_log_full_commit; unsigned long long mount_opt; /* Compress related structures. */ void *compr_wsm[BTRFS_NR_COMPRESS_TYPES]; int compress_type; int compress_level; u32 commit_interval; /* * It is a suggestive number, the read side is safe even it gets a * wrong number because we will write out the data into a regular * extent. The write side(mount/remount) is under ->s_umount lock, * so it is also safe. */ u64 max_inline; struct btrfs_transaction *running_transaction; wait_queue_head_t transaction_throttle; wait_queue_head_t transaction_wait; wait_queue_head_t transaction_blocked_wait; wait_queue_head_t async_submit_wait; /* * Used to protect the incompat_flags, compat_flags, compat_ro_flags * when they are updated. * * Because we do not clear the flags for ever, so we needn't use * the lock on the read side. * * We also needn't use the lock when we mount the fs, because * there is no other task which will update the flag. */ spinlock_t super_lock; struct btrfs_super_block *super_copy; struct btrfs_super_block *super_for_commit; struct super_block *sb; struct inode *btree_inode; struct mutex tree_log_mutex; struct mutex transaction_kthread_mutex; struct mutex cleaner_mutex; struct mutex chunk_mutex; /* * This is taken to make sure we don't set block groups ro after the * free space cache has been allocated on them. */ struct mutex ro_block_group_mutex; /* * This is used during read/modify/write to make sure no two ios are * trying to mod the same stripe at the same time. */ struct btrfs_stripe_hash_table *stripe_hash_table; /* * This protects the ordered operations list only while we are * processing all of the entries on it. This way we make sure the * commit code doesn't find the list temporarily empty because another * function happens to be doing non-waiting preflush before jumping * into the main commit. */ struct mutex ordered_operations_mutex; struct rw_semaphore commit_root_sem; struct rw_semaphore cleanup_work_sem; struct rw_semaphore subvol_sem; spinlock_t trans_lock; /* * The reloc mutex goes with the trans lock, it is taken during commit * to protect us from the relocation code. */ struct mutex reloc_mutex; struct list_head trans_list; struct list_head dead_roots; struct list_head caching_block_groups; spinlock_t delayed_iput_lock; struct list_head delayed_iputs; atomic_t nr_delayed_iputs; wait_queue_head_t delayed_iputs_wait; atomic64_t tree_mod_seq; /* This protects tree_mod_log and tree_mod_seq_list */ rwlock_t tree_mod_log_lock; struct rb_root tree_mod_log; struct list_head tree_mod_seq_list; atomic_t async_delalloc_pages; /* This is used to protect the following list -- ordered_roots. */ spinlock_t ordered_root_lock; /* * All fs/file tree roots in which there are data=ordered extents * pending writeback are added into this list. * * These can span multiple transactions and basically include every * dirty data page that isn't from nodatacow. */ struct list_head ordered_roots; struct mutex delalloc_root_mutex; spinlock_t delalloc_root_lock; /* All fs/file tree roots that have delalloc inodes. */ struct list_head delalloc_roots; /* * There is a pool of worker threads for checksumming during writes and * a pool for checksumming after reads. This is because readers can * run with FS locks held, and the writers may be waiting for those * locks. We don't want ordering in the pending list to cause * deadlocks, and so the two are serviced separately. * * A third pool does submit_bio to avoid deadlocking with the other two. */ struct btrfs_workqueue *workers; struct btrfs_workqueue *delalloc_workers; struct btrfs_workqueue *flush_workers; struct workqueue_struct *endio_workers; struct workqueue_struct *endio_meta_workers; struct workqueue_struct *rmw_workers; struct btrfs_workqueue *endio_write_workers; struct btrfs_workqueue *endio_freespace_worker; struct btrfs_workqueue *caching_workers; /* * Fixup workers take dirty pages that didn't properly go through the * cow mechanism and make them safe to write. It happens for the * sys_munmap function call path. */ struct btrfs_workqueue *fixup_workers; struct btrfs_workqueue *delayed_workers; struct task_struct *transaction_kthread; struct task_struct *cleaner_kthread; u32 thread_pool_size; struct kobject *space_info_kobj; struct kobject *qgroups_kobj; struct kobject *discard_kobj; /* Track the number of blocks (sectors) read by the filesystem. */ struct percpu_counter stats_read_blocks; /* Used to keep from writing metadata until there is a nice batch */ struct percpu_counter dirty_metadata_bytes; struct percpu_counter delalloc_bytes; struct percpu_counter ordered_bytes; s32 dirty_metadata_batch; s32 delalloc_batch; struct percpu_counter evictable_extent_maps; u64 em_shrinker_last_root; u64 em_shrinker_last_ino; atomic64_t em_shrinker_nr_to_scan; struct work_struct em_shrinker_work; /* Protected by 'trans_lock'. */ struct list_head dirty_cowonly_roots; struct btrfs_fs_devices *fs_devices; /* * The space_info list is effectively read only after initial setup. * It is populated at mount time and cleaned up after all block groups * are removed. RCU is used to protect it. */ struct list_head space_info; struct btrfs_space_info *data_sinfo; struct reloc_control *reloc_ctl; /* data_alloc_cluster is only used in ssd_spread mode */ struct btrfs_free_cluster data_alloc_cluster; /* All metadata allocations go through this cluster. */ struct btrfs_free_cluster meta_alloc_cluster; /* Auto defrag inodes go here. */ spinlock_t defrag_inodes_lock; struct rb_root defrag_inodes; atomic_t defrag_running; /* Used to protect avail_{data, metadata, system}_alloc_bits */ seqlock_t profiles_lock; /* * These three are in extended format (availability of single chunks is * denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other types are denoted * by corresponding BTRFS_BLOCK_GROUP_* bits) */ u64 avail_data_alloc_bits; u64 avail_metadata_alloc_bits; u64 avail_system_alloc_bits; /* Balance state */ spinlock_t balance_lock; struct mutex balance_mutex; atomic_t balance_pause_req; atomic_t balance_cancel_req; struct btrfs_balance_control *balance_ctl; wait_queue_head_t balance_wait_q; /* Cancellation requests for chunk relocation */ atomic_t reloc_cancel_req; u32 data_chunk_allocations; u32 metadata_ratio; /* Private scrub information */ struct mutex scrub_lock; atomic_t scrubs_running; atomic_t scrub_pause_req; atomic_t scrubs_paused; atomic_t scrub_cancel_req; wait_queue_head_t scrub_pause_wait; /* * The worker pointers are NULL iff the refcount is 0, ie. scrub is not * running. */ refcount_t scrub_workers_refcnt; struct workqueue_struct *scrub_workers; struct btrfs_discard_ctl discard_ctl; /* Is qgroup tracking in a consistent state? */ u64 qgroup_flags; /* Holds configuration and tracking. Protected by qgroup_lock. */ struct rb_root qgroup_tree; spinlock_t qgroup_lock; /* * Protect user change for quota operations. If a transaction is needed, * it must be started before locking this lock. */ struct mutex qgroup_ioctl_lock; /* List of dirty qgroups to be written at next commit. */ struct list_head dirty_qgroups; /* Used by qgroup for an efficient tree traversal. */ u64 qgroup_seq; /* Qgroup rescan items. */ /* Protects the progress item */ struct mutex qgroup_rescan_lock; struct btrfs_key qgroup_rescan_progress; struct btrfs_workqueue *qgroup_rescan_workers; struct completion qgroup_rescan_completion; struct btrfs_work qgroup_rescan_work; /* Protected by qgroup_rescan_lock */ bool qgroup_rescan_running; u8 qgroup_drop_subtree_thres; u64 qgroup_enable_gen; /* * If this is not 0, then it indicates a serious filesystem error has * happened and it contains that error (negative errno value). */ int fs_error; /* Filesystem state */ unsigned long fs_state; struct btrfs_delayed_root *delayed_root; /* Entries are eb->start >> nodesize_bits */ struct xarray buffer_tree; /* Next backup root to be overwritten */ int backup_root_index; /* Device replace state */ struct btrfs_dev_replace dev_replace; struct semaphore uuid_tree_rescan_sem; /* Used to reclaim the metadata space in the background. */ struct work_struct async_reclaim_work; struct work_struct async_data_reclaim_work; struct work_struct preempt_reclaim_work; /* Reclaim partially filled block groups in the background */ struct work_struct reclaim_bgs_work; /* Protected by unused_bgs_lock. */ struct list_head reclaim_bgs; int bg_reclaim_threshold; /* Protects the lists unused_bgs and reclaim_bgs. */ spinlock_t unused_bgs_lock; /* Protected by unused_bgs_lock. */ struct list_head unused_bgs; struct mutex unused_bg_unpin_mutex; /* Protect block groups that are going to be deleted */ struct mutex reclaim_bgs_lock; /* Cached block sizes */ u32 nodesize; u32 nodesize_bits; u32 sectorsize; /* ilog2 of sectorsize, use to avoid 64bit division */ u32 sectorsize_bits; u32 block_min_order; u32 block_max_order; u32 csum_size; u32 csums_per_leaf; u32 stripesize; /* * Maximum size of an extent. BTRFS_MAX_EXTENT_SIZE on regular * filesystem, on zoned it depends on the device constraints. */ u64 max_extent_size; /* Block groups and devices containing active swapfiles. */ spinlock_t swapfile_pins_lock; struct rb_root swapfile_pins; struct crypto_shash *csum_shash; /* Type of exclusive operation running, protected by super_lock */ enum btrfs_exclusive_operation exclusive_operation; /* * Zone size > 0 when in ZONED mode, otherwise it's used for a check * if the mode is enabled */ u64 zone_size; /* Constraints for ZONE_APPEND commands: */ struct queue_limits limits; u64 max_zone_append_size; struct mutex zoned_meta_io_lock; spinlock_t treelog_bg_lock; u64 treelog_bg; /* * Start of the dedicated data relocation block group, protected by * relocation_bg_lock. */ spinlock_t relocation_bg_lock; u64 data_reloc_bg; struct mutex zoned_data_reloc_io_lock; struct btrfs_block_group *active_meta_bg; struct btrfs_block_group *active_system_bg; u64 nr_global_roots; spinlock_t zone_active_bgs_lock; struct list_head zone_active_bgs; /* Updates are not protected by any lock */ struct btrfs_commit_stats commit_stats; /* * Last generation where we dropped a non-relocation root. * Use btrfs_set_last_root_drop_gen() and btrfs_get_last_root_drop_gen() * to change it and to read it, respectively. */ u64 last_root_drop_gen; /* * Annotations for transaction events (structures are empty when * compiled without lockdep). */ struct lockdep_map btrfs_trans_num_writers_map; struct lockdep_map btrfs_trans_num_extwriters_map; struct lockdep_map btrfs_state_change_map[4]; struct lockdep_map btrfs_trans_pending_ordered_map; struct lockdep_map btrfs_ordered_extent_map; #ifdef CONFIG_BTRFS_DEBUG spinlock_t ref_verify_lock; struct rb_root block_tree; struct kobject *debug_kobj; struct list_head allocated_roots; spinlock_t eb_leak_lock; struct list_head allocated_ebs; #endif }; #define folio_to_inode(_folio) (BTRFS_I(_Generic((_folio), \ struct folio *: (_folio))->mapping->host)) #define folio_to_fs_info(_folio) (folio_to_inode(_folio)->root->fs_info) #define inode_to_fs_info(_inode) (BTRFS_I(_Generic((_inode), \ struct inode *: (_inode)))->root->fs_info) static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping) { return mapping_gfp_constraint(mapping, ~__GFP_FS); } /* Return the minimal folio size of the fs. */ static inline unsigned int btrfs_min_folio_size(struct btrfs_fs_info *fs_info) { return 1U << (PAGE_SHIFT + fs_info->block_min_order); } static inline u64 btrfs_get_fs_generation(const struct btrfs_fs_info *fs_info) { return READ_ONCE(fs_info->generation); } static inline void btrfs_set_fs_generation(struct btrfs_fs_info *fs_info, u64 gen) { WRITE_ONCE(fs_info->generation, gen); } static inline u64 btrfs_get_last_trans_committed(const struct btrfs_fs_info *fs_info) { return READ_ONCE(fs_info->last_trans_committed); } static inline void btrfs_set_last_trans_committed(struct btrfs_fs_info *fs_info, u64 gen) { WRITE_ONCE(fs_info->last_trans_committed, gen); } static inline void btrfs_set_last_root_drop_gen(struct btrfs_fs_info *fs_info, u64 gen) { WRITE_ONCE(fs_info->last_root_drop_gen, gen); } static inline u64 btrfs_get_last_root_drop_gen(const struct btrfs_fs_info *fs_info) { return READ_ONCE(fs_info->last_root_drop_gen); } /* * Take the number of bytes to be checksummed and figure out how many leaves * it would require to store the csums for that many bytes. */ static inline u64 btrfs_csum_bytes_to_leaves( const struct btrfs_fs_info *fs_info, u64 csum_bytes) { const u64 num_csums = csum_bytes >> fs_info->sectorsize_bits; return DIV_ROUND_UP_ULL(num_csums, fs_info->csums_per_leaf); } /* * Use this if we would be adding new items, as we could split nodes as we cow * down the tree. */ static inline u64 btrfs_calc_insert_metadata_size(const struct btrfs_fs_info *fs_info, unsigned num_items) { return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * 2 * num_items; } /* * Doing a truncate or a modification won't result in new nodes or leaves, just * what we need for COW. */ static inline u64 btrfs_calc_metadata_size(const struct btrfs_fs_info *fs_info, unsigned num_items) { return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * num_items; } #define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r->fs_info) >> 4) - \ sizeof(struct btrfs_item)) #define BTRFS_BYTES_TO_BLKS(fs_info, bytes) ((bytes) >> (fs_info)->sectorsize_bits) static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info) { return IS_ENABLED(CONFIG_BLK_DEV_ZONED) && fs_info->zone_size > 0; } /* * Count how many fs_info->max_extent_size cover the @size */ static inline u32 count_max_extents(const struct btrfs_fs_info *fs_info, u64 size) { #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS if (!fs_info) return div_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE); #endif return div_u64(size + fs_info->max_extent_size - 1, fs_info->max_extent_size); } static inline unsigned int btrfs_blocks_per_folio(const struct btrfs_fs_info *fs_info, const struct folio *folio) { return folio_size(folio) >> fs_info->sectorsize_bits; } bool __attribute_const__ btrfs_supported_blocksize(u32 blocksize); bool btrfs_exclop_start(struct btrfs_fs_info *fs_info, enum btrfs_exclusive_operation type); bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info, enum btrfs_exclusive_operation type); void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info); void btrfs_exclop_finish(struct btrfs_fs_info *fs_info); void btrfs_exclop_balance(struct btrfs_fs_info *fs_info, enum btrfs_exclusive_operation op); int btrfs_check_ioctl_vol_args_path(const struct btrfs_ioctl_vol_args *vol_args); u16 btrfs_csum_type_size(u16 type); int btrfs_super_csum_size(const struct btrfs_super_block *s); const char *btrfs_super_csum_name(u16 csum_type); const char *btrfs_super_csum_driver(u16 csum_type); size_t __attribute_const__ btrfs_get_num_csums(void); static inline bool btrfs_is_empty_uuid(const u8 *uuid) { return uuid_is_null((const uuid_t *)uuid); } /* Compatibility and incompatibility defines */ void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag, const char *name); void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag, const char *name); void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag, const char *name); void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag, const char *name); #define __btrfs_fs_incompat(fs_info, flags) \ (!!(btrfs_super_incompat_flags((fs_info)->super_copy) & (flags))) #define __btrfs_fs_compat_ro(fs_info, flags) \ (!!(btrfs_super_compat_ro_flags((fs_info)->super_copy) & (flags))) #define btrfs_set_fs_incompat(__fs_info, opt) \ __btrfs_set_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt, #opt) #define btrfs_clear_fs_incompat(__fs_info, opt) \ __btrfs_clear_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt, #opt) #define btrfs_fs_incompat(fs_info, opt) \ __btrfs_fs_incompat((fs_info), BTRFS_FEATURE_INCOMPAT_##opt) #define btrfs_set_fs_compat_ro(__fs_info, opt) \ __btrfs_set_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt, #opt) #define btrfs_clear_fs_compat_ro(__fs_info, opt) \ __btrfs_clear_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt, #opt) #define btrfs_fs_compat_ro(fs_info, opt) \ __btrfs_fs_compat_ro((fs_info), BTRFS_FEATURE_COMPAT_RO_##opt) #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) #define btrfs_raw_test_opt(o, opt) ((o) & BTRFS_MOUNT_##opt) #define btrfs_test_opt(fs_info, opt) ((fs_info)->mount_opt & \ BTRFS_MOUNT_##opt) static inline int btrfs_fs_closing(const struct btrfs_fs_info *fs_info) { /* Do it this way so we only ever do one test_bit in the normal case. */ if (test_bit(BTRFS_FS_CLOSING_START, &fs_info->flags)) { if (test_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags)) return 2; return 1; } return 0; } /* * If we remount the fs to be R/O or umount the fs, the cleaner needn't do * anything except sleeping. This function is used to check the status of * the fs. * We check for BTRFS_FS_STATE_RO to avoid races with a concurrent remount, * since setting and checking for SB_RDONLY in the superblock's flags is not * atomic. */ static inline int btrfs_need_cleaner_sleep(const struct btrfs_fs_info *fs_info) { return test_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state) || btrfs_fs_closing(fs_info); } static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info) { clear_and_wake_up_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags); } #define BTRFS_FS_ERROR(fs_info) (READ_ONCE((fs_info)->fs_error)) #define BTRFS_FS_LOG_CLEANUP_ERROR(fs_info) \ (unlikely(test_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR, \ &(fs_info)->fs_state))) static inline bool btrfs_is_shutdown(struct btrfs_fs_info *fs_info) { return test_bit(BTRFS_FS_STATE_EMERGENCY_SHUTDOWN, &fs_info->fs_state); } static inline void btrfs_force_shutdown(struct btrfs_fs_info *fs_info) { /* * Here we do not want to use handle_fs_error(), which will mark the fs * read-only. * Some call sites like shutdown ioctl will mark the fs shutdown when * the fs is frozen. But thaw path will handle RO and RW fs * differently. * * So here we only mark the fs error without flipping it RO. */ WRITE_ONCE(fs_info->fs_error, -EIO); if (!test_and_set_bit(BTRFS_FS_STATE_EMERGENCY_SHUTDOWN, &fs_info->fs_state)) btrfs_crit(fs_info, "emergency shutdown"); } /* * We use folio flag owner_2 to indicate there is an ordered extent with * unfinished IO. */ #define folio_test_ordered(folio) folio_test_owner_2(folio) #define folio_set_ordered(folio) folio_set_owner_2(folio) #define folio_clear_ordered(folio) folio_clear_owner_2(folio) #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS #define EXPORT_FOR_TESTS static inline bool btrfs_is_testing(const struct btrfs_fs_info *fs_info) { return unlikely(test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state)); } void btrfs_test_destroy_inode(struct inode *inode); #else #define EXPORT_FOR_TESTS static static inline bool btrfs_is_testing(const struct btrfs_fs_info *fs_info) { return false; } #endif #endif
14 14 17 14 14 3 17 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 // SPDX-License-Identifier: GPL-2.0-only /* Copyright (C) 2005 Marc Kleine-Budde, Pengutronix * Copyright (C) 2006 Andrey Volkov, Varma Electronics * Copyright (C) 2008-2009 Wolfgang Grandegger <wg@grandegger.com> */ #include <linux/can/dev.h> #include <linux/module.h> #define MOD_DESC "CAN device driver interface" MODULE_DESCRIPTION(MOD_DESC); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Wolfgang Grandegger <wg@grandegger.com>"); /* Local echo of CAN messages * * CAN network devices *should* support a local echo functionality * (see Documentation/networking/can.rst). To test the handling of CAN * interfaces that do not support the local echo both driver types are * implemented. In the case that the driver does not support the echo * the IFF_ECHO remains clear in dev->flags. This causes the PF_CAN core * to perform the echo as a fallback solution. */ void can_flush_echo_skb(struct net_device *dev) { struct can_priv *priv = netdev_priv(dev); struct net_device_stats *stats = &dev->stats; int i; for (i = 0; i < priv->echo_skb_max; i++) { if (priv->echo_skb[i]) { kfree_skb(priv->echo_skb[i]); priv->echo_skb[i] = NULL; stats->tx_dropped++; stats->tx_aborted_errors++; } } } /* Put the skb on the stack to be looped backed locally lateron * * The function is typically called in the start_xmit function * of the device driver. The driver must protect access to * priv->echo_skb, if necessary. */ int can_put_echo_skb(struct sk_buff *skb, struct net_device *dev, unsigned int idx, unsigned int frame_len) { struct can_priv *priv = netdev_priv(dev); if (idx >= priv->echo_skb_max) { netdev_err(dev, "%s: BUG! Trying to access can_priv::echo_skb out of bounds (%u/max %u)\n", __func__, idx, priv->echo_skb_max); return -EINVAL; } /* check flag whether this packet has to be looped back */ if (!(dev->flags & IFF_ECHO) || (skb->protocol != htons(ETH_P_CAN) && skb->protocol != htons(ETH_P_CANFD) && skb->protocol != htons(ETH_P_CANXL))) { kfree_skb(skb); return 0; } if (!priv->echo_skb[idx]) { skb = can_create_echo_skb(skb); if (!skb) return -ENOMEM; /* make settings for echo to reduce code in irq context */ skb->ip_summed = CHECKSUM_UNNECESSARY; skb->dev = dev; /* save frame_len to reuse it when transmission is completed */ can_skb_prv(skb)->frame_len = frame_len; if (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP) skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS; skb_tx_timestamp(skb); /* save this skb for tx interrupt echo handling */ priv->echo_skb[idx] = skb; } else { /* locking problem with netif_stop_queue() ?? */ netdev_err(dev, "%s: BUG! echo_skb %d is occupied!\n", __func__, idx); kfree_skb(skb); return -EBUSY; } return 0; } EXPORT_SYMBOL_GPL(can_put_echo_skb); struct sk_buff * __can_get_echo_skb(struct net_device *dev, unsigned int idx, unsigned int *len_ptr, unsigned int *frame_len_ptr) { struct can_priv *priv = netdev_priv(dev); if (idx >= priv->echo_skb_max) { netdev_err(dev, "%s: BUG! Trying to access can_priv::echo_skb out of bounds (%u/max %u)\n", __func__, idx, priv->echo_skb_max); return NULL; } if (priv->echo_skb[idx]) { /* Using "struct canfd_frame::len" for the frame * length is supported on both CAN and CANFD frames. */ struct sk_buff *skb = priv->echo_skb[idx]; struct can_skb_priv *can_skb_priv = can_skb_prv(skb); if (skb_shinfo(skb)->tx_flags & SKBTX_IN_PROGRESS) skb_tstamp_tx(skb, skb_hwtstamps(skb)); /* get the real payload length for netdev statistics */ *len_ptr = can_skb_get_data_len(skb); if (frame_len_ptr) *frame_len_ptr = can_skb_priv->frame_len; priv->echo_skb[idx] = NULL; if (skb->pkt_type == PACKET_LOOPBACK) { skb->pkt_type = PACKET_BROADCAST; } else { dev_consume_skb_any(skb); return NULL; } return skb; } return NULL; } /* Get the skb from the stack and loop it back locally * * The function is typically called when the TX done interrupt * is handled in the device driver. The driver must protect * access to priv->echo_skb, if necessary. */ unsigned int can_get_echo_skb(struct net_device *dev, unsigned int idx, unsigned int *frame_len_ptr) { struct sk_buff *skb; unsigned int len; skb = __can_get_echo_skb(dev, idx, &len, frame_len_ptr); if (!skb) return 0; skb_get(skb); if (netif_rx(skb) == NET_RX_SUCCESS) dev_consume_skb_any(skb); else dev_kfree_skb_any(skb); return len; } EXPORT_SYMBOL_GPL(can_get_echo_skb); /* Remove the skb from the stack and free it. * * The function is typically called when TX failed. */ void can_free_echo_skb(struct net_device *dev, unsigned int idx, unsigned int *frame_len_ptr) { struct can_priv *priv = netdev_priv(dev); if (idx >= priv->echo_skb_max) { netdev_err(dev, "%s: BUG! Trying to access can_priv::echo_skb out of bounds (%u/max %u)\n", __func__, idx, priv->echo_skb_max); return; } if (priv->echo_skb[idx]) { struct sk_buff *skb = priv->echo_skb[idx]; struct can_skb_priv *can_skb_priv = can_skb_prv(skb); if (frame_len_ptr) *frame_len_ptr = can_skb_priv->frame_len; dev_kfree_skb_any(skb); priv->echo_skb[idx] = NULL; } } EXPORT_SYMBOL_GPL(can_free_echo_skb); /* fill common values for CAN sk_buffs */ static void init_can_skb_reserve(struct sk_buff *skb) { skb->pkt_type = PACKET_BROADCAST; skb->ip_summed = CHECKSUM_UNNECESSARY; skb_reset_mac_header(skb); skb_reset_network_header(skb); skb_reset_transport_header(skb); can_skb_reserve(skb); can_skb_prv(skb)->skbcnt = 0; } struct sk_buff *alloc_can_skb(struct net_device *dev, struct can_frame **cf) { struct sk_buff *skb; skb = netdev_alloc_skb(dev, sizeof(struct can_skb_priv) + sizeof(struct can_frame)); if (unlikely(!skb)) { *cf = NULL; return NULL; } skb->protocol = htons(ETH_P_CAN); init_can_skb_reserve(skb); can_skb_prv(skb)->ifindex = dev->ifindex; *cf = skb_put_zero(skb, sizeof(struct can_frame)); return skb; } EXPORT_SYMBOL_GPL(alloc_can_skb); struct sk_buff *alloc_canfd_skb(struct net_device *dev, struct canfd_frame **cfd) { struct sk_buff *skb; skb = netdev_alloc_skb(dev, sizeof(struct can_skb_priv) + sizeof(struct canfd_frame)); if (unlikely(!skb)) { *cfd = NULL; return NULL; } skb->protocol = htons(ETH_P_CANFD); init_can_skb_reserve(skb); can_skb_prv(skb)->ifindex = dev->ifindex; *cfd = skb_put_zero(skb, sizeof(struct canfd_frame)); /* set CAN FD flag by default */ (*cfd)->flags = CANFD_FDF; return skb; } EXPORT_SYMBOL_GPL(alloc_canfd_skb); struct sk_buff *alloc_canxl_skb(struct net_device *dev, struct canxl_frame **cxl, unsigned int data_len) { struct sk_buff *skb; if (data_len < CANXL_MIN_DLEN || data_len > CANXL_MAX_DLEN) goto out_error; skb = netdev_alloc_skb(dev, sizeof(struct can_skb_priv) + CANXL_HDR_SIZE + data_len); if (unlikely(!skb)) goto out_error; skb->protocol = htons(ETH_P_CANXL); init_can_skb_reserve(skb); can_skb_prv(skb)->ifindex = dev->ifindex; *cxl = skb_put_zero(skb, CANXL_HDR_SIZE + data_len); /* set CAN XL flag and length information by default */ (*cxl)->flags = CANXL_XLF; (*cxl)->len = data_len; return skb; out_error: *cxl = NULL; return NULL; } EXPORT_SYMBOL_GPL(alloc_canxl_skb); struct sk_buff *alloc_can_err_skb(struct net_device *dev, struct can_frame **cf) { struct sk_buff *skb; skb = alloc_can_skb(dev, cf); if (unlikely(!skb)) return NULL; (*cf)->can_id = CAN_ERR_FLAG; (*cf)->len = CAN_ERR_DLC; return skb; } EXPORT_SYMBOL_GPL(alloc_can_err_skb); /* Check for outgoing skbs that have not been created by the CAN subsystem */ static bool can_skb_headroom_valid(struct net_device *dev, struct sk_buff *skb) { /* af_packet creates a headroom of HH_DATA_MOD bytes which is fine */ if (WARN_ON_ONCE(skb_headroom(skb) < sizeof(struct can_skb_priv))) return false; /* af_packet does not apply CAN skb specific settings */ if (skb->ip_summed == CHECKSUM_NONE) { /* init headroom */ can_skb_prv(skb)->ifindex = dev->ifindex; can_skb_prv(skb)->skbcnt = 0; skb->ip_summed = CHECKSUM_UNNECESSARY; /* perform proper loopback on capable devices */ if (dev->flags & IFF_ECHO) skb->pkt_type = PACKET_LOOPBACK; else skb->pkt_type = PACKET_HOST; skb_reset_mac_header(skb); skb_reset_network_header(skb); skb_reset_transport_header(skb); /* set CANFD_FDF flag for CAN FD frames */ if (can_is_canfd_skb(skb)) { struct canfd_frame *cfd; cfd = (struct canfd_frame *)skb->data; cfd->flags |= CANFD_FDF; } } return true; } /* Drop a given socketbuffer if it does not contain a valid CAN frame. */ bool can_dropped_invalid_skb(struct net_device *dev, struct sk_buff *skb) { switch (ntohs(skb->protocol)) { case ETH_P_CAN: if (!can_is_can_skb(skb)) goto inval_skb; break; case ETH_P_CANFD: if (!can_is_canfd_skb(skb)) goto inval_skb; break; case ETH_P_CANXL: if (!can_is_canxl_skb(skb)) goto inval_skb; break; default: goto inval_skb; } if (!can_skb_headroom_valid(dev, skb)) goto inval_skb; return false; inval_skb: kfree_skb(skb); dev->stats.tx_dropped++; return true; } EXPORT_SYMBOL_GPL(can_dropped_invalid_skb);
9 9 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 // SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 /****************************************************************************** * * Module Name: exconvrt - Object conversion routines * * Copyright (C) 2000 - 2025, Intel Corp. * *****************************************************************************/ #include <acpi/acpi.h> #include "accommon.h" #include "acinterp.h" #include "amlcode.h" #define _COMPONENT ACPI_EXECUTER ACPI_MODULE_NAME("exconvrt") /* Local prototypes */ static u32 acpi_ex_convert_to_ascii(u64 integer, u16 base, u8 *string, u8 max_length, u8 leading_zeros); /******************************************************************************* * * FUNCTION: acpi_ex_convert_to_integer * * PARAMETERS: obj_desc - Object to be converted. Must be an * Integer, Buffer, or String * result_desc - Where the new Integer object is returned * implicit_conversion - Used for string conversion * * RETURN: Status * * DESCRIPTION: Convert an ACPI Object to an integer. * ******************************************************************************/ acpi_status acpi_ex_convert_to_integer(union acpi_operand_object *obj_desc, union acpi_operand_object **result_desc, u32 implicit_conversion) { union acpi_operand_object *return_desc; u8 *pointer; u64 result; u32 i; u32 count; ACPI_FUNCTION_TRACE_PTR(ex_convert_to_integer, obj_desc); switch (obj_desc->common.type) { case ACPI_TYPE_INTEGER: /* No conversion necessary */ *result_desc = obj_desc; return_ACPI_STATUS(AE_OK); case ACPI_TYPE_BUFFER: case ACPI_TYPE_STRING: /* Note: Takes advantage of common buffer/string fields */ pointer = obj_desc->buffer.pointer; count = obj_desc->buffer.length; break; default: return_ACPI_STATUS(AE_TYPE); } /* * Convert the buffer/string to an integer. Note that both buffers and * strings are treated as raw data - we don't convert ascii to hex for * strings. * * There are two terminating conditions for the loop: * 1) The size of an integer has been reached, or * 2) The end of the buffer or string has been reached */ result = 0; /* String conversion is different than Buffer conversion */ switch (obj_desc->common.type) { case ACPI_TYPE_STRING: /* * Convert string to an integer - for most cases, the string must be * hexadecimal as per the ACPI specification. The only exception (as * of ACPI 3.0) is that the to_integer() operator allows both decimal * and hexadecimal strings (hex prefixed with "0x"). * * Explicit conversion is used only by to_integer. * All other string-to-integer conversions are implicit conversions. */ if (implicit_conversion) { result = acpi_ut_implicit_strtoul64(ACPI_CAST_PTR (char, pointer)); } else { result = acpi_ut_explicit_strtoul64(ACPI_CAST_PTR (char, pointer)); } break; case ACPI_TYPE_BUFFER: /* Check for zero-length buffer */ if (!count) { return_ACPI_STATUS(AE_AML_BUFFER_LIMIT); } /* Transfer no more than an integer's worth of data */ if (count > acpi_gbl_integer_byte_width) { count = acpi_gbl_integer_byte_width; } /* * Convert buffer to an integer - we simply grab enough raw data * from the buffer to fill an integer */ for (i = 0; i < count; i++) { /* * Get next byte and shift it into the Result. * Little endian is used, meaning that the first byte of the buffer * is the LSB of the integer */ result |= (((u64) pointer[i]) << (i * 8)); } break; default: /* No other types can get here */ break; } /* Create a new integer */ return_desc = acpi_ut_create_integer_object(result); if (!return_desc) { return_ACPI_STATUS(AE_NO_MEMORY); } ACPI_DEBUG_PRINT((ACPI_DB_EXEC, "Converted value: %8.8X%8.8X\n", ACPI_FORMAT_UINT64(result))); /* Save the Result */ (void)acpi_ex_truncate_for32bit_table(return_desc); *result_desc = return_desc; return_ACPI_STATUS(AE_OK); } /******************************************************************************* * * FUNCTION: acpi_ex_convert_to_buffer * * PARAMETERS: obj_desc - Object to be converted. Must be an * Integer, Buffer, or String * result_desc - Where the new buffer object is returned * * RETURN: Status * * DESCRIPTION: Convert an ACPI Object to a Buffer * ******************************************************************************/ acpi_status acpi_ex_convert_to_buffer(union acpi_operand_object *obj_desc, union acpi_operand_object **result_desc) { union acpi_operand_object *return_desc; u8 *new_buf; ACPI_FUNCTION_TRACE_PTR(ex_convert_to_buffer, obj_desc); switch (obj_desc->common.type) { case ACPI_TYPE_BUFFER: /* No conversion necessary */ *result_desc = obj_desc; return_ACPI_STATUS(AE_OK); case ACPI_TYPE_INTEGER: /* * Create a new Buffer object. * Need enough space for one integer */ return_desc = acpi_ut_create_buffer_object(acpi_gbl_integer_byte_width); if (!return_desc) { return_ACPI_STATUS(AE_NO_MEMORY); } /* Copy the integer to the buffer, LSB first */ new_buf = return_desc->buffer.pointer; memcpy(new_buf, &obj_desc->integer.value, acpi_gbl_integer_byte_width); break; case ACPI_TYPE_STRING: /* * Create a new Buffer object * Size will be the string length * * NOTE: Add one to the string length to include the null terminator. * The ACPI spec is unclear on this subject, but there is existing * ASL/AML code that depends on the null being transferred to the new * buffer. */ return_desc = acpi_ut_create_buffer_object((acpi_size) obj_desc->string. length + 1); if (!return_desc) { return_ACPI_STATUS(AE_NO_MEMORY); } /* Copy the string to the buffer */ new_buf = return_desc->buffer.pointer; memcpy((char *)new_buf, (char *)obj_desc->string.pointer, obj_desc->string.length); break; default: return_ACPI_STATUS(AE_TYPE); } /* Mark buffer initialized */ return_desc->common.flags |= AOPOBJ_DATA_VALID; *result_desc = return_desc; return_ACPI_STATUS(AE_OK); } /******************************************************************************* * * FUNCTION: acpi_ex_convert_to_ascii * * PARAMETERS: integer - Value to be converted * base - ACPI_STRING_DECIMAL or ACPI_STRING_HEX * string - Where the string is returned * data_width - Size of data item to be converted, in bytes * leading_zeros - Allow leading zeros * * RETURN: Actual string length * * DESCRIPTION: Convert an ACPI Integer to a hex or decimal string * ******************************************************************************/ static u32 acpi_ex_convert_to_ascii(u64 integer, u16 base, u8 *string, u8 data_width, u8 leading_zeros) { u64 digit; u32 i; u32 j; u32 k = 0; u32 hex_length; u32 decimal_length; u32 remainder; u8 supress_zeros = !leading_zeros; u8 hex_char; ACPI_FUNCTION_ENTRY(); switch (base) { case 10: /* Setup max length for the decimal number */ switch (data_width) { case 1: decimal_length = ACPI_MAX8_DECIMAL_DIGITS; break; case 4: decimal_length = ACPI_MAX32_DECIMAL_DIGITS; break; case 8: default: decimal_length = ACPI_MAX64_DECIMAL_DIGITS; break; } remainder = 0; for (i = decimal_length; i > 0; i--) { /* Divide by nth factor of 10 */ digit = integer; for (j = 0; j < i; j++) { (void)acpi_ut_short_divide(digit, 10, &digit, &remainder); } /* Handle leading zeros */ if (remainder != 0) { supress_zeros = FALSE; } if (!supress_zeros) { string[k] = (u8) (ACPI_ASCII_ZERO + remainder); k++; } } break; case 16: /* hex_length: 2 ascii hex chars per data byte */ hex_length = (data_width * 2); for (i = 0, j = (hex_length - 1); i < hex_length; i++, j--) { /* Get one hex digit, most significant digits first */ hex_char = (u8) acpi_ut_hex_to_ascii_char(integer, ACPI_MUL_4(j)); /* Supress leading zeros until the first non-zero character */ if (hex_char == ACPI_ASCII_ZERO && supress_zeros) { continue; } supress_zeros = FALSE; string[k] = hex_char; k++; } break; default: return (0); } /* * Since leading zeros are suppressed, we must check for the case where * the integer equals 0 * * Finally, null terminate the string and return the length */ if (!k) { string[0] = ACPI_ASCII_ZERO; k = 1; } string[k] = 0; return ((u32) k); } /******************************************************************************* * * FUNCTION: acpi_ex_convert_to_string * * PARAMETERS: obj_desc - Object to be converted. Must be an * Integer, Buffer, or String * result_desc - Where the string object is returned * type - String flags (base and conversion type) * * RETURN: Status * * DESCRIPTION: Convert an ACPI Object to a string. Supports both implicit * and explicit conversions and related rules. * ******************************************************************************/ acpi_status acpi_ex_convert_to_string(union acpi_operand_object * obj_desc, union acpi_operand_object ** result_desc, u32 type) { union acpi_operand_object *return_desc; u8 *new_buf; u32 i; u32 string_length = 0; u16 base = 16; u8 separator = ','; u8 leading_zeros; ACPI_FUNCTION_TRACE_PTR(ex_convert_to_string, obj_desc); switch (obj_desc->common.type) { case ACPI_TYPE_STRING: /* No conversion necessary */ *result_desc = obj_desc; return_ACPI_STATUS(AE_OK); case ACPI_TYPE_INTEGER: switch (type) { case ACPI_EXPLICIT_CONVERT_DECIMAL: /* * From to_decimal_string, integer source. * * Make room for the maximum decimal number size */ string_length = ACPI_MAX_DECIMAL_DIGITS; leading_zeros = FALSE; base = 10; break; case ACPI_EXPLICIT_CONVERT_HEX: /* * From to_hex_string. * * Supress leading zeros and append "0x" */ string_length = ACPI_MUL_2(acpi_gbl_integer_byte_width) + 2; leading_zeros = FALSE; break; default: /* Two hex string characters for each integer byte */ string_length = ACPI_MUL_2(acpi_gbl_integer_byte_width); leading_zeros = TRUE; break; } /* * Create a new String * Need enough space for one ASCII integer (plus null terminator) */ return_desc = acpi_ut_create_string_object((acpi_size)string_length); if (!return_desc) { return_ACPI_STATUS(AE_NO_MEMORY); } new_buf = return_desc->buffer.pointer; if (type == ACPI_EXPLICIT_CONVERT_HEX) { /* Append "0x" prefix for explicit hex conversion */ *new_buf++ = '0'; *new_buf++ = 'x'; } /* Convert integer to string */ string_length = acpi_ex_convert_to_ascii(obj_desc->integer.value, base, new_buf, acpi_gbl_integer_byte_width, leading_zeros); /* Null terminate at the correct place */ return_desc->string.length = string_length; if (type == ACPI_EXPLICIT_CONVERT_HEX) { /* Take "0x" prefix into account */ return_desc->string.length += 2; } new_buf[string_length] = 0; break; case ACPI_TYPE_BUFFER: /* Setup string length, base, and separator */ switch (type) { case ACPI_EXPLICIT_CONVERT_DECIMAL: /* Used by to_decimal_string */ /* * Explicit conversion from the to_decimal_string ASL operator. * * From ACPI: "If the input is a buffer, it is converted to a * a string of decimal values separated by commas." */ leading_zeros = FALSE; base = 10; /* * Calculate the final string length. Individual string values * are variable length (include separator for each) */ for (i = 0; i < obj_desc->buffer.length; i++) { if (obj_desc->buffer.pointer[i] >= 100) { string_length += 4; } else if (obj_desc->buffer.pointer[i] >= 10) { string_length += 3; } else { string_length += 2; } } break; case ACPI_IMPLICIT_CONVERT_HEX: /* * Implicit buffer-to-string conversion * * From the ACPI spec: * "The entire contents of the buffer are converted to a string of * two-character hexadecimal numbers, each separated by a space." * * Each hex number is prefixed with 0x (11/2018) */ leading_zeros = TRUE; separator = ' '; string_length = (obj_desc->buffer.length * 5); break; case ACPI_EXPLICIT_CONVERT_HEX: /* * Explicit conversion from the to_hex_string ASL operator. * * From ACPI: "If Data is a buffer, it is converted to a string of * hexadecimal values separated by commas." * * Each hex number is prefixed with 0x (11/2018) */ leading_zeros = TRUE; separator = ','; string_length = (obj_desc->buffer.length * 5); break; default: return_ACPI_STATUS(AE_BAD_PARAMETER); } /* * Create a new string object and string buffer * (-1 because of extra separator included in string_length from above) * Allow creation of zero-length strings from zero-length buffers. */ if (string_length) { string_length--; } return_desc = acpi_ut_create_string_object((acpi_size)string_length); if (!return_desc) { return_ACPI_STATUS(AE_NO_MEMORY); } new_buf = return_desc->buffer.pointer; /* * Convert buffer bytes to hex or decimal values * (separated by commas or spaces) */ for (i = 0; i < obj_desc->buffer.length; i++) { if (base == 16) { /* Emit 0x prefix for explicit/implicit hex conversion */ *new_buf++ = '0'; *new_buf++ = 'x'; } new_buf += acpi_ex_convert_to_ascii((u64) obj_desc-> buffer.pointer[i], base, new_buf, 1, leading_zeros); /* Each digit is separated by either a comma or space */ *new_buf++ = separator; } /* * Null terminate the string * (overwrites final comma/space from above) */ if (obj_desc->buffer.length) { new_buf--; } *new_buf = 0; break; default: return_ACPI_STATUS(AE_TYPE); } *result_desc = return_desc; return_ACPI_STATUS(AE_OK); } /******************************************************************************* * * FUNCTION: acpi_ex_convert_to_target_type * * PARAMETERS: destination_type - Current type of the destination * source_desc - Source object to be converted. * result_desc - Where the converted object is returned * walk_state - Current method state * * RETURN: Status * * DESCRIPTION: Implements "implicit conversion" rules for storing an object. * ******************************************************************************/ acpi_status acpi_ex_convert_to_target_type(acpi_object_type destination_type, union acpi_operand_object *source_desc, union acpi_operand_object **result_desc, struct acpi_walk_state *walk_state) { acpi_status status = AE_OK; ACPI_FUNCTION_TRACE(ex_convert_to_target_type); /* Default behavior */ *result_desc = source_desc; /* * If required by the target, * perform implicit conversion on the source before we store it. */ switch (GET_CURRENT_ARG_TYPE(walk_state->op_info->runtime_args)) { case ARGI_SIMPLE_TARGET: case ARGI_FIXED_TARGET: case ARGI_INTEGER_REF: /* Handles Increment, Decrement cases */ switch (destination_type) { case ACPI_TYPE_LOCAL_REGION_FIELD: /* * Named field can always handle conversions */ break; default: /* No conversion allowed for these types */ if (destination_type != source_desc->common.type) { ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Explicit operator, will store (%s) over existing type (%s)\n", acpi_ut_get_object_type_name (source_desc), acpi_ut_get_type_name (destination_type))); status = AE_TYPE; } } break; case ARGI_TARGETREF: case ARGI_STORE_TARGET: switch (destination_type) { case ACPI_TYPE_INTEGER: case ACPI_TYPE_BUFFER_FIELD: case ACPI_TYPE_LOCAL_BANK_FIELD: case ACPI_TYPE_LOCAL_INDEX_FIELD: /* * These types require an Integer operand. We can convert * a Buffer or a String to an Integer if necessary. */ status = acpi_ex_convert_to_integer(source_desc, result_desc, ACPI_IMPLICIT_CONVERSION); break; case ACPI_TYPE_STRING: /* * The operand must be a String. We can convert an * Integer or Buffer if necessary */ status = acpi_ex_convert_to_string(source_desc, result_desc, ACPI_IMPLICIT_CONVERT_HEX); break; case ACPI_TYPE_BUFFER: /* * The operand must be a Buffer. We can convert an * Integer or String if necessary */ status = acpi_ex_convert_to_buffer(source_desc, result_desc); break; default: ACPI_ERROR((AE_INFO, "Bad destination type during conversion: 0x%X", destination_type)); status = AE_AML_INTERNAL; break; } break; case ARGI_REFERENCE: /* * create_xxxx_field cases - we are storing the field object into the name */ break; default: ACPI_ERROR((AE_INFO, "Unknown Target type ID 0x%X AmlOpcode 0x%X DestType %s", GET_CURRENT_ARG_TYPE(walk_state->op_info-> runtime_args), walk_state->opcode, acpi_ut_get_type_name(destination_type))); status = AE_AML_INTERNAL; } /* * Source-to-Target conversion semantics: * * If conversion to the target type cannot be performed, then simply * overwrite the target with the new object and type. */ if (status == AE_TYPE) { status = AE_OK; } return_ACPI_STATUS(status); }
232 23 145 23 130 130 96 95 96 96 179 179 179 179 179 179 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 /* SPDX-License-Identifier: GPL-2.0-or-later */ #ifndef _TCP_ECN_H #define _TCP_ECN_H #include <linux/tcp.h> #include <linux/skbuff.h> #include <linux/bitfield.h> #include <net/inet_connection_sock.h> #include <net/sock.h> #include <net/tcp.h> #include <net/inet_ecn.h> /* The highest ECN variant (Accurate ECN, ECN, or no ECN) that is * attemped to be negotiated and requested for incoming connection * and outgoing connection, respectively. */ enum tcp_ecn_mode { TCP_ECN_IN_NOECN_OUT_NOECN = 0, TCP_ECN_IN_ECN_OUT_ECN = 1, TCP_ECN_IN_ECN_OUT_NOECN = 2, TCP_ECN_IN_ACCECN_OUT_ACCECN = 3, TCP_ECN_IN_ACCECN_OUT_ECN = 4, TCP_ECN_IN_ACCECN_OUT_NOECN = 5, }; /* AccECN option sending when AccECN has been successfully negotiated */ enum tcp_accecn_option { TCP_ACCECN_OPTION_DISABLED = 0, TCP_ACCECN_OPTION_MINIMUM = 1, TCP_ACCECN_OPTION_FULL = 2, }; static inline void tcp_ecn_queue_cwr(struct tcp_sock *tp) { /* Do not set CWR if in AccECN mode! */ if (tcp_ecn_mode_rfc3168(tp)) tp->ecn_flags |= TCP_ECN_QUEUE_CWR; } static inline void tcp_ecn_accept_cwr(struct sock *sk, const struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); if (tcp_ecn_mode_rfc3168(tp) && tcp_hdr(skb)->cwr) { tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; /* If the sender is telling us it has entered CWR, then its * cwnd may be very low (even just 1 packet), so we should ACK * immediately. */ if (TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq) inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; } } static inline void tcp_ecn_withdraw_cwr(struct tcp_sock *tp) { tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR; } /* tp->accecn_fail_mode */ #define TCP_ACCECN_ACE_FAIL_SEND BIT(0) #define TCP_ACCECN_ACE_FAIL_RECV BIT(1) #define TCP_ACCECN_OPT_FAIL_SEND BIT(2) #define TCP_ACCECN_OPT_FAIL_RECV BIT(3) static inline bool tcp_accecn_ace_fail_send(const struct tcp_sock *tp) { return tp->accecn_fail_mode & TCP_ACCECN_ACE_FAIL_SEND; } static inline bool tcp_accecn_ace_fail_recv(const struct tcp_sock *tp) { return tp->accecn_fail_mode & TCP_ACCECN_ACE_FAIL_RECV; } static inline bool tcp_accecn_opt_fail_send(const struct tcp_sock *tp) { return tp->accecn_fail_mode & TCP_ACCECN_OPT_FAIL_SEND; } static inline bool tcp_accecn_opt_fail_recv(const struct tcp_sock *tp) { return tp->accecn_fail_mode & TCP_ACCECN_OPT_FAIL_RECV; } static inline void tcp_accecn_fail_mode_set(struct tcp_sock *tp, u8 mode) { tp->accecn_fail_mode |= mode; } #define TCP_ACCECN_OPT_NOT_SEEN 0x0 #define TCP_ACCECN_OPT_EMPTY_SEEN 0x1 #define TCP_ACCECN_OPT_COUNTER_SEEN 0x2 #define TCP_ACCECN_OPT_FAIL_SEEN 0x3 static inline u8 tcp_accecn_ace(const struct tcphdr *th) { return (th->ae << 2) | (th->cwr << 1) | th->ece; } /* Infer the ECT value our SYN arrived with from the echoed ACE field */ static inline int tcp_accecn_extract_syn_ect(u8 ace) { /* Below is an excerpt from the 1st block of Table 2 of AccECN spec */ static const int ace_to_ecn[8] = { INET_ECN_ECT_0, /* 0b000 (Undefined) */ INET_ECN_ECT_1, /* 0b001 (Undefined) */ INET_ECN_NOT_ECT, /* 0b010 (Not-ECT is received) */ INET_ECN_ECT_1, /* 0b011 (ECT-1 is received) */ INET_ECN_ECT_0, /* 0b100 (ECT-0 is received) */ INET_ECN_ECT_1, /* 0b101 (Reserved) */ INET_ECN_CE, /* 0b110 (CE is received) */ INET_ECN_ECT_1 /* 0b111 (Undefined) */ }; return ace_to_ecn[ace & 0x7]; } /* Check ECN field transition to detect invalid transitions */ static inline bool tcp_ect_transition_valid(u8 snt, u8 rcv) { if (rcv == snt) return true; /* Non-ECT altered to something or something became non-ECT */ if (snt == INET_ECN_NOT_ECT || rcv == INET_ECN_NOT_ECT) return false; /* CE -> ECT(0/1)? */ if (snt == INET_ECN_CE) return false; return true; } static inline bool tcp_accecn_validate_syn_feedback(struct sock *sk, u8 ace, u8 sent_ect) { u8 ect = tcp_accecn_extract_syn_ect(ace); struct tcp_sock *tp = tcp_sk(sk); if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback)) return true; if (!tcp_ect_transition_valid(sent_ect, ect)) { tcp_accecn_fail_mode_set(tp, TCP_ACCECN_ACE_FAIL_RECV); return false; } return true; } static inline void tcp_accecn_saw_opt_fail_recv(struct tcp_sock *tp, u8 saw_opt) { tp->saw_accecn_opt = saw_opt; if (tp->saw_accecn_opt == TCP_ACCECN_OPT_FAIL_SEEN) tcp_accecn_fail_mode_set(tp, TCP_ACCECN_OPT_FAIL_RECV); } /* Validate the 3rd ACK based on the ACE field, see Table 4 of AccECN spec */ static inline void tcp_accecn_third_ack(struct sock *sk, const struct sk_buff *skb, u8 sent_ect) { u8 ace = tcp_accecn_ace(tcp_hdr(skb)); struct tcp_sock *tp = tcp_sk(sk); switch (ace) { case 0x0: /* Invalid value */ tcp_accecn_fail_mode_set(tp, TCP_ACCECN_ACE_FAIL_RECV); break; case 0x7: case 0x5: case 0x1: /* Unused but legal values */ break; default: /* Validation only applies to first non-data packet */ if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq && !TCP_SKB_CB(skb)->sacked && tcp_accecn_validate_syn_feedback(sk, ace, sent_ect)) { if ((tcp_accecn_extract_syn_ect(ace) == INET_ECN_CE) && !tp->delivered_ce) tp->delivered_ce++; } break; } } /* Demand the minimum # to send AccECN optnio */ static inline void tcp_accecn_opt_demand_min(struct sock *sk, u8 opt_demand_min) { struct tcp_sock *tp = tcp_sk(sk); u8 opt_demand; opt_demand = max_t(u8, opt_demand_min, tp->accecn_opt_demand); tp->accecn_opt_demand = opt_demand; } /* Maps IP ECN field ECT/CE code point to AccECN option field number, given * we are sending fields with Accurate ECN Order 1: ECT(1), CE, ECT(0). */ static inline u8 tcp_ecnfield_to_accecn_optfield(u8 ecnfield) { switch (ecnfield & INET_ECN_MASK) { case INET_ECN_NOT_ECT: return 0; /* AccECN does not send counts of NOT_ECT */ case INET_ECN_ECT_1: return 1; case INET_ECN_CE: return 2; case INET_ECN_ECT_0: return 3; } return 0; } /* Maps IP ECN field ECT/CE code point to AccECN option field value offset. * Some fields do not start from zero, to detect zeroing by middleboxes. */ static inline u32 tcp_accecn_field_init_offset(u8 ecnfield) { switch (ecnfield & INET_ECN_MASK) { case INET_ECN_NOT_ECT: return 0; /* AccECN does not send counts of NOT_ECT */ case INET_ECN_ECT_1: return TCP_ACCECN_E1B_INIT_OFFSET; case INET_ECN_CE: return TCP_ACCECN_CEB_INIT_OFFSET; case INET_ECN_ECT_0: return TCP_ACCECN_E0B_INIT_OFFSET; } return 0; } /* Maps AccECN option field #nr to IP ECN field ECT/CE bits */ static inline unsigned int tcp_accecn_optfield_to_ecnfield(unsigned int option, bool order) { /* Based on Table 5 of the AccECN spec to map (option, order) to * the corresponding ECN conuters (ECT-1, ECT-0, or CE). */ static const u8 optfield_lookup[2][3] = { /* order = 0: 1st field ECT-0, 2nd field CE, 3rd field ECT-1 */ { INET_ECN_ECT_0, INET_ECN_CE, INET_ECN_ECT_1 }, /* order = 1: 1st field ECT-1, 2nd field CE, 3rd field ECT-0 */ { INET_ECN_ECT_1, INET_ECN_CE, INET_ECN_ECT_0 } }; return optfield_lookup[order][option % 3]; } /* Handles AccECN option ECT and CE 24-bit byte counters update into * the u32 value in tcp_sock. As we're processing TCP options, it is * safe to access from - 1. */ static inline s32 tcp_update_ecn_bytes(u32 *cnt, const char *from, u32 init_offset) { u32 truncated = (get_unaligned_be32(from - 1) - init_offset) & 0xFFFFFFU; u32 delta = (truncated - *cnt) & 0xFFFFFFU; /* If delta has the highest bit set (24th bit) indicating * negative, sign extend to correct an estimation using * sign_extend32(delta, 24 - 1) */ delta = sign_extend32(delta, 23); *cnt += delta; return (s32)delta; } /* Updates Accurate ECN received counters from the received IP ECN field */ static inline void tcp_ecn_received_counters(struct sock *sk, const struct sk_buff *skb, u32 len) { u8 ecnfield = TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK; u8 is_ce = INET_ECN_is_ce(ecnfield); struct tcp_sock *tp = tcp_sk(sk); bool ecn_edge; if (!INET_ECN_is_not_ect(ecnfield)) { u32 pcount = is_ce * max_t(u16, 1, skb_shinfo(skb)->gso_segs); /* As for accurate ECN, the TCP_ECN_SEEN flag is set by * tcp_ecn_received_counters() when the ECN codepoint of * received TCP data or ACK contains ECT(0), ECT(1), or CE. */ if (!tcp_ecn_mode_rfc3168(tp)) tp->ecn_flags |= TCP_ECN_SEEN; /* ACE counter tracks *all* segments including pure ACKs */ tp->received_ce += pcount; tp->received_ce_pending = min(tp->received_ce_pending + pcount, 0xfU); if (len > 0) { u8 minlen = tcp_ecnfield_to_accecn_optfield(ecnfield); u32 oldbytes = tp->received_ecn_bytes[ecnfield - 1]; u32 bytes_mask = GENMASK_U32(31, 22); tp->received_ecn_bytes[ecnfield - 1] += len; tp->accecn_minlen = max_t(u8, tp->accecn_minlen, minlen); /* Send AccECN option at least once per 2^22-byte * increase in any ECN byte counter. */ if ((tp->received_ecn_bytes[ecnfield - 1] ^ oldbytes) & bytes_mask) { tcp_accecn_opt_demand_min(sk, 1); } } } ecn_edge = tp->prev_ecnfield != ecnfield; if (ecn_edge || is_ce) { tp->prev_ecnfield = ecnfield; /* Demand Accurate ECN change-triggered ACKs. Two ACK are * demanded to indicate unambiguously the ecnfield value * in the latter ACK. */ if (tcp_ecn_mode_accecn(tp)) { if (ecn_edge) inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; tp->accecn_opt_demand = 2; } } } /* AccECN specification, 2.2: [...] A Data Receiver maintains four counters * initialized at the start of the half-connection. [...] These byte counters * reflect only the TCP payload length, excluding TCP header and TCP options. */ static inline void tcp_ecn_received_counters_payload(struct sock *sk, const struct sk_buff *skb) { const struct tcphdr *th = (const struct tcphdr *)skb->data; tcp_ecn_received_counters(sk, skb, skb->len - th->doff * 4); } /* AccECN specification, 5.1: [...] a server can determine that it * negotiated AccECN as [...] if the ACK contains an ACE field with * the value 0b010 to 0b111 (decimal 2 to 7). */ static inline bool cookie_accecn_ok(const struct tcphdr *th) { return tcp_accecn_ace(th) > 0x1; } /* Used to form the ACE flags for SYN/ACK */ static inline u16 tcp_accecn_reflector_flags(u8 ect) { /* TCP ACE flags of SYN/ACK are set based on IP-ECN received from SYN. * Below is an excerpt from the 1st block of Table 2 of AccECN spec, * in which TCP ACE flags are encoded as: (AE << 2) | (CWR << 1) | ECE */ static const u8 ecn_to_ace_flags[4] = { 0b010, /* Not-ECT is received */ 0b011, /* ECT(1) is received */ 0b100, /* ECT(0) is received */ 0b110 /* CE is received */ }; return FIELD_PREP(TCPHDR_ACE, ecn_to_ace_flags[ect & 0x3]); } /* AccECN specification, 3.1.2: If a TCP server that implements AccECN * receives a SYN with the three TCP header flags (AE, CWR and ECE) set * to any combination other than 000, 011 or 111, it MUST negotiate the * use of AccECN as if they had been set to 111. */ static inline bool tcp_accecn_syn_requested(const struct tcphdr *th) { u8 ace = tcp_accecn_ace(th); return ace && ace != 0x3; } static inline void __tcp_accecn_init_bytes_counters(int *counter_array) { BUILD_BUG_ON(INET_ECN_ECT_1 != 0x1); BUILD_BUG_ON(INET_ECN_ECT_0 != 0x2); BUILD_BUG_ON(INET_ECN_CE != 0x3); counter_array[INET_ECN_ECT_1 - 1] = 0; counter_array[INET_ECN_ECT_0 - 1] = 0; counter_array[INET_ECN_CE - 1] = 0; } static inline void tcp_accecn_init_counters(struct tcp_sock *tp) { tp->received_ce = 0; tp->received_ce_pending = 0; __tcp_accecn_init_bytes_counters(tp->received_ecn_bytes); __tcp_accecn_init_bytes_counters(tp->delivered_ecn_bytes); tp->accecn_minlen = 0; tp->accecn_opt_demand = 0; tp->est_ecnfield = 0; } /* Used for make_synack to form the ACE flags */ static inline void tcp_accecn_echo_syn_ect(struct tcphdr *th, u8 ect) { /* TCP ACE flags of SYN/ACK are set based on IP-ECN codepoint received * from SYN. Below is an excerpt from Table 2 of the AccECN spec: * +====================+====================================+ * | IP-ECN codepoint | Respective ACE falgs on SYN/ACK | * | received on SYN | AE CWR ECE | * +====================+====================================+ * | Not-ECT | 0 1 0 | * | ECT(1) | 0 1 1 | * | ECT(0) | 1 0 0 | * | CE | 1 1 0 | * +====================+====================================+ */ th->ae = !!(ect & INET_ECN_ECT_0); th->cwr = ect != INET_ECN_ECT_0; th->ece = ect == INET_ECN_ECT_1; } static inline void tcp_accecn_set_ace(struct tcp_sock *tp, struct sk_buff *skb, struct tcphdr *th) { u32 wire_ace; /* The final packet of the 3WHS or anything like it must reflect * the SYN/ACK ECT instead of putting CEP into ACE field, such * case show up in tcp_flags. */ if (likely(!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACE))) { wire_ace = tp->received_ce + TCP_ACCECN_CEP_INIT_OFFSET; th->ece = !!(wire_ace & 0x1); th->cwr = !!(wire_ace & 0x2); th->ae = !!(wire_ace & 0x4); tp->received_ce_pending = 0; } } static inline u8 tcp_accecn_option_init(const struct sk_buff *skb, u8 opt_offset) { u8 *ptr = skb_transport_header(skb) + opt_offset; unsigned int optlen = ptr[1] - 2; if (WARN_ON_ONCE(ptr[0] != TCPOPT_ACCECN0 && ptr[0] != TCPOPT_ACCECN1)) return TCP_ACCECN_OPT_FAIL_SEEN; ptr += 2; /* Detect option zeroing: an AccECN connection "MAY check that the * initial value of the EE0B field or the EE1B field is non-zero" */ if (optlen < TCPOLEN_ACCECN_PERFIELD) return TCP_ACCECN_OPT_EMPTY_SEEN; if (get_unaligned_be24(ptr) == 0) return TCP_ACCECN_OPT_FAIL_SEEN; if (optlen < TCPOLEN_ACCECN_PERFIELD * 3) return TCP_ACCECN_OPT_COUNTER_SEEN; ptr += TCPOLEN_ACCECN_PERFIELD * 2; if (get_unaligned_be24(ptr) == 0) return TCP_ACCECN_OPT_FAIL_SEEN; return TCP_ACCECN_OPT_COUNTER_SEEN; } /* See Table 2 of the AccECN draft */ static inline void tcp_ecn_rcv_synack(struct sock *sk, const struct sk_buff *skb, const struct tcphdr *th, u8 ip_dsfield) { struct tcp_sock *tp = tcp_sk(sk); u8 ace = tcp_accecn_ace(th); switch (ace) { case 0x0: case 0x7: /* +========+========+============+=============+ * | A | B | SYN/ACK | Feedback | * | | | B->A | Mode of A | * | | | AE CWR ECE | | * +========+========+============+=============+ * | AccECN | No ECN | 0 0 0 | Not ECN | * | AccECN | Broken | 1 1 1 | Not ECN | * +========+========+============+=============+ */ tcp_ecn_mode_set(tp, TCP_ECN_DISABLED); break; case 0x1: case 0x5: /* +========+========+============+=============+ * | A | B | SYN/ACK | Feedback | * | | | B->A | Mode of A | * | | | AE CWR ECE | | * +========+========+============+=============+ * | AccECN | Nonce | 1 0 1 | (Reserved) | * | AccECN | ECN | 0 0 1 | Classic ECN | * | Nonce | AccECN | 0 0 1 | Classic ECN | * | ECN | AccECN | 0 0 1 | Classic ECN | * +========+========+============+=============+ */ if (tcp_ecn_mode_pending(tp)) /* Downgrade from AccECN, or requested initially */ tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168); break; default: tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); tp->syn_ect_rcv = ip_dsfield & INET_ECN_MASK; if (tp->rx_opt.accecn && tp->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) { u8 saw_opt = tcp_accecn_option_init(skb, tp->rx_opt.accecn); tcp_accecn_saw_opt_fail_recv(tp, saw_opt); tp->accecn_opt_demand = 2; } if (INET_ECN_is_ce(ip_dsfield) && tcp_accecn_validate_syn_feedback(sk, ace, tp->syn_ect_snt)) { tp->received_ce++; tp->received_ce_pending++; } break; } } static inline void tcp_ecn_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th, const struct sk_buff *skb) { if (tcp_ecn_mode_pending(tp)) { if (!tcp_accecn_syn_requested(th)) { /* Downgrade to classic ECN feedback */ tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168); } else { tp->syn_ect_rcv = TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK; tp->prev_ecnfield = tp->syn_ect_rcv; tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); } } if (tcp_ecn_mode_rfc3168(tp) && (!th->ece || !th->cwr)) tcp_ecn_mode_set(tp, TCP_ECN_DISABLED); } static inline bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th) { if (th->ece && !th->syn && tcp_ecn_mode_rfc3168(tp)) return true; return false; } /* Packet ECN state for a SYN-ACK */ static inline void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR; if (tcp_ecn_disabled(tp)) TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE; else if (tcp_ca_needs_ecn(sk) || tcp_bpf_ca_needs_ecn(sk)) INET_ECN_xmit(sk); if (tp->ecn_flags & TCP_ECN_MODE_ACCECN) { TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ACE; TCP_SKB_CB(skb)->tcp_flags |= tcp_accecn_reflector_flags(tp->syn_ect_rcv); tp->syn_ect_snt = inet_sk(sk)->tos & INET_ECN_MASK; } } /* Packet ECN state for a SYN. */ static inline void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk); bool use_ecn, use_accecn; u8 tcp_ecn = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn); use_accecn = tcp_ecn == TCP_ECN_IN_ACCECN_OUT_ACCECN; use_ecn = tcp_ecn == TCP_ECN_IN_ECN_OUT_ECN || tcp_ecn == TCP_ECN_IN_ACCECN_OUT_ECN || tcp_ca_needs_ecn(sk) || bpf_needs_ecn || use_accecn; if (!use_ecn) { const struct dst_entry *dst = __sk_dst_get(sk); if (dst && dst_feature(dst, RTAX_FEATURE_ECN)) use_ecn = true; } tp->ecn_flags = 0; if (use_ecn) { if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn) INET_ECN_xmit(sk); TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR; if (use_accecn) { TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_AE; tcp_ecn_mode_set(tp, TCP_ECN_MODE_PENDING); tp->syn_ect_snt = inet_sk(sk)->tos & INET_ECN_MASK; } else { tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168); } } } static inline void tcp_ecn_clear_syn(struct sock *sk, struct sk_buff *skb) { if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback)) { /* tp->ecn_flags are cleared at a later point in time when * SYN ACK is ultimatively being received. */ TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ACE; } } static inline void tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th) { if (tcp_rsk(req)->accecn_ok) tcp_accecn_echo_syn_ect(th, tcp_rsk(req)->syn_ect_rcv); else if (inet_rsk(req)->ecn_ok) th->ece = 1; } static inline bool tcp_accecn_option_beacon_check(const struct sock *sk) { u32 ecn_beacon = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option_beacon); const struct tcp_sock *tp = tcp_sk(sk); if (!ecn_beacon) return false; return tcp_stamp_us_delta(tp->tcp_mstamp, tp->accecn_opt_tstamp) * ecn_beacon >= (tp->srtt_us >> 3); } #endif /* _LINUX_TCP_ECN_H */
22 22 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 /* SPDX-License-Identifier: GPL-2.0 */ /* * Task I/O accounting operations */ #ifndef __TASK_IO_ACCOUNTING_OPS_INCLUDED #define __TASK_IO_ACCOUNTING_OPS_INCLUDED #include <linux/sched.h> #ifdef CONFIG_TASK_IO_ACCOUNTING static inline void task_io_account_read(size_t bytes) { current->ioac.read_bytes += bytes; } /* * We approximate number of blocks, because we account bytes only. * A 'block' is 512 bytes */ static inline unsigned long task_io_get_inblock(const struct task_struct *p) { return p->ioac.read_bytes >> 9; } static inline void task_io_account_write(size_t bytes) { current->ioac.write_bytes += bytes; } /* * We approximate number of blocks, because we account bytes only. * A 'block' is 512 bytes */ static inline unsigned long task_io_get_oublock(const struct task_struct *p) { return p->ioac.write_bytes >> 9; } static inline void task_io_account_cancelled_write(size_t bytes) { current->ioac.cancelled_write_bytes += bytes; } static inline void task_io_accounting_init(struct task_io_accounting *ioac) { memset(ioac, 0, sizeof(*ioac)); } static inline void task_blk_io_accounting_add(struct task_io_accounting *dst, struct task_io_accounting *src) { dst->read_bytes += src->read_bytes; dst->write_bytes += src->write_bytes; dst->cancelled_write_bytes += src->cancelled_write_bytes; } #else static inline void task_io_account_read(size_t bytes) { } static inline unsigned long task_io_get_inblock(const struct task_struct *p) { return 0; } static inline void task_io_account_write(size_t bytes) { } static inline unsigned long task_io_get_oublock(const struct task_struct *p) { return 0; } static inline void task_io_account_cancelled_write(size_t bytes) { } static inline void task_io_accounting_init(struct task_io_accounting *ioac) { } static inline void task_blk_io_accounting_add(struct task_io_accounting *dst, struct task_io_accounting *src) { } #endif /* CONFIG_TASK_IO_ACCOUNTING */ #ifdef CONFIG_TASK_XACCT static inline void task_chr_io_accounting_add(struct task_io_accounting *dst, struct task_io_accounting *src) { dst->rchar += src->rchar; dst->wchar += src->wchar; dst->syscr += src->syscr; dst->syscw += src->syscw; } #else static inline void task_chr_io_accounting_add(struct task_io_accounting *dst, struct task_io_accounting *src) { } #endif /* CONFIG_TASK_XACCT */ static inline void task_io_accounting_add(struct task_io_accounting *dst, struct task_io_accounting *src) { task_chr_io_accounting_add(dst, src); task_blk_io_accounting_add(dst, src); } #endif /* __TASK_IO_ACCOUNTING_OPS_INCLUDED */
79 79 2 77 79 71 80 80 80 80 80 78 79 3 79 79 75 7 92 90 91 90 90 91 78 90 86 1 8 80 80 80 77 75 77 84 84 89 99 98 97 93 2 92 26 94 97 85 14 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 // SPDX-License-Identifier: GPL-2.0-or-later /* * linux/mm/process_vm_access.c * * Copyright (C) 2010-2011 Christopher Yeoh <cyeoh@au1.ibm.com>, IBM Corp. */ #include <linux/compat.h> #include <linux/mm.h> #include <linux/uio.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/highmem.h> #include <linux/ptrace.h> #include <linux/slab.h> #include <linux/syscalls.h> /** * process_vm_rw_pages - read/write pages from task specified * @pages: array of pointers to pages we want to copy * @offset: offset in page to start copying from/to * @len: number of bytes to copy * @iter: where to copy to/from locally * @vm_write: 0 means copy from, 1 means copy to * Returns 0 on success, error code otherwise */ static int process_vm_rw_pages(struct page **pages, unsigned offset, size_t len, struct iov_iter *iter, int vm_write) { /* Do the copy for each page */ while (len && iov_iter_count(iter)) { struct page *page = *pages++; size_t copy = PAGE_SIZE - offset; size_t copied; if (copy > len) copy = len; if (vm_write) copied = copy_page_from_iter(page, offset, copy, iter); else copied = copy_page_to_iter(page, offset, copy, iter); len -= copied; if (copied < copy && iov_iter_count(iter)) return -EFAULT; offset = 0; } return 0; } /* Maximum number of pages kmalloc'd to hold struct page's during copy */ #define PVM_MAX_KMALLOC_PAGES 2 /* Maximum number of pages that can be stored at a time */ #define PVM_MAX_USER_PAGES (PVM_MAX_KMALLOC_PAGES * PAGE_SIZE / sizeof(struct page *)) /** * process_vm_rw_single_vec - read/write pages from task specified * @addr: start memory address of target process * @len: size of area to copy to/from * @iter: where to copy to/from locally * @process_pages: struct pages area that can store at least * nr_pages_to_copy struct page pointers * @mm: mm for task * @task: task to read/write from * @vm_write: 0 means copy from, 1 means copy to * Returns 0 on success or on failure error code */ static int process_vm_rw_single_vec(unsigned long addr, unsigned long len, struct iov_iter *iter, struct page **process_pages, struct mm_struct *mm, struct task_struct *task, int vm_write) { unsigned long pa = addr & PAGE_MASK; unsigned long start_offset = addr - pa; unsigned long nr_pages; ssize_t rc = 0; unsigned int flags = 0; /* Work out address and page range required */ if (len == 0) return 0; nr_pages = (addr + len - 1) / PAGE_SIZE - addr / PAGE_SIZE + 1; if (vm_write) flags |= FOLL_WRITE; while (!rc && nr_pages && iov_iter_count(iter)) { int pinned_pages = min_t(unsigned long, nr_pages, PVM_MAX_USER_PAGES); int locked = 1; size_t bytes; /* * Get the pages we're interested in. We must * access remotely because task/mm might not * current/current->mm */ mmap_read_lock(mm); pinned_pages = pin_user_pages_remote(mm, pa, pinned_pages, flags, process_pages, &locked); if (locked) mmap_read_unlock(mm); if (pinned_pages <= 0) return -EFAULT; bytes = pinned_pages * PAGE_SIZE - start_offset; if (bytes > len) bytes = len; rc = process_vm_rw_pages(process_pages, start_offset, bytes, iter, vm_write); len -= bytes; start_offset = 0; nr_pages -= pinned_pages; pa += pinned_pages * PAGE_SIZE; /* If vm_write is set, the pages need to be made dirty: */ unpin_user_pages_dirty_lock(process_pages, pinned_pages, vm_write); } return rc; } /* Maximum number of entries for process pages array which lives on stack */ #define PVM_MAX_PP_ARRAY_COUNT 16 /** * process_vm_rw_core - core of reading/writing pages from task specified * @pid: PID of process to read/write from/to * @iter: where to copy to/from locally * @rvec: iovec array specifying where to copy to/from in the other process * @riovcnt: size of rvec array * @flags: currently unused * @vm_write: 0 if reading from other process, 1 if writing to other process * * Returns the number of bytes read/written or error code. May * return less bytes than expected if an error occurs during the copying * process. */ static ssize_t process_vm_rw_core(pid_t pid, struct iov_iter *iter, const struct iovec *rvec, unsigned long riovcnt, unsigned long flags, int vm_write) { struct task_struct *task; struct page *pp_stack[PVM_MAX_PP_ARRAY_COUNT]; struct page **process_pages = pp_stack; struct mm_struct *mm; unsigned long i; ssize_t rc = 0; unsigned long nr_pages = 0; unsigned long nr_pages_iov; ssize_t iov_len; size_t total_len = iov_iter_count(iter); /* * Work out how many pages of struct pages we're going to need * when eventually calling get_user_pages */ for (i = 0; i < riovcnt; i++) { iov_len = rvec[i].iov_len; if (iov_len > 0) { nr_pages_iov = ((unsigned long)rvec[i].iov_base + iov_len - 1) / PAGE_SIZE - (unsigned long)rvec[i].iov_base / PAGE_SIZE + 1; nr_pages = max(nr_pages, nr_pages_iov); } } if (nr_pages == 0) return 0; if (nr_pages > PVM_MAX_PP_ARRAY_COUNT) { /* For reliability don't try to kmalloc more than 2 pages worth */ process_pages = kmalloc(min_t(size_t, PVM_MAX_KMALLOC_PAGES * PAGE_SIZE, sizeof(struct page *)*nr_pages), GFP_KERNEL); if (!process_pages) return -ENOMEM; } /* Get process information */ task = find_get_task_by_vpid(pid); if (!task) { rc = -ESRCH; goto free_proc_pages; } mm = mm_access(task, PTRACE_MODE_ATTACH_REALCREDS); if (IS_ERR(mm)) { rc = PTR_ERR(mm); /* * Explicitly map EACCES to EPERM as EPERM is a more * appropriate error code for process_vw_readv/writev */ if (rc == -EACCES) rc = -EPERM; goto put_task_struct; } for (i = 0; i < riovcnt && iov_iter_count(iter) && !rc; i++) rc = process_vm_rw_single_vec( (unsigned long)rvec[i].iov_base, rvec[i].iov_len, iter, process_pages, mm, task, vm_write); /* copied = space before - space after */ total_len -= iov_iter_count(iter); /* If we have managed to copy any data at all then we return the number of bytes copied. Otherwise we return the error code */ if (total_len) rc = total_len; mmput(mm); put_task_struct: put_task_struct(task); free_proc_pages: if (process_pages != pp_stack) kfree(process_pages); return rc; } /** * process_vm_rw - check iovecs before calling core routine * @pid: PID of process to read/write from/to * @lvec: iovec array specifying where to copy to/from locally * @liovcnt: size of lvec array * @rvec: iovec array specifying where to copy to/from in the other process * @riovcnt: size of rvec array * @flags: currently unused * @vm_write: 0 if reading from other process, 1 if writing to other process * * Returns the number of bytes read/written or error code. May * return less bytes than expected if an error occurs during the copying * process. */ static ssize_t process_vm_rw(pid_t pid, const struct iovec __user *lvec, unsigned long liovcnt, const struct iovec __user *rvec, unsigned long riovcnt, unsigned long flags, int vm_write) { struct iovec iovstack_l[UIO_FASTIOV]; struct iovec iovstack_r[UIO_FASTIOV]; struct iovec *iov_l = iovstack_l; struct iovec *iov_r; struct iov_iter iter; ssize_t rc; int dir = vm_write ? ITER_SOURCE : ITER_DEST; if (flags != 0) return -EINVAL; /* Check iovecs */ rc = import_iovec(dir, lvec, liovcnt, UIO_FASTIOV, &iov_l, &iter); if (rc < 0) return rc; if (!iov_iter_count(&iter)) goto free_iov_l; iov_r = iovec_from_user(rvec, riovcnt, UIO_FASTIOV, iovstack_r, in_compat_syscall()); if (IS_ERR(iov_r)) { rc = PTR_ERR(iov_r); goto free_iov_l; } rc = process_vm_rw_core(pid, &iter, iov_r, riovcnt, flags, vm_write); if (iov_r != iovstack_r) kfree(iov_r); free_iov_l: kfree(iov_l); return rc; } SYSCALL_DEFINE6(process_vm_readv, pid_t, pid, const struct iovec __user *, lvec, unsigned long, liovcnt, const struct iovec __user *, rvec, unsigned long, riovcnt, unsigned long, flags) { return process_vm_rw(pid, lvec, liovcnt, rvec, riovcnt, flags, 0); } SYSCALL_DEFINE6(process_vm_writev, pid_t, pid, const struct iovec __user *, lvec, unsigned long, liovcnt, const struct iovec __user *, rvec, unsigned long, riovcnt, unsigned long, flags) { return process_vm_rw(pid, lvec, liovcnt, rvec, riovcnt, flags, 1); }
16 28 31 27 23 24 8 38 33 6 15 11 3 3 25 7 5 5 3 25 8 8 8 3 8 10 11 11 3 3 2 11 11 1 4 3 2 2 9 9 8 12 12 162 170 167 168 166 7676 7657 7687 6697 3 2 6688 7 7672 7670 7754 1337 1352 1340 1211 1315 1 1 360 362 350 1 349 348 296 296 296 349 361 77 78 2539 2529 2561 2527 2539 1247 1344 1352 2463 1725 1708 2458 2466 2293 2308 2301 2155 835 1376 3423 3417 3455 3420 3076 3417 1961 3420 1243 2291 2308 1739 1722 3067 1886 4862 2030 2025 1876 141 1979 1978 2011 3228 3213 2337 904 2865 2973 3207 559 551 561 561 241 238 242 242 571 575 70 570 307 261 543 543 553 686 679 680 368 320 374 468 468 296 631 629 631 626 623 621 308 369 507 462 594 596 576 581 578 580 573 574 573 266 321 470 356 569 571 572 508 504 415 109 195 354 283 487 488 549 548 466 88 235 343 335 543 544 142 142 134 131 35 134 39 41 37 36 19 40 442 545 132 72 63 9 32 13 4 9 970 921 970 969 951 17 968 964 974 965 967 966 2 1 962 760 756 211 908 689 688 693 691 6 685 906 305 977 17 926 954 14 14 13 13 13 10 10 5 9 2 8 14 16 17 17 7 7 2 2 2 2 2 2 2 2 2 2 1 1 1 1 2 3 21 20 20 20 18 4 14 19 7 12 18 17 1 1 1 1 172 173 4 170 171 170 171 173 165 164 31 163 164 165 165 165 163 104 20 19 19 17 20 15 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/read_write.c * * Copyright (C) 1991, 1992 Linus Torvalds */ #include <linux/slab.h> #include <linux/stat.h> #include <linux/sched/xacct.h> #include <linux/fcntl.h> #include <linux/file.h> #include <linux/uio.h> #include <linux/fsnotify.h> #include <linux/security.h> #include <linux/export.h> #include <linux/syscalls.h> #include <linux/pagemap.h> #include <linux/splice.h> #include <linux/compat.h> #include <linux/mount.h> #include <linux/fs.h> #include "internal.h" #include <linux/uaccess.h> #include <asm/unistd.h> const struct file_operations generic_ro_fops = { .llseek = generic_file_llseek, .read_iter = generic_file_read_iter, .mmap_prepare = generic_file_readonly_mmap_prepare, .splice_read = filemap_splice_read, }; EXPORT_SYMBOL(generic_ro_fops); static inline bool unsigned_offsets(struct file *file) { return file->f_op->fop_flags & FOP_UNSIGNED_OFFSET; } /** * vfs_setpos_cookie - update the file offset for lseek and reset cookie * @file: file structure in question * @offset: file offset to seek to * @maxsize: maximum file size * @cookie: cookie to reset * * Update the file offset to the value specified by @offset if the given * offset is valid and it is not equal to the current file offset and * reset the specified cookie to indicate that a seek happened. * * Return the specified offset on success and -EINVAL on invalid offset. */ static loff_t vfs_setpos_cookie(struct file *file, loff_t offset, loff_t maxsize, u64 *cookie) { if (offset < 0 && !unsigned_offsets(file)) return -EINVAL; if (offset > maxsize) return -EINVAL; if (offset != file->f_pos) { file->f_pos = offset; if (cookie) *cookie = 0; } return offset; } /** * vfs_setpos - update the file offset for lseek * @file: file structure in question * @offset: file offset to seek to * @maxsize: maximum file size * * This is a low-level filesystem helper for updating the file offset to * the value specified by @offset if the given offset is valid and it is * not equal to the current file offset. * * Return the specified offset on success and -EINVAL on invalid offset. */ loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize) { return vfs_setpos_cookie(file, offset, maxsize, NULL); } EXPORT_SYMBOL(vfs_setpos); /** * must_set_pos - check whether f_pos has to be updated * @file: file to seek on * @offset: offset to use * @whence: type of seek operation * @eof: end of file * * Check whether f_pos needs to be updated and update @offset according * to @whence. * * Return: 0 if f_pos doesn't need to be updated, 1 if f_pos has to be * updated, and negative error code on failure. */ static int must_set_pos(struct file *file, loff_t *offset, int whence, loff_t eof) { switch (whence) { case SEEK_END: *offset += eof; break; case SEEK_CUR: /* * Here we special-case the lseek(fd, 0, SEEK_CUR) * position-querying operation. Avoid rewriting the "same" * f_pos value back to the file because a concurrent read(), * write() or lseek() might have altered it */ if (*offset == 0) { *offset = file->f_pos; return 0; } break; case SEEK_DATA: /* * In the generic case the entire file is data, so as long as * offset isn't at the end of the file then the offset is data. */ if ((unsigned long long)*offset >= eof) return -ENXIO; break; case SEEK_HOLE: /* * There is a virtual hole at the end of the file, so as long as * offset isn't i_size or larger, return i_size. */ if ((unsigned long long)*offset >= eof) return -ENXIO; *offset = eof; break; } return 1; } /** * generic_file_llseek_size - generic llseek implementation for regular files * @file: file structure to seek on * @offset: file offset to seek to * @whence: type of seek * @maxsize: max size of this file in file system * @eof: offset used for SEEK_END position * * This is a variant of generic_file_llseek that allows passing in a custom * maximum file size and a custom EOF position, for e.g. hashed directories * * Synchronization: * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms) * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes. * read/writes behave like SEEK_SET against seeks. */ loff_t generic_file_llseek_size(struct file *file, loff_t offset, int whence, loff_t maxsize, loff_t eof) { int ret; ret = must_set_pos(file, &offset, whence, eof); if (ret < 0) return ret; if (ret == 0) return offset; if (whence == SEEK_CUR) { /* * If the file requires locking via f_pos_lock we know * that mutual exclusion for SEEK_CUR on the same file * is guaranteed. If the file isn't locked, we take * f_lock to protect against f_pos races with other * SEEK_CURs. */ if (file_seek_cur_needs_f_lock(file)) { guard(spinlock)(&file->f_lock); return vfs_setpos(file, file->f_pos + offset, maxsize); } return vfs_setpos(file, file->f_pos + offset, maxsize); } return vfs_setpos(file, offset, maxsize); } EXPORT_SYMBOL(generic_file_llseek_size); /** * generic_llseek_cookie - versioned llseek implementation * @file: file structure to seek on * @offset: file offset to seek to * @whence: type of seek * @cookie: cookie to update * * See generic_file_llseek for a general description and locking assumptions. * * In contrast to generic_file_llseek, this function also resets a * specified cookie to indicate a seek took place. */ loff_t generic_llseek_cookie(struct file *file, loff_t offset, int whence, u64 *cookie) { struct inode *inode = file->f_mapping->host; loff_t maxsize = inode->i_sb->s_maxbytes; loff_t eof = i_size_read(inode); int ret; if (WARN_ON_ONCE(!cookie)) return -EINVAL; /* * Require that this is only used for directories that guarantee * synchronization between readdir and seek so that an update to * @cookie is correctly synchronized with concurrent readdir. */ if (WARN_ON_ONCE(!(file->f_mode & FMODE_ATOMIC_POS))) return -EINVAL; ret = must_set_pos(file, &offset, whence, eof); if (ret < 0) return ret; if (ret == 0) return offset; /* No need to hold f_lock because we know that f_pos_lock is held. */ if (whence == SEEK_CUR) return vfs_setpos_cookie(file, file->f_pos + offset, maxsize, cookie); return vfs_setpos_cookie(file, offset, maxsize, cookie); } EXPORT_SYMBOL(generic_llseek_cookie); /** * generic_file_llseek - generic llseek implementation for regular files * @file: file structure to seek on * @offset: file offset to seek to * @whence: type of seek * * This is a generic implementation of ->llseek useable for all normal local * filesystems. It just updates the file offset to the value specified by * @offset and @whence. */ loff_t generic_file_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; return generic_file_llseek_size(file, offset, whence, inode->i_sb->s_maxbytes, i_size_read(inode)); } EXPORT_SYMBOL(generic_file_llseek); /** * fixed_size_llseek - llseek implementation for fixed-sized devices * @file: file structure to seek on * @offset: file offset to seek to * @whence: type of seek * @size: size of the file * */ loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t size) { switch (whence) { case SEEK_SET: case SEEK_CUR: case SEEK_END: return generic_file_llseek_size(file, offset, whence, size, size); default: return -EINVAL; } } EXPORT_SYMBOL(fixed_size_llseek); /** * no_seek_end_llseek - llseek implementation for fixed-sized devices * @file: file structure to seek on * @offset: file offset to seek to * @whence: type of seek * */ loff_t no_seek_end_llseek(struct file *file, loff_t offset, int whence) { switch (whence) { case SEEK_SET: case SEEK_CUR: return generic_file_llseek_size(file, offset, whence, OFFSET_MAX, 0); default: return -EINVAL; } } EXPORT_SYMBOL(no_seek_end_llseek); /** * no_seek_end_llseek_size - llseek implementation for fixed-sized devices * @file: file structure to seek on * @offset: file offset to seek to * @whence: type of seek * @size: maximal offset allowed * */ loff_t no_seek_end_llseek_size(struct file *file, loff_t offset, int whence, loff_t size) { switch (whence) { case SEEK_SET: case SEEK_CUR: return generic_file_llseek_size(file, offset, whence, size, 0); default: return -EINVAL; } } EXPORT_SYMBOL(no_seek_end_llseek_size); /** * noop_llseek - No Operation Performed llseek implementation * @file: file structure to seek on * @offset: file offset to seek to * @whence: type of seek * * This is an implementation of ->llseek useable for the rare special case when * userspace expects the seek to succeed but the (device) file is actually not * able to perform the seek. In this case you use noop_llseek() instead of * falling back to the default implementation of ->llseek. */ loff_t noop_llseek(struct file *file, loff_t offset, int whence) { return file->f_pos; } EXPORT_SYMBOL(noop_llseek); loff_t default_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file_inode(file); loff_t retval; retval = inode_lock_killable(inode); if (retval) return retval; switch (whence) { case SEEK_END: offset += i_size_read(inode); break; case SEEK_CUR: if (offset == 0) { retval = file->f_pos; goto out; } offset += file->f_pos; break; case SEEK_DATA: /* * In the generic case the entire file is data, so as * long as offset isn't at the end of the file then the * offset is data. */ if (offset >= inode->i_size) { retval = -ENXIO; goto out; } break; case SEEK_HOLE: /* * There is a virtual hole at the end of the file, so * as long as offset isn't i_size or larger, return * i_size. */ if (offset >= inode->i_size) { retval = -ENXIO; goto out; } offset = inode->i_size; break; } retval = -EINVAL; if (offset >= 0 || unsigned_offsets(file)) { if (offset != file->f_pos) file->f_pos = offset; retval = offset; } out: inode_unlock(inode); return retval; } EXPORT_SYMBOL(default_llseek); loff_t vfs_llseek(struct file *file, loff_t offset, int whence) { if (!(file->f_mode & FMODE_LSEEK)) return -ESPIPE; return file->f_op->llseek(file, offset, whence); } EXPORT_SYMBOL(vfs_llseek); static off_t ksys_lseek(unsigned int fd, off_t offset, unsigned int whence) { off_t retval; CLASS(fd_pos, f)(fd); if (fd_empty(f)) return -EBADF; retval = -EINVAL; if (whence <= SEEK_MAX) { loff_t res = vfs_llseek(fd_file(f), offset, whence); retval = res; if (res != (loff_t)retval) retval = -EOVERFLOW; /* LFS: should only happen on 32 bit platforms */ } return retval; } SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence) { return ksys_lseek(fd, offset, whence); } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned int, whence) { return ksys_lseek(fd, offset, whence); } #endif #if !defined(CONFIG_64BIT) || defined(CONFIG_COMPAT) || \ defined(__ARCH_WANT_SYS_LLSEEK) SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high, unsigned long, offset_low, loff_t __user *, result, unsigned int, whence) { int retval; CLASS(fd_pos, f)(fd); loff_t offset; if (fd_empty(f)) return -EBADF; if (whence > SEEK_MAX) return -EINVAL; offset = vfs_llseek(fd_file(f), ((loff_t) offset_high << 32) | offset_low, whence); retval = (int)offset; if (offset >= 0) { retval = -EFAULT; if (!copy_to_user(result, &offset, sizeof(offset))) retval = 0; } return retval; } #endif int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count) { int mask = read_write == READ ? MAY_READ : MAY_WRITE; int ret; if (unlikely((ssize_t) count < 0)) return -EINVAL; if (ppos) { loff_t pos = *ppos; if (unlikely(pos < 0)) { if (!unsigned_offsets(file)) return -EINVAL; if (count >= -pos) /* both values are in 0..LLONG_MAX */ return -EOVERFLOW; } else if (unlikely((loff_t) (pos + count) < 0)) { if (!unsigned_offsets(file)) return -EINVAL; } } ret = security_file_permission(file, mask); if (ret) return ret; return fsnotify_file_area_perm(file, mask, ppos, count); } EXPORT_SYMBOL(rw_verify_area); static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) { struct kiocb kiocb; struct iov_iter iter; ssize_t ret; init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = (ppos ? *ppos : 0); iov_iter_ubuf(&iter, ITER_DEST, buf, len); ret = filp->f_op->read_iter(&kiocb, &iter); BUG_ON(ret == -EIOCBQUEUED); if (ppos) *ppos = kiocb.ki_pos; return ret; } static int warn_unsupported(struct file *file, const char *op) { pr_warn_ratelimited( "kernel %s not supported for file %pD4 (pid: %d comm: %.20s)\n", op, file, current->pid, current->comm); return -EINVAL; } ssize_t __kernel_read(struct file *file, void *buf, size_t count, loff_t *pos) { struct kvec iov = { .iov_base = buf, .iov_len = min_t(size_t, count, MAX_RW_COUNT), }; struct kiocb kiocb; struct iov_iter iter; ssize_t ret; if (WARN_ON_ONCE(!(file->f_mode & FMODE_READ))) return -EINVAL; if (!(file->f_mode & FMODE_CAN_READ)) return -EINVAL; /* * Also fail if ->read_iter and ->read are both wired up as that * implies very convoluted semantics. */ if (unlikely(!file->f_op->read_iter || file->f_op->read)) return warn_unsupported(file, "read"); init_sync_kiocb(&kiocb, file); kiocb.ki_pos = pos ? *pos : 0; iov_iter_kvec(&iter, ITER_DEST, &iov, 1, iov.iov_len); ret = file->f_op->read_iter(&kiocb, &iter); if (ret > 0) { if (pos) *pos = kiocb.ki_pos; fsnotify_access(file); add_rchar(current, ret); } inc_syscr(current); return ret; } ssize_t kernel_read(struct file *file, void *buf, size_t count, loff_t *pos) { ssize_t ret; ret = rw_verify_area(READ, file, pos, count); if (ret) return ret; return __kernel_read(file, buf, count, pos); } EXPORT_SYMBOL(kernel_read); ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos) { ssize_t ret; if (!(file->f_mode & FMODE_READ)) return -EBADF; if (!(file->f_mode & FMODE_CAN_READ)) return -EINVAL; if (unlikely(!access_ok(buf, count))) return -EFAULT; ret = rw_verify_area(READ, file, pos, count); if (ret) return ret; if (count > MAX_RW_COUNT) count = MAX_RW_COUNT; if (file->f_op->read) ret = file->f_op->read(file, buf, count, pos); else if (file->f_op->read_iter) ret = new_sync_read(file, buf, count, pos); else ret = -EINVAL; if (ret > 0) { fsnotify_access(file); add_rchar(current, ret); } inc_syscr(current); return ret; } static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos) { struct kiocb kiocb; struct iov_iter iter; ssize_t ret; init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = (ppos ? *ppos : 0); iov_iter_ubuf(&iter, ITER_SOURCE, (void __user *)buf, len); ret = filp->f_op->write_iter(&kiocb, &iter); BUG_ON(ret == -EIOCBQUEUED); if (ret > 0 && ppos) *ppos = kiocb.ki_pos; return ret; } /* caller is responsible for file_start_write/file_end_write */ ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *pos) { struct kiocb kiocb; ssize_t ret; if (WARN_ON_ONCE(!(file->f_mode & FMODE_WRITE))) return -EBADF; if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; /* * Also fail if ->write_iter and ->write are both wired up as that * implies very convoluted semantics. */ if (unlikely(!file->f_op->write_iter || file->f_op->write)) return warn_unsupported(file, "write"); init_sync_kiocb(&kiocb, file); kiocb.ki_pos = pos ? *pos : 0; ret = file->f_op->write_iter(&kiocb, from); if (ret > 0) { if (pos) *pos = kiocb.ki_pos; fsnotify_modify(file); add_wchar(current, ret); } inc_syscw(current); return ret; } /* caller is responsible for file_start_write/file_end_write */ ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos) { struct kvec iov = { .iov_base = (void *)buf, .iov_len = min_t(size_t, count, MAX_RW_COUNT), }; struct iov_iter iter; iov_iter_kvec(&iter, ITER_SOURCE, &iov, 1, iov.iov_len); return __kernel_write_iter(file, &iter, pos); } /* * This "EXPORT_SYMBOL_GPL()" is more of a "EXPORT_SYMBOL_DONTUSE()", * but autofs is one of the few internal kernel users that actually * wants this _and_ can be built as a module. So we need to export * this symbol for autofs, even though it really isn't appropriate * for any other kernel modules. */ EXPORT_SYMBOL_GPL(__kernel_write); ssize_t kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos) { ssize_t ret; ret = rw_verify_area(WRITE, file, pos, count); if (ret) return ret; file_start_write(file); ret = __kernel_write(file, buf, count, pos); file_end_write(file); return ret; } EXPORT_SYMBOL(kernel_write); ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos) { ssize_t ret; if (!(file->f_mode & FMODE_WRITE)) return -EBADF; if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; if (unlikely(!access_ok(buf, count))) return -EFAULT; ret = rw_verify_area(WRITE, file, pos, count); if (ret) return ret; if (count > MAX_RW_COUNT) count = MAX_RW_COUNT; file_start_write(file); if (file->f_op->write) ret = file->f_op->write(file, buf, count, pos); else if (file->f_op->write_iter) ret = new_sync_write(file, buf, count, pos); else ret = -EINVAL; if (ret > 0) { fsnotify_modify(file); add_wchar(current, ret); } inc_syscw(current); file_end_write(file); return ret; } /* file_ppos returns &file->f_pos or NULL if file is stream */ static inline loff_t *file_ppos(struct file *file) { return file->f_mode & FMODE_STREAM ? NULL : &file->f_pos; } ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count) { CLASS(fd_pos, f)(fd); ssize_t ret = -EBADF; if (!fd_empty(f)) { loff_t pos, *ppos = file_ppos(fd_file(f)); if (ppos) { pos = *ppos; ppos = &pos; } ret = vfs_read(fd_file(f), buf, count, ppos); if (ret >= 0 && ppos) fd_file(f)->f_pos = pos; } return ret; } SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count) { return ksys_read(fd, buf, count); } ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count) { CLASS(fd_pos, f)(fd); ssize_t ret = -EBADF; if (!fd_empty(f)) { loff_t pos, *ppos = file_ppos(fd_file(f)); if (ppos) { pos = *ppos; ppos = &pos; } ret = vfs_write(fd_file(f), buf, count, ppos); if (ret >= 0 && ppos) fd_file(f)->f_pos = pos; } return ret; } SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf, size_t, count) { return ksys_write(fd, buf, count); } ssize_t ksys_pread64(unsigned int fd, char __user *buf, size_t count, loff_t pos) { if (pos < 0) return -EINVAL; CLASS(fd, f)(fd); if (fd_empty(f)) return -EBADF; if (fd_file(f)->f_mode & FMODE_PREAD) return vfs_read(fd_file(f), buf, count, &pos); return -ESPIPE; } SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf, size_t, count, loff_t, pos) { return ksys_pread64(fd, buf, count, pos); } #if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_PREAD64) COMPAT_SYSCALL_DEFINE5(pread64, unsigned int, fd, char __user *, buf, size_t, count, compat_arg_u64_dual(pos)) { return ksys_pread64(fd, buf, count, compat_arg_u64_glue(pos)); } #endif ssize_t ksys_pwrite64(unsigned int fd, const char __user *buf, size_t count, loff_t pos) { if (pos < 0) return -EINVAL; CLASS(fd, f)(fd); if (fd_empty(f)) return -EBADF; if (fd_file(f)->f_mode & FMODE_PWRITE) return vfs_write(fd_file(f), buf, count, &pos); return -ESPIPE; } SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf, size_t, count, loff_t, pos) { return ksys_pwrite64(fd, buf, count, pos); } #if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_PWRITE64) COMPAT_SYSCALL_DEFINE5(pwrite64, unsigned int, fd, const char __user *, buf, size_t, count, compat_arg_u64_dual(pos)) { return ksys_pwrite64(fd, buf, count, compat_arg_u64_glue(pos)); } #endif static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter, loff_t *ppos, int type, rwf_t flags) { struct kiocb kiocb; ssize_t ret; init_sync_kiocb(&kiocb, filp); ret = kiocb_set_rw_flags(&kiocb, flags, type); if (ret) return ret; kiocb.ki_pos = (ppos ? *ppos : 0); if (type == READ) ret = filp->f_op->read_iter(&kiocb, iter); else ret = filp->f_op->write_iter(&kiocb, iter); BUG_ON(ret == -EIOCBQUEUED); if (ppos) *ppos = kiocb.ki_pos; return ret; } /* Do it by hand, with file-ops */ static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter, loff_t *ppos, int type, rwf_t flags) { ssize_t ret = 0; if (flags & ~RWF_HIPRI) return -EOPNOTSUPP; while (iov_iter_count(iter)) { ssize_t nr; if (type == READ) { nr = filp->f_op->read(filp, iter_iov_addr(iter), iter_iov_len(iter), ppos); } else { nr = filp->f_op->write(filp, iter_iov_addr(iter), iter_iov_len(iter), ppos); } if (nr < 0) { if (!ret) ret = nr; break; } ret += nr; if (nr != iter_iov_len(iter)) break; iov_iter_advance(iter, nr); } return ret; } ssize_t vfs_iocb_iter_read(struct file *file, struct kiocb *iocb, struct iov_iter *iter) { size_t tot_len; ssize_t ret = 0; if (!file->f_op->read_iter) return -EINVAL; if (!(file->f_mode & FMODE_READ)) return -EBADF; if (!(file->f_mode & FMODE_CAN_READ)) return -EINVAL; tot_len = iov_iter_count(iter); if (!tot_len) goto out; ret = rw_verify_area(READ, file, &iocb->ki_pos, tot_len); if (ret < 0) return ret; ret = file->f_op->read_iter(iocb, iter); out: if (ret >= 0) fsnotify_access(file); return ret; } EXPORT_SYMBOL(vfs_iocb_iter_read); ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos, rwf_t flags) { size_t tot_len; ssize_t ret = 0; if (!file->f_op->read_iter) return -EINVAL; if (!(file->f_mode & FMODE_READ)) return -EBADF; if (!(file->f_mode & FMODE_CAN_READ)) return -EINVAL; tot_len = iov_iter_count(iter); if (!tot_len) goto out; ret = rw_verify_area(READ, file, ppos, tot_len); if (ret < 0) return ret; ret = do_iter_readv_writev(file, iter, ppos, READ, flags); out: if (ret >= 0) fsnotify_access(file); return ret; } EXPORT_SYMBOL(vfs_iter_read); /* * Caller is responsible for calling kiocb_end_write() on completion * if async iocb was queued. */ ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb, struct iov_iter *iter) { size_t tot_len; ssize_t ret = 0; if (!file->f_op->write_iter) return -EINVAL; if (!(file->f_mode & FMODE_WRITE)) return -EBADF; if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; tot_len = iov_iter_count(iter); if (!tot_len) return 0; ret = rw_verify_area(WRITE, file, &iocb->ki_pos, tot_len); if (ret < 0) return ret; kiocb_start_write(iocb); ret = file->f_op->write_iter(iocb, iter); if (ret != -EIOCBQUEUED) kiocb_end_write(iocb); if (ret > 0) fsnotify_modify(file); return ret; } EXPORT_SYMBOL(vfs_iocb_iter_write); ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos, rwf_t flags) { size_t tot_len; ssize_t ret; if (!(file->f_mode & FMODE_WRITE)) return -EBADF; if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; if (!file->f_op->write_iter) return -EINVAL; tot_len = iov_iter_count(iter); if (!tot_len) return 0; ret = rw_verify_area(WRITE, file, ppos, tot_len); if (ret < 0) return ret; file_start_write(file); ret = do_iter_readv_writev(file, iter, ppos, WRITE, flags); if (ret > 0) fsnotify_modify(file); file_end_write(file); return ret; } EXPORT_SYMBOL(vfs_iter_write); static ssize_t vfs_readv(struct file *file, const struct iovec __user *vec, unsigned long vlen, loff_t *pos, rwf_t flags) { struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; struct iov_iter iter; size_t tot_len; ssize_t ret = 0; if (!(file->f_mode & FMODE_READ)) return -EBADF; if (!(file->f_mode & FMODE_CAN_READ)) return -EINVAL; ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); if (ret < 0) return ret; tot_len = iov_iter_count(&iter); if (!tot_len) goto out; ret = rw_verify_area(READ, file, pos, tot_len); if (ret < 0) goto out; if (file->f_op->read_iter) ret = do_iter_readv_writev(file, &iter, pos, READ, flags); else ret = do_loop_readv_writev(file, &iter, pos, READ, flags); out: if (ret >= 0) fsnotify_access(file); kfree(iov); return ret; } static ssize_t vfs_writev(struct file *file, const struct iovec __user *vec, unsigned long vlen, loff_t *pos, rwf_t flags) { struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; struct iov_iter iter; size_t tot_len; ssize_t ret = 0; if (!(file->f_mode & FMODE_WRITE)) return -EBADF; if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; ret = import_iovec(ITER_SOURCE, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); if (ret < 0) return ret; tot_len = iov_iter_count(&iter); if (!tot_len) goto out; ret = rw_verify_area(WRITE, file, pos, tot_len); if (ret < 0) goto out; file_start_write(file); if (file->f_op->write_iter) ret = do_iter_readv_writev(file, &iter, pos, WRITE, flags); else ret = do_loop_readv_writev(file, &iter, pos, WRITE, flags); if (ret > 0) fsnotify_modify(file); file_end_write(file); out: kfree(iov); return ret; } static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec, unsigned long vlen, rwf_t flags) { CLASS(fd_pos, f)(fd); ssize_t ret = -EBADF; if (!fd_empty(f)) { loff_t pos, *ppos = file_ppos(fd_file(f)); if (ppos) { pos = *ppos; ppos = &pos; } ret = vfs_readv(fd_file(f), vec, vlen, ppos, flags); if (ret >= 0 && ppos) fd_file(f)->f_pos = pos; } if (ret > 0) add_rchar(current, ret); inc_syscr(current); return ret; } static ssize_t do_writev(unsigned long fd, const struct iovec __user *vec, unsigned long vlen, rwf_t flags) { CLASS(fd_pos, f)(fd); ssize_t ret = -EBADF; if (!fd_empty(f)) { loff_t pos, *ppos = file_ppos(fd_file(f)); if (ppos) { pos = *ppos; ppos = &pos; } ret = vfs_writev(fd_file(f), vec, vlen, ppos, flags); if (ret >= 0 && ppos) fd_file(f)->f_pos = pos; } if (ret > 0) add_wchar(current, ret); inc_syscw(current); return ret; } static inline loff_t pos_from_hilo(unsigned long high, unsigned long low) { #define HALF_LONG_BITS (BITS_PER_LONG / 2) return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low; } static ssize_t do_preadv(unsigned long fd, const struct iovec __user *vec, unsigned long vlen, loff_t pos, rwf_t flags) { ssize_t ret = -EBADF; if (pos < 0) return -EINVAL; CLASS(fd, f)(fd); if (!fd_empty(f)) { ret = -ESPIPE; if (fd_file(f)->f_mode & FMODE_PREAD) ret = vfs_readv(fd_file(f), vec, vlen, &pos, flags); } if (ret > 0) add_rchar(current, ret); inc_syscr(current); return ret; } static ssize_t do_pwritev(unsigned long fd, const struct iovec __user *vec, unsigned long vlen, loff_t pos, rwf_t flags) { ssize_t ret = -EBADF; if (pos < 0) return -EINVAL; CLASS(fd, f)(fd); if (!fd_empty(f)) { ret = -ESPIPE; if (fd_file(f)->f_mode & FMODE_PWRITE) ret = vfs_writev(fd_file(f), vec, vlen, &pos, flags); } if (ret > 0) add_wchar(current, ret); inc_syscw(current); return ret; } SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen) { return do_readv(fd, vec, vlen, 0); } SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen) { return do_writev(fd, vec, vlen, 0); } SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h) { loff_t pos = pos_from_hilo(pos_h, pos_l); return do_preadv(fd, vec, vlen, pos, 0); } SYSCALL_DEFINE6(preadv2, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h, rwf_t, flags) { loff_t pos = pos_from_hilo(pos_h, pos_l); if (pos == -1) return do_readv(fd, vec, vlen, flags); return do_preadv(fd, vec, vlen, pos, flags); } SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h) { loff_t pos = pos_from_hilo(pos_h, pos_l); return do_pwritev(fd, vec, vlen, pos, 0); } SYSCALL_DEFINE6(pwritev2, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h, rwf_t, flags) { loff_t pos = pos_from_hilo(pos_h, pos_l); if (pos == -1) return do_writev(fd, vec, vlen, flags); return do_pwritev(fd, vec, vlen, pos, flags); } /* * Various compat syscalls. Note that they all pretend to take a native * iovec - import_iovec will properly treat those as compat_iovecs based on * in_compat_syscall(). */ #ifdef CONFIG_COMPAT #ifdef __ARCH_WANT_COMPAT_SYS_PREADV64 COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, loff_t, pos) { return do_preadv(fd, vec, vlen, pos, 0); } #endif COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd, const struct iovec __user *, vec, compat_ulong_t, vlen, u32, pos_low, u32, pos_high) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; return do_preadv(fd, vec, vlen, pos, 0); } #ifdef __ARCH_WANT_COMPAT_SYS_PREADV64V2 COMPAT_SYSCALL_DEFINE5(preadv64v2, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, loff_t, pos, rwf_t, flags) { if (pos == -1) return do_readv(fd, vec, vlen, flags); return do_preadv(fd, vec, vlen, pos, flags); } #endif COMPAT_SYSCALL_DEFINE6(preadv2, compat_ulong_t, fd, const struct iovec __user *, vec, compat_ulong_t, vlen, u32, pos_low, u32, pos_high, rwf_t, flags) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; if (pos == -1) return do_readv(fd, vec, vlen, flags); return do_preadv(fd, vec, vlen, pos, flags); } #ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64 COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, loff_t, pos) { return do_pwritev(fd, vec, vlen, pos, 0); } #endif COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd, const struct iovec __user *,vec, compat_ulong_t, vlen, u32, pos_low, u32, pos_high) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; return do_pwritev(fd, vec, vlen, pos, 0); } #ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64V2 COMPAT_SYSCALL_DEFINE5(pwritev64v2, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, loff_t, pos, rwf_t, flags) { if (pos == -1) return do_writev(fd, vec, vlen, flags); return do_pwritev(fd, vec, vlen, pos, flags); } #endif COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd, const struct iovec __user *,vec, compat_ulong_t, vlen, u32, pos_low, u32, pos_high, rwf_t, flags) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; if (pos == -1) return do_writev(fd, vec, vlen, flags); return do_pwritev(fd, vec, vlen, pos, flags); } #endif /* CONFIG_COMPAT */ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, size_t count, loff_t max) { struct inode *in_inode, *out_inode; struct pipe_inode_info *opipe; loff_t pos; loff_t out_pos; ssize_t retval; int fl; /* * Get input file, and verify that it is ok.. */ CLASS(fd, in)(in_fd); if (fd_empty(in)) return -EBADF; if (!(fd_file(in)->f_mode & FMODE_READ)) return -EBADF; if (!ppos) { pos = fd_file(in)->f_pos; } else { pos = *ppos; if (!(fd_file(in)->f_mode & FMODE_PREAD)) return -ESPIPE; } retval = rw_verify_area(READ, fd_file(in), &pos, count); if (retval < 0) return retval; if (count > MAX_RW_COUNT) count = MAX_RW_COUNT; /* * Get output file, and verify that it is ok.. */ CLASS(fd, out)(out_fd); if (fd_empty(out)) return -EBADF; if (!(fd_file(out)->f_mode & FMODE_WRITE)) return -EBADF; in_inode = file_inode(fd_file(in)); out_inode = file_inode(fd_file(out)); out_pos = fd_file(out)->f_pos; if (!max) max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes); if (unlikely(pos + count > max)) { if (pos >= max) return -EOVERFLOW; count = max - pos; } fl = 0; #if 0 /* * We need to debate whether we can enable this or not. The * man page documents EAGAIN return for the output at least, * and the application is arguably buggy if it doesn't expect * EAGAIN on a non-blocking file descriptor. */ if (fd_file(in)->f_flags & O_NONBLOCK) fl = SPLICE_F_NONBLOCK; #endif opipe = get_pipe_info(fd_file(out), true); if (!opipe) { retval = rw_verify_area(WRITE, fd_file(out), &out_pos, count); if (retval < 0) return retval; retval = do_splice_direct(fd_file(in), &pos, fd_file(out), &out_pos, count, fl); } else { if (fd_file(out)->f_flags & O_NONBLOCK) fl |= SPLICE_F_NONBLOCK; retval = splice_file_to_pipe(fd_file(in), opipe, &pos, count, fl); } if (retval > 0) { add_rchar(current, retval); add_wchar(current, retval); fsnotify_access(fd_file(in)); fsnotify_modify(fd_file(out)); fd_file(out)->f_pos = out_pos; if (ppos) *ppos = pos; else fd_file(in)->f_pos = pos; } inc_syscr(current); inc_syscw(current); if (pos > max) retval = -EOVERFLOW; return retval; } SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count) { loff_t pos; off_t off; ssize_t ret; if (offset) { if (unlikely(get_user(off, offset))) return -EFAULT; pos = off; ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS); if (unlikely(put_user(pos, offset))) return -EFAULT; return ret; } return do_sendfile(out_fd, in_fd, NULL, count, 0); } SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count) { loff_t pos; ssize_t ret; if (offset) { if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t)))) return -EFAULT; ret = do_sendfile(out_fd, in_fd, &pos, count, 0); if (unlikely(put_user(pos, offset))) return -EFAULT; return ret; } return do_sendfile(out_fd, in_fd, NULL, count, 0); } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, compat_off_t __user *, offset, compat_size_t, count) { loff_t pos; off_t off; ssize_t ret; if (offset) { if (unlikely(get_user(off, offset))) return -EFAULT; pos = off; ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS); if (unlikely(put_user(pos, offset))) return -EFAULT; return ret; } return do_sendfile(out_fd, in_fd, NULL, count, 0); } COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, compat_loff_t __user *, offset, compat_size_t, count) { loff_t pos; ssize_t ret; if (offset) { if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t)))) return -EFAULT; ret = do_sendfile(out_fd, in_fd, &pos, count, 0); if (unlikely(put_user(pos, offset))) return -EFAULT; return ret; } return do_sendfile(out_fd, in_fd, NULL, count, 0); } #endif /* * Performs necessary checks before doing a file copy * * Can adjust amount of bytes to copy via @req_count argument. * Returns appropriate error code that caller should return or * zero in case the copy should be allowed. */ static int generic_copy_file_checks(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, size_t *req_count, unsigned int flags) { struct inode *inode_in = file_inode(file_in); struct inode *inode_out = file_inode(file_out); uint64_t count = *req_count; loff_t size_in; int ret; ret = generic_file_rw_checks(file_in, file_out); if (ret) return ret; /* * We allow some filesystems to handle cross sb copy, but passing * a file of the wrong filesystem type to filesystem driver can result * in an attempt to dereference the wrong type of ->private_data, so * avoid doing that until we really have a good reason. * * nfs and cifs define several different file_system_type structures * and several different sets of file_operations, but they all end up * using the same ->copy_file_range() function pointer. */ if (flags & COPY_FILE_SPLICE) { /* cross sb splice is allowed */ } else if (file_out->f_op->copy_file_range) { if (file_in->f_op->copy_file_range != file_out->f_op->copy_file_range) return -EXDEV; } else if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb) { return -EXDEV; } /* Don't touch certain kinds of inodes */ if (IS_IMMUTABLE(inode_out)) return -EPERM; if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out)) return -ETXTBSY; /* Ensure offsets don't wrap. */ if (pos_in + count < pos_in || pos_out + count < pos_out) return -EOVERFLOW; /* Shorten the copy to EOF */ size_in = i_size_read(inode_in); if (pos_in >= size_in) count = 0; else count = min(count, size_in - (uint64_t)pos_in); ret = generic_write_check_limits(file_out, pos_out, &count); if (ret) return ret; /* Don't allow overlapped copying within the same file. */ if (inode_in == inode_out && pos_out + count > pos_in && pos_out < pos_in + count) return -EINVAL; *req_count = count; return 0; } /* * copy_file_range() differs from regular file read and write in that it * specifically allows return partial success. When it does so is up to * the copy_file_range method. */ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, size_t len, unsigned int flags) { ssize_t ret; bool splice = flags & COPY_FILE_SPLICE; bool samesb = file_inode(file_in)->i_sb == file_inode(file_out)->i_sb; if (flags & ~COPY_FILE_SPLICE) return -EINVAL; ret = generic_copy_file_checks(file_in, pos_in, file_out, pos_out, &len, flags); if (unlikely(ret)) return ret; ret = rw_verify_area(READ, file_in, &pos_in, len); if (unlikely(ret)) return ret; ret = rw_verify_area(WRITE, file_out, &pos_out, len); if (unlikely(ret)) return ret; if (len == 0) return 0; /* * Make sure return value doesn't overflow in 32bit compat mode. Also * limit the size for all cases except when calling ->copy_file_range(). */ if (splice || !file_out->f_op->copy_file_range || in_compat_syscall()) len = min_t(size_t, MAX_RW_COUNT, len); file_start_write(file_out); /* * Cloning is supported by more file systems, so we implement copy on * same sb using clone, but for filesystems where both clone and copy * are supported (e.g. nfs,cifs), we only call the copy method. */ if (!splice && file_out->f_op->copy_file_range) { ret = file_out->f_op->copy_file_range(file_in, pos_in, file_out, pos_out, len, flags); } else if (!splice && file_in->f_op->remap_file_range && samesb) { ret = file_in->f_op->remap_file_range(file_in, pos_in, file_out, pos_out, len, REMAP_FILE_CAN_SHORTEN); /* fallback to splice */ if (ret <= 0) splice = true; } else if (samesb) { /* Fallback to splice for same sb copy for backward compat */ splice = true; } file_end_write(file_out); if (!splice) goto done; /* * We can get here for same sb copy of filesystems that do not implement * ->copy_file_range() in case filesystem does not support clone or in * case filesystem supports clone but rejected the clone request (e.g. * because it was not block aligned). * * In both cases, fall back to kernel copy so we are able to maintain a * consistent story about which filesystems support copy_file_range() * and which filesystems do not, that will allow userspace tools to * make consistent desicions w.r.t using copy_file_range(). * * We also get here if caller (e.g. nfsd) requested COPY_FILE_SPLICE * for server-side-copy between any two sb. * * In any case, we call do_splice_direct() and not splice_file_range(), * without file_start_write() held, to avoid possible deadlocks related * to splicing from input file, while file_start_write() is held on * the output file on a different sb. */ ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out, len, 0); done: if (ret > 0) { fsnotify_access(file_in); add_rchar(current, ret); fsnotify_modify(file_out); add_wchar(current, ret); } inc_syscr(current); inc_syscw(current); return ret; } EXPORT_SYMBOL(vfs_copy_file_range); SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in, int, fd_out, loff_t __user *, off_out, size_t, len, unsigned int, flags) { loff_t pos_in; loff_t pos_out; ssize_t ret = -EBADF; CLASS(fd, f_in)(fd_in); if (fd_empty(f_in)) return -EBADF; CLASS(fd, f_out)(fd_out); if (fd_empty(f_out)) return -EBADF; if (off_in) { if (copy_from_user(&pos_in, off_in, sizeof(loff_t))) return -EFAULT; } else { pos_in = fd_file(f_in)->f_pos; } if (off_out) { if (copy_from_user(&pos_out, off_out, sizeof(loff_t))) return -EFAULT; } else { pos_out = fd_file(f_out)->f_pos; } if (flags != 0) return -EINVAL; ret = vfs_copy_file_range(fd_file(f_in), pos_in, fd_file(f_out), pos_out, len, flags); if (ret > 0) { pos_in += ret; pos_out += ret; if (off_in) { if (copy_to_user(off_in, &pos_in, sizeof(loff_t))) ret = -EFAULT; } else { fd_file(f_in)->f_pos = pos_in; } if (off_out) { if (copy_to_user(off_out, &pos_out, sizeof(loff_t))) ret = -EFAULT; } else { fd_file(f_out)->f_pos = pos_out; } } return ret; } /* * Don't operate on ranges the page cache doesn't support, and don't exceed the * LFS limits. If pos is under the limit it becomes a short access. If it * exceeds the limit we return -EFBIG. */ int generic_write_check_limits(struct file *file, loff_t pos, loff_t *count) { struct inode *inode = file->f_mapping->host; loff_t max_size = inode->i_sb->s_maxbytes; loff_t limit = rlimit(RLIMIT_FSIZE); if (limit != RLIM_INFINITY) { if (pos >= limit) { send_sig(SIGXFSZ, current, 0); return -EFBIG; } *count = min(*count, limit - pos); } if (!(file->f_flags & O_LARGEFILE)) max_size = MAX_NON_LFS; if (unlikely(pos >= max_size)) return -EFBIG; *count = min(*count, max_size - pos); return 0; } EXPORT_SYMBOL_GPL(generic_write_check_limits); /* Like generic_write_checks(), but takes size of write instead of iter. */ int generic_write_checks_count(struct kiocb *iocb, loff_t *count) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; if (IS_SWAPFILE(inode)) return -ETXTBSY; if (!*count) return 0; if (iocb->ki_flags & IOCB_APPEND) iocb->ki_pos = i_size_read(inode); if ((iocb->ki_flags & IOCB_NOWAIT) && !((iocb->ki_flags & IOCB_DIRECT) || (file->f_op->fop_flags & FOP_BUFFER_WASYNC))) return -EINVAL; return generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, count); } EXPORT_SYMBOL(generic_write_checks_count); /* * Performs necessary checks before doing a write * * Can adjust writing position or amount of bytes to write. * Returns appropriate error code that caller should return or * zero in case that write should be allowed. */ ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from) { loff_t count = iov_iter_count(from); int ret; ret = generic_write_checks_count(iocb, &count); if (ret) return ret; iov_iter_truncate(from, count); return iov_iter_count(from); } EXPORT_SYMBOL(generic_write_checks); /* * Performs common checks before doing a file copy/clone * from @file_in to @file_out. */ int generic_file_rw_checks(struct file *file_in, struct file *file_out) { struct inode *inode_in = file_inode(file_in); struct inode *inode_out = file_inode(file_out); /* Don't copy dirs, pipes, sockets... */ if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) return -EISDIR; if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) return -EINVAL; if (!(file_in->f_mode & FMODE_READ) || !(file_out->f_mode & FMODE_WRITE) || (file_out->f_flags & O_APPEND)) return -EBADF; return 0; } int generic_atomic_write_valid(struct kiocb *iocb, struct iov_iter *iter) { size_t len = iov_iter_count(iter); if (!iter_is_ubuf(iter)) return -EINVAL; if (!is_power_of_2(len)) return -EINVAL; if (!IS_ALIGNED(iocb->ki_pos, len)) return -EINVAL; if (!(iocb->ki_flags & IOCB_DIRECT)) return -EOPNOTSUPP; return 0; } EXPORT_SYMBOL_GPL(generic_atomic_write_valid);
14 2 2 2 2 1 81 81 81 81 80 14 14 14 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 // SPDX-License-Identifier: GPL-2.0-or-later /* * A generic kernel FIFO implementation * * Copyright (C) 2009/2010 Stefani Seibold <stefani@seibold.net> */ #include <linux/dma-mapping.h> #include <linux/err.h> #include <linux/export.h> #include <linux/kfifo.h> #include <linux/log2.h> #include <linux/scatterlist.h> #include <linux/slab.h> #include <linux/uaccess.h> /* * internal helper to calculate the unused elements in a fifo */ static inline unsigned int kfifo_unused(struct __kfifo *fifo) { return (fifo->mask + 1) - (fifo->in - fifo->out); } int __kfifo_alloc_node(struct __kfifo *fifo, unsigned int size, size_t esize, gfp_t gfp_mask, int node) { /* * round up to the next power of 2, since our 'let the indices * wrap' technique works only in this case. */ size = roundup_pow_of_two(size); fifo->in = 0; fifo->out = 0; fifo->esize = esize; if (size < 2) { fifo->data = NULL; fifo->mask = 0; return -EINVAL; } fifo->data = kmalloc_array_node(esize, size, gfp_mask, node); if (!fifo->data) { fifo->mask = 0; return -ENOMEM; } fifo->mask = size - 1; return 0; } EXPORT_SYMBOL(__kfifo_alloc_node); void __kfifo_free(struct __kfifo *fifo) { kfree(fifo->data); fifo->in = 0; fifo->out = 0; fifo->esize = 0; fifo->data = NULL; fifo->mask = 0; } EXPORT_SYMBOL(__kfifo_free); int __kfifo_init(struct __kfifo *fifo, void *buffer, unsigned int size, size_t esize) { size /= esize; if (!is_power_of_2(size)) size = rounddown_pow_of_two(size); fifo->in = 0; fifo->out = 0; fifo->esize = esize; fifo->data = buffer; if (size < 2) { fifo->mask = 0; return -EINVAL; } fifo->mask = size - 1; return 0; } EXPORT_SYMBOL(__kfifo_init); static void kfifo_copy_in(struct __kfifo *fifo, const void *src, unsigned int len, unsigned int off) { unsigned int size = fifo->mask + 1; unsigned int esize = fifo->esize; unsigned int l; off &= fifo->mask; if (esize != 1) { off *= esize; size *= esize; len *= esize; } l = min(len, size - off); memcpy(fifo->data + off, src, l); memcpy(fifo->data, src + l, len - l); /* * make sure that the data in the fifo is up to date before * incrementing the fifo->in index counter */ smp_wmb(); } unsigned int __kfifo_in(struct __kfifo *fifo, const void *buf, unsigned int len) { unsigned int l; l = kfifo_unused(fifo); if (len > l) len = l; kfifo_copy_in(fifo, buf, len, fifo->in); fifo->in += len; return len; } EXPORT_SYMBOL(__kfifo_in); static void kfifo_copy_out(struct __kfifo *fifo, void *dst, unsigned int len, unsigned int off) { unsigned int size = fifo->mask + 1; unsigned int esize = fifo->esize; unsigned int l; off &= fifo->mask; if (esize != 1) { off *= esize; size *= esize; len *= esize; } l = min(len, size - off); memcpy(dst, fifo->data + off, l); memcpy(dst + l, fifo->data, len - l); /* * make sure that the data is copied before * incrementing the fifo->out index counter */ smp_wmb(); } unsigned int __kfifo_out_peek(struct __kfifo *fifo, void *buf, unsigned int len) { unsigned int l; l = fifo->in - fifo->out; if (len > l) len = l; kfifo_copy_out(fifo, buf, len, fifo->out); return len; } EXPORT_SYMBOL(__kfifo_out_peek); unsigned int __kfifo_out_linear(struct __kfifo *fifo, unsigned int *tail, unsigned int n) { unsigned int size = fifo->mask + 1; unsigned int off = fifo->out & fifo->mask; if (tail) *tail = off; return min3(n, fifo->in - fifo->out, size - off); } EXPORT_SYMBOL(__kfifo_out_linear); unsigned int __kfifo_out(struct __kfifo *fifo, void *buf, unsigned int len) { len = __kfifo_out_peek(fifo, buf, len); fifo->out += len; return len; } EXPORT_SYMBOL(__kfifo_out); static unsigned long kfifo_copy_from_user(struct __kfifo *fifo, const void __user *from, unsigned int len, unsigned int off, unsigned int *copied) { unsigned int size = fifo->mask + 1; unsigned int esize = fifo->esize; unsigned int l; unsigned long ret; off &= fifo->mask; if (esize != 1) { off *= esize; size *= esize; len *= esize; } l = min(len, size - off); ret = copy_from_user(fifo->data + off,